diff --git a/.gitignore b/.gitignore index 5eb1071..94c93b5 100644 --- a/.gitignore +++ b/.gitignore @@ -67,7 +67,7 @@ target/ .idea # VS Code -.vscode/ +# .vscode/ # Spyder .spyproject/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5df93ce --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.8.0 + hooks: + # Run the linter. + - id: ruff + types_or: [ python, pyi ] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + types_or: [ python, pyi ] \ No newline at end of file diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..cb75462 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "ms-python.python", + "charliermarsh.ruff", + "bdsoftware.format-on-auto-save" + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ebc63ac --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.formatOnSave": true + }, + "files.autoSave": "afterDelay", + "editor.rulers": [ + 88 + ], +} \ No newline at end of file diff --git a/README.md b/README.md index 65936f4..899f96f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,9 @@ # AI4EOSC - Platform API [![Conventional Commits](https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white)](https://conventionalcommits.org) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) +[![Build Docker](https://github.com/ai4os/ai4-papi/actions/workflows/build-docker-prod.yml/badge.svg)](https://github.com/ai4os/ai4-papi/actions/workflows/build-docker-prod.yml) +[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/ai4os/ai4-papi/master.svg)](https://results.pre-commit.ci/latest/github/ai4os/ai4-papi/master) [//]: # ([![GitHub license](https://img.shields.io/github/license/ai4papi/ai4papi.svg)](https://github.com/ai4papi/ai4papi/blob/master/LICENSE)) [//]: # ([![GitHub release](https://img.shields.io/github/release/ai4papi/ai4papi.svg)](https://github.com/ai4papi/ai4papi/releases)) @@ -271,3 +274,10 @@ The pattern for the subfolders follows: - `user.yaml`: user customizable configuration to make a deployment in Nomad. Also contains the generic quotas for hardware (see `range` parameter). - `nomad.hcl`: additional non-customizable values (eg. ports) + +### Implementation notes + +This repository is formatted with [Ruff](https://docs.astral.sh/ruff/). +We use the [Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) and [FormatOnSave](https://marketplace.visualstudio.com/items?itemName=BdSoftware.format-on-auto-save) ([issue](https://github.com/microsoft/vscode/issues/45997#issuecomment-950405496)) VS Code extensions to make the development workflow smoother. + +We use [pre-commit](https://pre-commit.com/) locally to enforce formatting on commits, and [pre-commit.ci](https://pre-commit.ci/) to enforce it at the GitHub level.
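For context, the local workflow the notes above describe is the standard pre-commit one: `pip install pre-commit`, then `pre-commit install` to register the git hook, and optionally `pre-commit run --all-files` to apply the hooks to the whole tree at once. The sketch below (an invented dictionary, not code from this repository) illustrates the kind of mechanical rewrite `ruff format` performs, which is what most of the hunks that follow consist of: single quotes become double quotes and multi-line literals get normalized indentation, with no change in behaviour.

```python
# Minimal sketch of the style enforced by the new ruff hooks.
# The dictionary below is an invented example, not taken from this repository.
j = {"ID": "example-uuid", "Name": "module-example"}

# Old style, as it appeared throughout the codebase (single quotes):
info_old = {
    'job_ID': j['ID'],
    'name': j['Name'],
}

# Equivalent code after `ruff format` (double quotes, same structure):
info_new = {
    "job_ID": j["ID"],
    "name": j["Name"],
}

# Formatting only: behaviour is unchanged.
assert info_old == info_new
```

Since pre-commit.ci runs the same hooks server-side, contributors who skip the local setup should still get their pull requests checked (and, by default, auto-fixed) against the same Ruff configuration.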
diff --git a/ai4papi/auth.py b/ai4papi/auth.py index 44bc09e..5308002 100644 --- a/ai4papi/auth.py +++ b/ai4papi/auth.py @@ -35,27 +35,26 @@ def get_user_info(token): - try: user_infos = flaat.get_user_infos_from_access_token(token) except Exception as e: raise HTTPException( status_code=401, detail=str(e), - ) + ) # Check output if user_infos is None: raise HTTPException( status_code=401, detail="Invalid token", - ) + ) # Retrieve VOs the user belongs to # VOs can be empty if the user does not belong to any VO, or the # 'eduperson_entitlement wasn't correctly retrieved from the token vos = [] - for i in user_infos.get('eduperson_entitlement', []): + for i in user_infos.get("eduperson_entitlement", []): # Parse Virtual Organizations manually from URNs # If more complexity is need in the future, check https://github.com/oarepo/urnparse ent_i = re.search(r"group:(.+?):", i) @@ -63,18 +62,18 @@ def get_user_info(token): vos.append(ent_i.group(1)) # Generate user info dict - for k in ['sub', 'iss', 'name', 'email']: + for k in ["sub", "iss", "name", "email"]: if user_infos.get(k) is None: raise HTTPException( status_code=401, detail=f"You token should have scopes for {k}.", - ) + ) out = { - 'id': user_infos.get('sub'), # subject, user-ID - 'issuer': user_infos.get('iss'), # URL of the access token issuer - 'name': user_infos.get('name'), - 'email': user_infos.get('email'), - 'vos': vos, + "id": user_infos.get("sub"), # subject, user-ID + "issuer": user_infos.get("iss"), # URL of the access token issuer + "name": user_infos.get("name"), + "email": user_infos.get("email"), + "vos": vos, } return out @@ -90,5 +89,5 @@ def check_vo_membership( if requested_vo not in user_vos: raise HTTPException( status_code=401, - detail=f"The requested Virtual Organization ({requested_vo}) does not match with any of your available VOs: {user_vos}." - ) + detail=f"The requested Virtual Organization ({requested_vo}) does not match with any of your available VOs: {user_vos}.", + ) diff --git a/ai4papi/conf.py b/ai4papi/conf.py index 5ed1867..a1cdc3d 100644 --- a/ai4papi/conf.py +++ b/ai4papi/conf.py @@ -14,17 +14,19 @@ # intensive (eg. disables calls to Github API) # The variables 'FORWARDED_ALLOW_IPS' serves as proxy for this, as it is only defined # when running from the Docker container -IS_DEV = False if os.getenv('FORWARDED_ALLOW_IPS') else True +IS_DEV = False if os.getenv("FORWARDED_ALLOW_IPS") else True # Harbor token is kind of mandatory in production, otherwise snapshots won't work. HARBOR_USER = "robot$user-snapshots+snapshot-api" -HARBOR_PASS = os.environ.get('HARBOR_ROBOT_PASSWORD') +HARBOR_PASS = os.environ.get("HARBOR_ROBOT_PASSWORD") if not HARBOR_PASS: if IS_DEV: # Not enforce this for developers - print("You should define the variable \"HARBOR_ROBOT_PASSWORD\" to use the \"/snapshots\" endpoint.") + print( + 'You should define the variable "HARBOR_ROBOT_PASSWORD" to use the "/snapshots" endpoint.' 
+ ) else: - raise Exception("You need to define the variable \"HARBOR_ROBOT_PASSWORD\".") + raise Exception('You need to define the variable "HARBOR_ROBOT_PASSWORD".') # Paths main_path = Path(__file__).parent.absolute() @@ -34,7 +36,7 @@ } # Load main API configuration -with open(paths['conf'] / 'main.yaml', 'r') as f: +with open(paths["conf"] / "main.yaml", "r") as f: MAIN_CONF = yaml.safe_load(f) @@ -42,7 +44,7 @@ def load_nomad_job(fpath): """ Load default Nomad job configuration """ - with open(fpath, 'r') as f: + with open(fpath, "r") as f: raw_job = f.read() job_template = Template(raw_job) return job_template @@ -52,84 +54,84 @@ def load_yaml_conf(fpath): """ Load user customizable parameters """ - with open(fpath, 'r') as f: + with open(fpath, "r") as f: conf_full = yaml.safe_load(f) conf_values = {} for group_name, params in conf_full.items(): conf_values[group_name] = {} for k, v in params.items(): - if 'name' not in v.keys(): + if "name" not in v.keys(): raise Exception(f"Parameter {k} needs to have a name.") - if 'value' not in v.keys(): + if "value" not in v.keys(): raise Exception(f"Parameter {k} needs to have a value.") - conf_values[group_name][k] = v['value'] + conf_values[group_name][k] = v["value"] return conf_full, conf_values # Standard modules -nmd = load_nomad_job(paths['conf'] / 'modules' / 'nomad.hcl') -yml = load_yaml_conf(paths['conf'] / 'modules' / 'user.yaml') +nmd = load_nomad_job(paths["conf"] / "modules" / "nomad.hcl") +yml = load_yaml_conf(paths["conf"] / "modules" / "user.yaml") MODULES = { - 'nomad': nmd, - 'user': { - 'full': yml[0], - 'values': yml[1], + "nomad": nmd, + "user": { + "full": yml[0], + "values": yml[1], }, } # Tools -tool_dir = paths['conf'] / 'tools' +tool_dir = paths["conf"] / "tools" tool_list = [f for f in tool_dir.iterdir() if f.is_dir()] TOOLS = {} for tool_path in tool_list: - nmd = load_nomad_job(tool_path / 'nomad.hcl') - yml = load_yaml_conf(tool_path / 'user.yaml') + nmd = load_nomad_job(tool_path / "nomad.hcl") + yml = load_yaml_conf(tool_path / "user.yaml") TOOLS[tool_path.name] = { - 'nomad': nmd, - 'user': { - 'full': yml[0], - 'values': yml[1], + "nomad": nmd, + "user": { + "full": yml[0], + "values": yml[1], }, } # For tools, map the Nomad job name prefixes to tool IDs tools_nomad2id = { - 'fl': 'ai4os-federated-server', - 'cvat': 'ai4os-cvat', + "fl": "ai4os-federated-server", + "cvat": "ai4os-cvat", } for tool in TOOLS.keys(): if tool not in tools_nomad2id.values(): raise Exception(f"The tool {tool} is missing from the mapping dictionary.") # OSCAR template -with open(paths['conf'] / 'oscar.yaml', 'r') as f: +with open(paths["conf"] / "oscar.yaml", "r") as f: OSCAR_TMPL = Template(f.read()) # Try-me endpoints -nmd = load_nomad_job(paths['conf'] / 'try_me' / 'nomad.hcl') +nmd = load_nomad_job(paths["conf"] / "try_me" / "nomad.hcl") TRY_ME = { - 'nomad': nmd, + "nomad": nmd, } # Snapshot endpoints -nmd = load_nomad_job(paths['conf'] / 'snapshots' / 'nomad.hcl') +nmd = load_nomad_job(paths["conf"] / "snapshots" / "nomad.hcl") SNAPSHOTS = { - 'nomad': nmd, + "nomad": nmd, } # Retrieve git info from PAPI, to show current version in the docs papi_commit = subprocess.run( - ['git', 'log', '-1', '--format=%H'], + ["git", "log", "-1", "--format=%H"], stdout=subprocess.PIPE, text=True, cwd=main_path, ).stdout.strip() papi_branch = subprocess.run( - ['git', 'rev-parse', '--abbrev-ref', '--symbolic-full-name', '@{u}'], + ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{u}"], stdout=subprocess.PIPE, text=True, 
cwd=main_path, ).stdout.strip() -papi_branch = papi_branch.split('/')[-1] # remove the "origin/" part +papi_branch = papi_branch.split("/")[-1] # remove the "origin/" part diff --git a/ai4papi/main.py b/ai4papi/main.py index d101a5f..404450f 100644 --- a/ai4papi/main.py +++ b/ai4papi/main.py @@ -19,33 +19,29 @@ " src='https://ai4eosc.eu/wp-content/uploads/sites/10/2023/01/horizontal-bg-dark.png'" " width=200 alt='' />" "<br><br>" - "This is the Platform API for interacting with the AI4EOSC services. " "It aims at providing a stable UI, effectively decoupling the services offered by " "the project from the underlying tools we use to provide them (ie. Nomad)." "<br><br>" - "You can also access the functionalities of the API through our dashboards: <br>" "- [AIEOSC Dashboard](https://dashboard.cloud.ai4eosc.eu/) <br>" "- [iMagine Dashboard](https://dashboard.cloud.imagine-ai.eu/)" "<br><br>" - "For more information, please visit: <br>" "- [AI4EOSC Homepage](https://ai4eosc.eu) <br>" "- [API Github repository](https://github.com/AI4EOSC/ai4-papi)" "<br><br>" - "**Acknowledgements** <br>" "This work is co-funded by [AI4EOSC](https://ai4eosc.eu/) project that has " "received funding from the European Union's Horizon Europe 2022 research and " "innovation programme under agreement No 101058593" "<br><br>
" - "PAPI version:" f"[`ai4-papi/{papi_branch}@{papi_commit[:5]}`]" f"(https://github.com/ai4os/ai4-papi/tree/{papi_commit})" ) + @asynccontextmanager async def lifespan(app: fastapi.FastAPI): # on startup @@ -114,11 +110,11 @@ async def favicon(): def run( - host:str = "0.0.0.0", - port:int = 8080, - ssl_keyfile:str = None, - ssl_certfile:str = None, - ): + host: str = "0.0.0.0", + port: int = 8080, + ssl_keyfile: str = None, + ssl_certfile: str = None, +): uvicorn.run( app, host=host, diff --git a/ai4papi/module_patches.py b/ai4papi/module_patches.py index 718cb7f..2c1dfed 100644 --- a/ai4papi/module_patches.py +++ b/ai4papi/module_patches.py @@ -3,10 +3,11 @@ fix/rebuild them. """ + def patch_nextcloud_mount( docker_image: str, task: dict, - ): +): """ Some module are blocked when running deepaas. @@ -37,10 +38,10 @@ def patch_nextcloud_mount( "DEEP-OC-image-classification-tf-dicom", "DEEP-OC-speech-to-text-tf", ] - modules = [f'deephdc/{m.lower()}' for m in modules] + modules = [f"deephdc/{m.lower()}" for m in modules] # TODO: this will need to be updated to ai4os-hub if docker_image in modules: - task['Env']['RCLONE_CONTIMEOUT'] = '1s' + task["Env"]["RCLONE_CONTIMEOUT"] = "1s" return task diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py index 0f59662..9631bcc 100644 --- a/ai4papi/nomad/common.py +++ b/ai4papi/nomad/common.py @@ -23,18 +23,9 @@ Nomad = nomad.Nomad() # TODO: Remove monkey-patches when the code is merged to python-nomad Pypi package -Nomad.job.deregister_job = types.MethodType( - nomad_patches.deregister_job, - Nomad.job - ) -Nomad.job.get_allocations = types.MethodType( - nomad_patches.get_allocations, - Nomad.job - ) -Nomad.job.get_evaluations = types.MethodType( - nomad_patches.get_allocations, - Nomad.job - ) +Nomad.job.deregister_job = types.MethodType(nomad_patches.deregister_job, Nomad.job) +Nomad.job.get_allocations = types.MethodType(nomad_patches.get_allocations, Nomad.job) +Nomad.job.get_evaluations = types.MethodType(nomad_patches.get_allocations, Nomad.job) # Persistent requests session for faster requests session = requests.Session() @@ -44,15 +35,16 @@ def get_deployments( namespace: str, owner: str, prefix: str = "", - ): +): """ Returns a list of all deployments belonging to a user, in a given namespace. """ - job_filter = \ - 'Status != "dead" and ' + \ - f'Name matches "^{prefix}" and ' + \ - 'Meta is not empty and ' + \ - f'Meta.owner == "{owner}"' + job_filter = ( + 'Status != "dead" and ' + + f'Name matches "^{prefix}" and ' + + "Meta is not empty and " + + f'Meta.owner == "{owner}"' + ) jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_=job_filter) return jobs @@ -62,7 +54,7 @@ def get_deployment( namespace: str, owner: str, full_info: True, - ): +): """ Retrieve the info of a specific deployment. 
Format outputs to a Nomad-independent format to be used by the Dashboard @@ -80,60 +72,60 @@ def get_deployment( j = Nomad.job.get_job( id_=deployment_uuid, namespace=namespace, - ) + ) except exceptions.URLNotFoundNomadException: raise HTTPException( status_code=400, detail="No deployment exists with this uuid.", - ) + ) # Check job does belong to owner - if j['Meta'] and owner != j['Meta'].get('owner', ''): + if j["Meta"] and owner != j["Meta"].get("owner", ""): raise HTTPException( status_code=400, detail="You are not the owner of that deployment.", - ) + ) # Create job info dict info = { - 'job_ID': j['ID'], - 'name': j['Name'], - 'status': '', # do not use j['Status'] as misleading - 'owner': j['Meta']['owner'], - 'title': j['Meta']['title'], - 'description': j['Meta']['description'], - 'docker_image': None, - 'docker_command': None, - 'submit_time': datetime.fromtimestamp( - j['SubmitTime'] // 1000000000 - ).strftime('%Y-%m-%d %H:%M:%S'), # nanoseconds to timestamp - 'resources': {}, - 'endpoints': {}, - 'active_endpoints': None, - 'main_endpoint': None, - 'alloc_ID': None, - 'datacenter': None, + "job_ID": j["ID"], + "name": j["Name"], + "status": "", # do not use j['Status'] as misleading + "owner": j["Meta"]["owner"], + "title": j["Meta"]["title"], + "description": j["Meta"]["description"], + "docker_image": None, + "docker_command": None, + "submit_time": datetime.fromtimestamp(j["SubmitTime"] // 1000000000).strftime( + "%Y-%m-%d %H:%M:%S" + ), # nanoseconds to timestamp + "resources": {}, + "endpoints": {}, + "active_endpoints": None, + "main_endpoint": None, + "alloc_ID": None, + "datacenter": None, } # Retrieve tasks - tasks = j['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name'] == 'main'][0] + tasks = j["TaskGroups"][0]["Tasks"] + usertask = [t for t in tasks if t["Name"] == "main"][0] # Retrieve Docker image - info['docker_image'] = usertask['Config']['image'] - command = usertask['Config'].get('command', '') - args = usertask['Config'].get('args', []) - info['docker_command'] = f"{command} {' '.join(args)}".strip() + info["docker_image"] = usertask["Config"]["image"] + command = usertask["Config"].get("command", "") + args = usertask["Config"].get("args", []) + info["docker_command"] = f"{command} {' '.join(args)}".strip() # Add endpoints - info['endpoints'] = {} - for s in j['TaskGroups'][0]['Services']: - label = s['PortLabel'] + info["endpoints"] = {} + for s in j["TaskGroups"][0]["Services"]: + label = s["PortLabel"] # Iterate through tags to find `Host` tag - for t in s['Tags']: + for t in s["Tags"]: try: - url = re.search(r'Host\(`(.+?)`', t).group(1) + url = re.search(r"Host\(`(.+?)`", t).group(1) break except Exception: url = "missing-endpoint" @@ -141,126 +133,139 @@ def get_deployment( # Old deployments had network ports with names [deepaas, ide, monitor] # instead of [api, ide, monitor] so we have to manually replace them # see: https://github.com/AI4EOSC/ai4-papi/issues/22 - if label == 'deepaas': - label = 'api' + if label == "deepaas": + label = "api" - info['endpoints'][label] = f"http://{url}" + info["endpoints"][label] = f"http://{url}" # Add '/ui' to deepaas endpoint # If in the future we support other APIs, this will have to be removed. 
- if 'api' in info['endpoints'].keys(): - info['endpoints']['api'] += '/ui' + if "api" in info["endpoints"].keys(): + info["endpoints"]["api"] += "/ui" # Add quick-access (main endpoint) + customize endpoints service2endpoint = { - 'deepaas': 'api', - 'jupyter': 'ide', - 'vscode': 'ide', + "deepaas": "api", + "jupyter": "ide", + "vscode": "ide", } try: # deep-start compatible service service = re.search( - 'deep-start --(.*)$', - info['docker_command'], - ).group(1) + "deep-start --(.*)$", + info["docker_command"], + ).group(1) - info['main_endpoint'] = service2endpoint[service] + info["main_endpoint"] = service2endpoint[service] except Exception: # return first endpoint - info['main_endpoint'] = list(info['endpoints'].keys())[0] + info["main_endpoint"] = list(info["endpoints"].keys())[0] # Only fill resources if the job is allocated allocs = Nomad.job.get_allocations( - id_=j['ID'], + id_=j["ID"], namespace=namespace, - ) + ) evals = Nomad.job.get_evaluations( - id_=j['ID'], + id_=j["ID"], namespace=namespace, - ) + ) if allocs: - # Reorder allocations based on recency - dates = [a['CreateTime'] for a in allocs] - allocs = [x for _, x in sorted( - zip(dates, allocs), - key=lambda pair: pair[0], - )][::-1] # more recent first + dates = [a["CreateTime"] for a in allocs] + allocs = [ + x + for _, x in sorted( + zip(dates, allocs), + key=lambda pair: pair[0], + ) + ][::-1] # more recent first # Select the proper allocation - statuses = [a['ClientStatus'] for a in allocs] - if 'unknown' in statuses: + statuses = [a["ClientStatus"] for a in allocs] + if "unknown" in statuses: # The node has lost connection. Avoid showing temporary reallocated job, # to avoid confusions when the original allocation is restored back again. - idx = statuses.index('unknown') - elif 'running' in statuses: + idx = statuses.index("unknown") + elif "running" in statuses: # If an allocation is running, return that allocation # It happens that after a network cut, when the network is restored, # the temporary allocation created in the meantime (now with status # 'complete') is more recent than the original allocation that we # recovered (with status 'running'), so using only recency does not work. - idx = statuses.index('running') + idx = statuses.index("running") else: # Return most recent allocation idx = 0 - a = Nomad.allocation.get_allocation(allocs[idx]['ID']) + a = Nomad.allocation.get_allocation(allocs[idx]["ID"]) # Add ID - info['alloc_ID'] = a['ID'] + info["alloc_ID"] = a["ID"] # Add datacenter - info['datacenter'] = Nomad.node.get_node(a['NodeID'])['Datacenter'] + info["datacenter"] = Nomad.node.get_node(a["NodeID"])["Datacenter"] # Replace Nomad status with a more user-friendly status # Final list includes: starting, down, running, complete, failed, ... 
# We use the status of the "main" task because it isn more relevant the the # status of the overall job (a['ClientStatus']) - status = a['TaskStates']['main']['State'] if a.get('TaskStates') else 'queued' + status = a["TaskStates"]["main"]["State"] if a.get("TaskStates") else "queued" status_map = { # nomad: papi - 'pending': 'starting', - 'unknown': 'down', + "pending": "starting", + "unknown": "down", } - info['status'] = status_map.get(status, status) # if not mapped, then return original status + info["status"] = status_map.get( + status, status + ) # if not mapped, then return original status # Add error messages if needed - if info['status'] == 'failed': - info['error_msg'] = a['TaskStates']['main']['Events'][0]['Message'] + if info["status"] == "failed": + info["error_msg"] = a["TaskStates"]["main"]["Events"][0]["Message"] # Replace with clearer message - if info['error_msg'] == 'Docker container exited with non-zero exit code: 1': - info['error_msg'] = \ - "An error seems to appear when running this Docker container. " \ - "Try to run this Docker locally with the command " \ - f"`{info['docker_command']}` to find what is the error " \ + if ( + info["error_msg"] + == "Docker container exited with non-zero exit code: 1" + ): + info["error_msg"] = ( + "An error seems to appear when running this Docker container. " + "Try to run this Docker locally with the command " + f"`{info['docker_command']}` to find what is the error " "or contact the module owner." + ) - elif info['status'] == 'down': - info['error_msg'] = \ - "There seems to be network issues in the cluster. Please wait until " \ - "the network is restored and you should be able to fully recover " \ + elif info["status"] == "down": + info["error_msg"] = ( + "There seems to be network issues in the cluster. Please wait until " + "the network is restored and you should be able to fully recover " "your deployment." 
+ ) # Add resources - res = a['AllocatedResources']['Tasks']['main'] - gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] if res['Devices'] else None - cpu_cores = res['Cpu']['ReservedCores'] - info['resources'] = { - 'cpu_num': len(cpu_cores) if cpu_cores else 0, - 'cpu_MHz': res['Cpu']['CpuShares'], - 'gpu_num': len(gpu['DeviceIDs']) if gpu else 0, - 'memory_MB': res['Memory']['MemoryMB'], - 'disk_MB': a['AllocatedResources']['Shared']['DiskMB'], + res = a["AllocatedResources"]["Tasks"]["main"] + gpu = ( + [d for d in res["Devices"] if d["Type"] == "gpu"][0] + if res["Devices"] + else None + ) + cpu_cores = res["Cpu"]["ReservedCores"] + info["resources"] = { + "cpu_num": len(cpu_cores) if cpu_cores else 0, + "cpu_MHz": res["Cpu"]["CpuShares"], + "gpu_num": len(gpu["DeviceIDs"]) if gpu else 0, + "memory_MB": res["Memory"]["MemoryMB"], + "disk_MB": a["AllocatedResources"]["Shared"]["DiskMB"], } # Retrieve the node the jobs landed at in order to properly fill the endpoints - n = Nomad.node.get_node(a['NodeID']) - for k, v in info['endpoints'].items(): - info['endpoints'][k] = v.replace('${meta.domain}', n['Meta']['domain']) + n = Nomad.node.get_node(a["NodeID"]) + for k, v in info["endpoints"].items(): + info["endpoints"][k] = v.replace("${meta.domain}", n["Meta"]["domain"]) # Add active endpoints if full_info: - info['active_endpoints'] = [] - for k, v in info['endpoints'].items(): + info["active_endpoints"] = [] + for k, v in info["endpoints"].items(): try: # We use GET and not HEAD, because HEAD is not returning the correct status_codes (even with "allow_redirects=True") # Anyway, both latencies are almost the same when using "allow_redirects=True" @@ -269,33 +274,40 @@ def get_deployment( # * Non existing domain: GET (404), HEAD (404) | latency: ~40 ms r = session.get(v, timeout=2) if r.ok: - info['active_endpoints'].append(k) - except (requests.exceptions.Timeout, requests.exceptions.ConnectionError): + info["active_endpoints"].append(k) + except ( + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + ): continue # Disable access to endpoints if there is a network cut - if info['status'] == 'down' and info['active_endpoints']: - info['active_endpoints'] = [] + if info["status"] == "down" and info["active_endpoints"]: + info["active_endpoints"] = [] elif evals: # Something happened, job didn't deploy (eg. job needs port that's currently being used) # We have to return `placement failures message`. - info['status'] = 'error' - info['error_msg'] = f"{evals[0].get('FailedTGAllocs', '')}" + info["status"] = "error" + info["error_msg"] = f"{evals[0].get('FailedTGAllocs', '')}" else: # info['error_msg'] = f"Job has not been yet evaluated. Contact with support sharing your job ID: {j['ID']}." 
- info['status'] = 'queued' + info["status"] = "queued" # Fill info with _requested_ resources instead - res = usertask['Resources'] - gpu = [d for d in res['Devices'] if d['Name'] == 'gpu'][0] if res['Devices'] else None - info['resources'] = { - 'cpu_num': res['Cores'], - 'cpu_MHz': 0, # not known before allocation - 'gpu_num': gpu['Count'] if gpu else 0, - 'memory_MB': res['MemoryMB'], - 'disk_MB': j['TaskGroups'][0]['EphemeralDisk']['SizeMB'], + res = usertask["Resources"] + gpu = ( + [d for d in res["Devices"] if d["Name"] == "gpu"][0] + if res["Devices"] + else None + ) + info["resources"] = { + "cpu_num": res["Cores"], + "cpu_MHz": 0, # not known before allocation + "gpu_num": gpu["Count"] if gpu else 0, + "memory_MB": res["MemoryMB"], + "disk_MB": j["TaskGroups"][0]["EphemeralDisk"]["SizeMB"], } return info @@ -303,7 +315,7 @@ def get_deployment( def load_job_conf( raw_str: str, - ): +): """ Transform raw hcl string to Nomad dict object """ @@ -312,21 +324,21 @@ def load_job_conf( def create_deployment( conf: dict, - ): +): """ Submit a deployment to Nomad. """ # Submit job try: - _ = Nomad.jobs.register_job({'Job': conf}) + _ = Nomad.jobs.register_job({"Job": conf}) return { - 'status': 'success', - 'job_ID': conf['ID'], + "status": "success", + "job_ID": conf["ID"], } except Exception as e: return { - 'status': 'fail', - 'error_msg': str(e), + "status": "fail", + "error_msg": str(e), } @@ -334,7 +346,7 @@ def delete_deployment( deployment_uuid: str, namespace: str, owner: str, - ): +): """ Delete a deployment. Users can only delete their own deployments. @@ -352,12 +364,12 @@ def delete_deployment( namespace=namespace, owner=owner, full_info=False, - ) + ) # If job is in stuck status, allow deleting with purge. # Most of the time, when a job is in this status, it is due to a platform error. # It gets stuck and cannot be deleted without purge - if info['status'] in ['queued', 'complete', 'failed', 'error', 'down'] : + if info["status"] in ["queued", "complete", "failed", "error", "down"]: purge = True else: purge = False @@ -367,12 +379,12 @@ def delete_deployment( id_=deployment_uuid, namespace=namespace, purge=purge, - ) + ) - return {'status': 'success'} + return {"status": "success"} -@cached(cache=TTLCache(maxsize=1024, ttl=1*60*60)) +@cached(cache=TTLCache(maxsize=1024, ttl=1 * 60 * 60)) def get_gpu_models(vo): """ Retrieve available GPU models in the cluster, filtering nodes by VO. 
@@ -381,18 +393,18 @@ def get_gpu_models(vo): nodes = Nomad.nodes.get_nodes(resources=True) for node in nodes: # Discard nodes that don't belong to the requested VO - meta = Nomad.node.get_node(node['ID'])['Meta'] - if papiconf.MAIN_CONF['nomad']['namespaces'][vo] not in meta['namespace']: + meta = Nomad.node.get_node(node["ID"])["Meta"] + if papiconf.MAIN_CONF["nomad"]["namespaces"][vo] not in meta["namespace"]: continue # Discard GPU models of nodes that are not eligible - if node['SchedulingEligibility'] != 'eligible': + if node["SchedulingEligibility"] != "eligible": continue # Retrieve GPU models of the node - devices = node['NodeResources']['Devices'] - gpus = [d for d in devices if d['Type'] == 'gpu'] if devices else [] + devices = node["NodeResources"]["Devices"] + gpus = [d for d in devices if d["Type"] == "gpu"] if devices else [] for gpu in gpus: - gpu_models.add(gpu['Name']) + gpu_models.add(gpu["Name"]) return list(gpu_models) diff --git a/ai4papi/nomad/patches.py b/ai4papi/nomad/patches.py index 9154257..7488de6 100644 --- a/ai4papi/nomad/patches.py +++ b/ai4papi/nomad/patches.py @@ -2,6 +2,7 @@ Miscellaneous Nomad patches #TODO: remove when new nomad-python release is launched. """ + from typing import Union import nomad @@ -17,44 +18,44 @@ def deregister_job( global_: Union[bool, None] = None, namespace: Union[str, None] = None, purge: Union[bool, None] = None, - ): - """ ================================================================================ - This is a monkey-patch of the default function in the python-nomad module, - that did not support `namespace` as a parameter of the function. - - Remove when PR is merged: - https://github.com/jrxFive/python-nomad/pull/153 - - ================================================================================ - - Deregisters a job, and stops all allocations part of it. - - https://www.nomadproject.io/docs/http/job.html - - arguments: - - id - - eval_priority (int) optional. - Override the priority of the evaluations produced as a result - of this job deregistration. By default, this is set to the - priority of the job. - - global (bool) optional. - Stop a multi-region job in all its regions. By default, job - stop will stop only a single region at a time. Ignored for - single-region jobs. - - purge (bool) optional. - Specifies that the job should be stopped and purged immediately. - This means the job will not be queryable after being stopped. - If not set, the job will be purged by the garbage collector. - - namespace (str) optional. - Specifies the target namespace. If ACL is enabled, this value - must match a namespace that the token is allowed to access. - This is specified as a query string parameter. - - returns: dict - raises: - - nomad.api.exceptions.BaseNomadException - - nomad.api.exceptions.URLNotFoundNomadException - - nomad.api.exceptions.InvalidParameters +): + """================================================================================ + This is a monkey-patch of the default function in the python-nomad module, + that did not support `namespace` as a parameter of the function. + + Remove when PR is merged: + https://github.com/jrxFive/python-nomad/pull/153 + + ================================================================================ + + Deregisters a job, and stops all allocations part of it. + + https://www.nomadproject.io/docs/http/job.html + + arguments: + - id + - eval_priority (int) optional. + Override the priority of the evaluations produced as a result + of this job deregistration. 
By default, this is set to the + priority of the job. + - global (bool) optional. + Stop a multi-region job in all its regions. By default, job + stop will stop only a single region at a time. Ignored for + single-region jobs. + - purge (bool) optional. + Specifies that the job should be stopped and purged immediately. + This means the job will not be queryable after being stopped. + If not set, the job will be purged by the garbage collector. + - namespace (str) optional. + Specifies the target namespace. If ACL is enabled, this value + must match a namespace that the token is allowed to access. + This is specified as a query string parameter. + + returns: dict + raises: + - nomad.api.exceptions.BaseNomadException + - nomad.api.exceptions.URLNotFoundNomadException + - nomad.api.exceptions.InvalidParameters """ params = { "eval_priority": eval_priority, @@ -70,7 +71,7 @@ def get_allocations( id_: str, all_: Union[bool, None] = None, namespace: Union[str, None] = None, - ): +): """Query the allocations belonging to a single job. https://www.nomadproject.io/docs/http/job.html @@ -98,7 +99,7 @@ def get_evaluations( self, id_: str, namespace: Union[str, None] = None, - ): +): """Query the evaluations belonging to a single job. https://www.nomadproject.io/docs/http/job.html diff --git a/ai4papi/quotas.py b/ai4papi/quotas.py index b1d0142..cc35c11 100644 --- a/ai4papi/quotas.py +++ b/ai4papi/quotas.py @@ -1,6 +1,7 @@ """ Accounting of resources. """ + from copy import deepcopy from fastapi import HTTPException @@ -11,88 +12,91 @@ def check_jobwise( conf: dict, vo: str, - ): +): """ Check the job configuration does not overflow the generic hardware limits. """ # Retrieve generic quotas (vo-dependent) - item_name = conf['general']['docker_image'].split('/')[-1] + item_name = conf["general"]["docker_image"].split("/")[-1] ref = limit_resources( item_name=item_name, vo=vo, ) # Compare with user options - user_conf = conf['hardware'] + user_conf = conf["hardware"] for k in ref.keys(): - if 'range' in ref[k].keys(): - if user_conf[k] < ref[k]['range'][0]: + if "range" in ref[k].keys(): + if user_conf[k] < ref[k]["range"][0]: raise HTTPException( status_code=400, - detail=f"The parameter {k} should bigger or equal to {ref[k]['range'][0]}." - ) - if user_conf[k] > ref[k]['range'][1]: + detail=f"The parameter {k} should bigger or equal to {ref[k]['range'][0]}.", + ) + if user_conf[k] > ref[k]["range"][1]: raise HTTPException( status_code=400, - detail=f"The parameter {k} should smaller or equal to {ref[k]['range'][1]}." - ) + detail=f"The parameter {k} should smaller or equal to {ref[k]['range'][1]}.", + ) def check_userwise( conf: dict, deployments: dict, - ): +): """ Check the job configuration does not overflow the generic hardware limits. For example, a user cannot have more than two GPUs running/queued. """ # Aggregate user resources - user = {'gpu_num': 0} + user = {"gpu_num": 0} for d in deployments: - user['gpu_num'] += d['resources']['gpu_num'] + user["gpu_num"] += d["resources"]["gpu_num"] # Check if aggregate is within the limits - threshold = {'gpu_num': 2} - if (user['gpu_num'] + conf['hardware']['gpu_num']) > threshold['gpu_num'] and \ - conf['hardware']['gpu_num']: + threshold = {"gpu_num": 2} + if (user["gpu_num"] + conf["hardware"]["gpu_num"]) > threshold["gpu_num"] and conf[ + "hardware" + ]["gpu_num"]: # TODO: remove this last line ("and conf['hardware']['gpu_num']"") once everyone # is within the quotas. 
For the time being this line is enabling users that have # overpassed the quotas (*) to make CPU deployments. # (*) before the quotas were in place raise HTTPException( status_code=400, - detail="You already have at least 2 GPUs running and/or queued. " \ - "If you want to make a new GPU deployment please delete one of your " \ - "existing ones." - ) + detail="You already have at least 2 GPUs running and/or queued. " + "If you want to make a new GPU deployment please delete one of your " + "existing ones.", + ) def limit_resources( item_name: str, vo: str, - ): +): """ Implement hardware limits for specific users or VOs. """ # Select appropriate conf if item_name in papiconf.TOOLS.keys(): - conf = deepcopy(papiconf.TOOLS[item_name]['user']['full']) + conf = deepcopy(papiconf.TOOLS[item_name]["user"]["full"]) else: - conf = deepcopy(papiconf.MODULES['user']['full']) - conf = conf['hardware'] + conf = deepcopy(papiconf.MODULES["user"]["full"]) + conf = conf["hardware"] # Limit resources for tutorial users - if vo == 'training.egi.eu': - if 'cpu_num' in conf.keys(): + if vo == "training.egi.eu": + if "cpu_num" in conf.keys(): conf["cpu_num"]["value"] = 2 conf["cpu_num"]["range"] = [2, 4] - if 'gpu_num' in conf.keys(): + if "gpu_num" in conf.keys(): conf["gpu_num"]["range"] = [0, 0] - conf["gpu_num"]["description"] = "Tutorial users are not allowed to deploy on GPUs." - if 'ram' in conf.keys(): + conf["gpu_num"]["description"] = ( + "Tutorial users are not allowed to deploy on GPUs." + ) + if "ram" in conf.keys(): conf["ram"]["value"] = 2000 conf["ram"]["range"] = [2000, 4000] - if 'disk' in conf.keys(): + if "disk" in conf.keys(): conf["disk"]["value"] = 500 conf["disk"]["range"] = [300, 1000] diff --git a/ai4papi/routers/__init__.py b/ai4papi/routers/__init__.py index bbf8c7e..326cdf4 100644 --- a/ai4papi/routers/__init__.py +++ b/ai4papi/routers/__init__.py @@ -1 +1 @@ -from . import v1 \ No newline at end of file +from . import v1 diff --git a/ai4papi/routers/v1/__init__.py b/ai4papi/routers/v1/__init__.py index acce788..34383bd 100644 --- a/ai4papi/routers/v1/__init__.py +++ b/ai4papi/routers/v1/__init__.py @@ -1,6 +1,15 @@ import fastapi -from . import catalog, deployments, inference, secrets, stats, storage, try_me, snapshots +from . 
import ( + catalog, + deployments, + inference, + secrets, + stats, + storage, + try_me, + snapshots, +) router = fastapi.APIRouter() diff --git a/ai4papi/routers/v1/catalog/__init__.py b/ai4papi/routers/v1/catalog/__init__.py index d98ecca..eb9c31d 100644 --- a/ai4papi/routers/v1/catalog/__init__.py +++ b/ai4papi/routers/v1/catalog/__init__.py @@ -6,13 +6,13 @@ router = fastapi.APIRouter() router.include_router( router=modules.router, - prefix='/catalog', - ) + prefix="/catalog", +) router.include_router( router=tools.router, - prefix='/catalog', - ) + prefix="/catalog", +) router.include_router( router=datasets.router, - prefix='/datasets', - ) + prefix="/datasets", +) diff --git a/ai4papi/routers/v1/catalog/common.py b/ai4papi/routers/v1/catalog/common.py index ae048e7..3d294fe 100644 --- a/ai4papi/routers/v1/catalog/common.py +++ b/ai4papi/routers/v1/catalog/common.py @@ -40,12 +40,11 @@ security = HTTPBearer() -JENKINS_TOKEN = os.getenv('PAPI_JENKINS_TOKEN') +JENKINS_TOKEN = os.getenv("PAPI_JENKINS_TOKEN") class Catalog: - - def __init__(self, repo:str, item_type:str='item') -> None: + def __init__(self, repo: str, item_type: str = "item") -> None: """ Parameters: * repo: Github repo where the catalog is hosted (via git submodules) @@ -54,11 +53,10 @@ def __init__(self, repo:str, item_type:str='item') -> None: self.repo = repo self.item_type = item_type - - @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) + @cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60)) def get_items( self, - ): + ): """ Retrieve a dict of *all* items. ``` @@ -72,7 +70,9 @@ def get_items( This is implemented in a separate function as many functions from this router are using this function, so we need to avoid infinite recursions. """ - gitmodules_url = f"https://raw.githubusercontent.com/{self.repo}/master/.gitmodules" + gitmodules_url = ( + f"https://raw.githubusercontent.com/{self.repo}/master/.gitmodules" + ) r = requests.get(gitmodules_url) cfg = configparser.ConfigParser() @@ -81,26 +81,26 @@ def get_items( modules = {} for section in cfg.sections(): items = dict(cfg.items(section)) - key = items.pop('path') - items['url'] = items['url'].replace('.git', '') # remove `.git`, if present + key = items.pop("path") + items["url"] = items["url"].replace(".git", "") # remove `.git`, if present modules[key] = items # In the case of the tools repo, make sure to remove any tool that is not yet # supported by PAPI (use the ^ operator to only keep common items) - if 'tool' in self.repo: + if "tool" in self.repo: for tool_name in papiconf.TOOLS.keys() ^ modules.keys(): _ = modules.pop(tool_name) return modules - @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) + @cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60)) def get_filtered_list( self, tags: Union[Tuple, None] = Query(default=None), tags_any: Union[Tuple, None] = Query(default=None), not_tags: Union[Tuple, None] = Query(default=None), not_tags_any: Union[Tuple, None] = Query(default=None), - ): + ): """ Retrieve a list of all items. 
@@ -113,15 +113,14 @@ def get_filtered_list( # ValueError: [ValueError('dictionary update sequence element #0 has length 1; 2 is required'), TypeError('vars() argument must have __dict__ attribute')] return modules - - @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) + @cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60)) def get_summary( self, tags: Union[Tuple, None] = Query(default=None), tags_any: Union[Tuple, None] = Query(default=None), not_tags: Union[Tuple, None] = Query(default=None), not_tags_any: Union[Tuple, None] = Query(default=None), - ): + ): """ Retrieve a list of all items' basic metadata. @@ -130,23 +129,22 @@ def get_summary( """ modules = self.get_filtered_list() summary = [] - ignore = ['description', 'links'] # don't send this info to decrease latency + ignore = ["description", "links"] # don't send this info to decrease latency for m in modules: try: meta1 = self.get_metadata(m) except Exception: # Avoid breaking the whole method if failing to retrieve a module - print(f'Error retrieving metadata: {m}') + print(f"Error retrieving metadata: {m}") continue meta = {k: v for k, v in meta1.items() if k not in ignore} # filter keys - meta['name'] = m + meta["name"] = m summary.append(meta) return summary - def get_tags( self, - ): + ): """ Retrieve a list of all the existing tags. Now deprecated, kept to avoid breaking backward-compatibility. @@ -154,12 +152,14 @@ def get_tags( """ return [] - - @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60), key=lambda self, item_name: item_name,) + @cached( + cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60), + key=lambda self, item_name: item_name, + ) def get_metadata( self, item_name: str, - ): + ): """ Get the item's full metadata. """ @@ -171,31 +171,33 @@ def get_metadata( raise HTTPException( status_code=404, detail=f"Item {item_name} not in catalog: {list(items.keys())}", - ) + ) # Retrieve metadata from default branch # Use try/except to avoid that a single module formatting error could take down # all the Dashboard branch = items[item_name].get("branch", "master") - url = items[item_name]['url'].replace('github.com', 'raw.githubusercontent.com') + url = items[item_name]["url"].replace("github.com", "raw.githubusercontent.com") metadata_url = f"{url}/{branch}/ai4-metadata.yml" error = None # Try to retrieve the metadata from Github r = requests.get(metadata_url) if not r.ok: - error = \ - "The metadata of this module could not be retrieved because the " \ + error = ( + "The metadata of this module could not be retrieved because the " "module is lacking a metadata file (`ai4-metadata.yml`)." + ) else: # Try to load the YML file try: metadata = yaml.safe_load(r.text) except Exception: metadata = None - error = \ - "The metadata of this module could not be retrieved because the " \ + error = ( + "The metadata of this module could not be retrieved because the " "metadata file is badly formatted (`ai4-metadata.yml`)." 
+ ) # Since we are loading the metadata directly from the repo main branch, # we cannot know if they have successfully passed or not the Jenkins @@ -205,25 +207,28 @@ def get_metadata( schema = ai4_metadata.get_schema("2.0.0") ai4_metadata.validate.validate(instance=metadata, schema=schema) except Exception: - error = \ - "The metadata of this module has failed to comply with the " \ - "specifications of the AI4EOSC Platform (see the " \ + error = ( + "The metadata of this module has failed to comply with the " + "specifications of the AI4EOSC Platform (see the " "[metadata validator](https://github.com/ai4os/ai4-metadata))." + ) # Make sure the repo belongs to one of supported orgs pattern = r"https?:\/\/(www\.)?github\.com\/([^\/]+)\/" - match = re.search(pattern, metadata['links']['source_code']) + match = re.search(pattern, metadata["links"]["source_code"]) github_org = match.group(2) if match else None if not github_org: - error = \ - "This module does not seem to have a valid Github source code. " \ - "If you are the developer of this module, please check the " \ - "\"source_code\" link in your metadata." - if github_org not in ['ai4os', 'ai4os-hub', 'deephdc']: - error = \ - "This module belongs to a Github organization not supported by " \ - "the project. If you are the developer of this module, please " \ - "check the \"source_code\" link in your metadata." + error = ( + "This module does not seem to have a valid Github source code. " + "If you are the developer of this module, please check the " + '"source_code" link in your metadata.' + ) + if github_org not in ["ai4os", "ai4os-hub", "deephdc"]: + error = ( + "This module belongs to a Github organization not supported by " + "the project. If you are the developer of this module, please " + 'check the "source_code" link in your metadata.' + ) # If any of the previous steps raised an error, load a metadata placeholder if error: @@ -238,7 +243,7 @@ def get_metadata( "dates": { "created": "", "updated": "", - }, + }, "links": { "documentation": "", "source_code": "", @@ -258,32 +263,37 @@ def get_metadata( else: # Replace some fields with the info gathered from Github - pattern = r'github\.com/([^/]+)/([^/]+?)(?:\.git|/)?$' - match = re.search(pattern, items[item_name]['url']) + pattern = r"github\.com/([^/]+)/([^/]+?)(?:\.git|/)?$" + match = re.search(pattern, items[item_name]["url"]) if match: owner, repo = match.group(1), match.group(2) gh_info = utils.get_github_info(owner, repo) - metadata.setdefault('dates', {}) - metadata['dates']['created'] = gh_info.get('created', '') - metadata['dates']['updated'] = gh_info.get('updated', '') - metadata['license'] = gh_info.get('license', '') + metadata.setdefault("dates", {}) + metadata["dates"]["created"] = gh_info.get("created", "") + metadata["dates"]["updated"] = gh_info.get("updated", "") + metadata["license"] = gh_info.get("license", "") # Add Jenkins CI/CD links - metadata['links']['cicd_url'] = f"https://jenkins.services.ai4os.eu/job/{github_org}/job/{item_name}/job/{branch}/" - metadata['links']['cicd_badge'] = f"https://jenkins.services.ai4os.eu/buildStatus/icon?job={github_org}/{item_name}/{branch}" + metadata["links"]["cicd_url"] = ( + f"https://jenkins.services.ai4os.eu/job/{github_org}/job/{item_name}/job/{branch}/" + ) + metadata["links"]["cicd_badge"] = ( + f"https://jenkins.services.ai4os.eu/buildStatus/icon?job={github_org}/{item_name}/{branch}" + ) # Add DockerHub # TODO: when the migration is finished, we have to generate the url from the module name # (ie. 
ignore the value coming from the metadata) - metadata['links']['docker_image'] = metadata['links']['docker_image'].strip('/ ') + metadata["links"]["docker_image"] = metadata["links"]["docker_image"].strip( + "/ " + ) # Add the item name - metadata['id'] = item_name + metadata["id"] = item_name return metadata - def refresh_metadata_cache_entry( self, item_name: str, @@ -318,10 +328,9 @@ def refresh_metadata_cache_entry( except Exception as e: raise HTTPException(status_code=500, detail=str(e)) - def get_config( self, - ): + ): """ Returns the default configuration (dict) for creating a deployment for a specific item. It is prefilled with the appropriate @@ -332,8 +341,8 @@ def get_config( def retrieve_docker_tags( image: str, - repo: str = 'ai4oshub', - ): + repo: str = "ai4oshub", +): """ Retrieve tags from Dockerhub image """ @@ -346,6 +355,6 @@ def retrieve_docker_tags( raise HTTPException( status_code=400, detail=f"Could not retrieve Docker tags from {repo}/{image}.", - ) + ) tags = [i["name"] for i in r["results"]] return tags diff --git a/ai4papi/routers/v1/catalog/datasets/__init__.py b/ai4papi/routers/v1/catalog/datasets/__init__.py index 1efe3c6..84bcee3 100644 --- a/ai4papi/routers/v1/catalog/datasets/__init__.py +++ b/ai4papi/routers/v1/catalog/datasets/__init__.py @@ -6,4 +6,4 @@ router = fastapi.APIRouter() router.include_router( router=zenodo.router, - ) +) diff --git a/ai4papi/routers/v1/catalog/datasets/zenodo.py b/ai4papi/routers/v1/catalog/datasets/zenodo.py index 9916a36..ea20a90 100644 --- a/ai4papi/routers/v1/catalog/datasets/zenodo.py +++ b/ai4papi/routers/v1/catalog/datasets/zenodo.py @@ -29,20 +29,20 @@ # If available, authenticate the call to Zenodo to increase rate limit. # https://developers.zenodo.org/#rate-limiting -API_URL = 'https://zenodo.org' +API_URL = "https://zenodo.org" session = requests.Session() -zenodo_token = os.environ.get('ZENODO_TOKEN', None) +zenodo_token = os.environ.get("ZENODO_TOKEN", None) if zenodo_token: session.headers = { - 'Authorization': f'Bearer {zenodo_token}', + "Authorization": f"Bearer {zenodo_token}", } -@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) +@cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60)) def _zenodo_proxy( api_route: str, params: Union[frozenset, None] = None, - ): +): """ We use this hidden function to allow for caching responses. Otherwise error will be raised, because "authorization" param cannot be cached @@ -59,11 +59,11 @@ def _zenodo_proxy( # To avoid security issues, only allow a subset of Zenodo API (to avoid users # using *our* Zenodo token to update any record) allowed_routes = [ - '^communities', - '^communities/[a-zA-Z0-9-]+/records*$', - '^records/[0-9]+', - '^records/[0-9]+/versions*$', - ] + "^communities", + "^communities/[a-zA-Z0-9-]+/records*$", + "^records/[0-9]+", + "^records/[0-9]+/versions*$", + ] allowed = False for i in allowed_routes: if re.match(i, api_route): @@ -72,21 +72,20 @@ def _zenodo_proxy( if not allowed: raise HTTPException( status_code=400, - detail="Zenodo API route not allowed." \ - f"Allowed routes: {allowed_routes}", - ) + detail="Zenodo API route not allowed." 
f"Allowed routes: {allowed_routes}", + ) # Make the call r = session.get( f"{API_URL}/api/{api_route}", params=params, - ) + ) if not r.ok: raise HTTPException( status_code=500, detail="Failed to query Zenodo.", - ) + ) return r.json() @@ -96,7 +95,7 @@ def zenodo_proxy( api_route: str, params: Union[dict, None] = None, authorization=Depends(security), - ): +): """ Zenodo proxy diff --git a/ai4papi/routers/v1/catalog/modules.py b/ai4papi/routers/v1/catalog/modules.py index 20b5c58..57d343e 100644 --- a/ai4papi/routers/v1/catalog/modules.py +++ b/ai4papi/routers/v1/catalog/modules.py @@ -13,26 +13,26 @@ def get_config( self, item_name: str, vo: str, - ): +): # Check if module exists modules = self.get_items() if item_name not in modules.keys(): raise HTTPException( status_code=400, detail=f"{item_name} is not an available module.", - ) + ) # Retrieve module configuration - conf = deepcopy(papiconf.MODULES['user']['full']) + conf = deepcopy(papiconf.MODULES["user"]["full"]) # Retrieve module metadata metadata = self.get_metadata(item_name) # Parse docker registry - registry = metadata['links']['docker_image'] - repo, image = registry.split('/')[-2:] - if repo not in ['deephdc', 'ai4oshub']: - repo = 'ai4oshub' + registry = metadata["links"]["docker_image"] + repo, image = registry.split("/")[-2:] + if repo not in ["deephdc", "ai4oshub"]: + repo = "ai4oshub" # Fill with correct Docker image conf["general"]["docker_image"]["value"] = f"{repo}/{image}" @@ -43,7 +43,7 @@ def get_config( conf["general"]["docker_tag"]["value"] = tags[0] # Custom conf for development environment - if item_name == 'ai4os-dev-env': + if item_name == "ai4os-dev-env": # For dev-env, order the tags in "Z-A" order instead of "newest" # This is done because builds are done in parallel, so "newest" is meaningless # (Z-A + natsort) allows to show more recent semver first @@ -52,12 +52,14 @@ def get_config( conf["general"]["docker_tag"]["value"] = tags[0] # Use VS Code (Coder OSS) in the development container - conf["general"]["service"]["value"] = 'vscode' - conf["general"]["service"]["options"].insert(0, 'vscode') - conf["general"]["service"]["options"].remove('deepaas') # no models installed in dev + conf["general"]["service"]["value"] = "vscode" + conf["general"]["service"]["options"].insert(0, "vscode") + conf["general"]["service"]["options"].remove( + "deepaas" + ) # no models installed in dev # Modify the resources limits for a given user or VO - conf['hardware'] = quotas.limit_resources( + conf["hardware"] = quotas.limit_resources( item_name=item_name, vo=vo, ) @@ -71,8 +73,8 @@ def get_config( Modules = Catalog( - repo='ai4os-hub/modules-catalog', - item_type='module', + repo="ai4os-hub/modules-catalog", + item_type="module", ) Modules.get_config = types.MethodType(get_config, Modules) @@ -86,28 +88,28 @@ def get_config( "", Modules.get_filtered_list, methods=["GET"], - ) +) router.add_api_route( "/detail", Modules.get_summary, methods=["GET"], - ) +) router.add_api_route( "/tags", Modules.get_tags, methods=["GET"], deprecated=True, - ) +) router.add_api_route( "/{item_name}/metadata", Modules.get_metadata, methods=["GET"], - ) +) router.add_api_route( "/{item_name}/config", Modules.get_config, methods=["GET"], - ) +) router.add_api_route( "/{item_name}/refresh", diff --git a/ai4papi/routers/v1/catalog/tools.py b/ai4papi/routers/v1/catalog/tools.py index c5d5dab..cbcbeb1 100644 --- a/ai4papi/routers/v1/catalog/tools.py +++ b/ai4papi/routers/v1/catalog/tools.py @@ -16,7 +16,7 @@ def get_config( self, item_name: str, 
vo: str, - ): +): """ Returns the default configuration (dict) for creating a deployment for a specific item. It is prefilled with the appropriate @@ -25,24 +25,24 @@ def get_config( # Retrieve tool configuration try: - conf = deepcopy(papiconf.TOOLS[item_name]['user']['full']) + conf = deepcopy(papiconf.TOOLS[item_name]["user"]["full"]) except Exception: raise HTTPException( status_code=400, detail=f"{item_name} is not an available tool.", - ) + ) # Retrieve tool metadata metadata = self.get_metadata(item_name) # Parse docker registry - registry = metadata['links']['docker_image'] - repo, image = registry.split('/')[-2:] - if repo not in ['deephdc', 'ai4oshub']: - repo = 'ai4oshub' + registry = metadata["links"]["docker_image"] + repo, image = registry.split("/")[-2:] + if repo not in ["deephdc", "ai4oshub"]: + repo = "ai4oshub" # Fill with correct Docker image and tags (not needed for CVAT because hardcoded) - if item_name != 'ai4os-cvat': + if item_name != "ai4os-cvat": conf["general"]["docker_image"]["value"] = f"{repo}/{image}" tags = retrieve_docker_tags(image=image, repo=repo) @@ -60,8 +60,8 @@ def get_config( Tools = Catalog( - repo='ai4os/tools-catalog', - item_type='tool', + repo="ai4os/tools-catalog", + item_type="tool", ) Tools.get_config = types.MethodType(get_config, Tools) @@ -75,28 +75,28 @@ def get_config( "", Tools.get_filtered_list, methods=["GET"], - ) +) router.add_api_route( "/detail", Tools.get_summary, methods=["GET"], - ) +) router.add_api_route( "/tags", Tools.get_tags, methods=["GET"], deprecated=True, - ) +) router.add_api_route( "/{item_name}/metadata", Tools.get_metadata, methods=["GET"], - ) +) router.add_api_route( "/{item_name}/config", Tools.get_config, methods=["GET"], - ) +) router.add_api_route( "/{item_name}/refresh", Tools.refresh_metadata_cache_entry, diff --git a/ai4papi/routers/v1/deployments/__init__.py b/ai4papi/routers/v1/deployments/__init__.py index 7ae724f..e5a4de1 100644 --- a/ai4papi/routers/v1/deployments/__init__.py +++ b/ai4papi/routers/v1/deployments/__init__.py @@ -6,9 +6,9 @@ router = fastapi.APIRouter() router.include_router( router=modules.router, - prefix='/deployments', - ) + prefix="/deployments", +) router.include_router( router=tools.router, - prefix='/deployments', - ) + prefix="/deployments", +) diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py index 2c4fa39..9ee8091 100644 --- a/ai4papi/routers/v1/deployments/modules.py +++ b/ai4papi/routers/v1/deployments/modules.py @@ -1,7 +1,6 @@ from copy import deepcopy import datetime import os -import re import types from typing import Tuple, Union import uuid @@ -24,9 +23,9 @@ # When deploying in production, force the definition of a provenance token -provenance_token = os.environ.get('PAPI_PROVENANCE_TOKEN', None) +provenance_token = os.environ.get("PAPI_PROVENANCE_TOKEN", None) if not papiconf.IS_DEV and not provenance_token: - raise Exception("You need to define the variable \"PAPI_PROVENANCE_TOKEN\".") + raise Exception('You need to define the variable "PAPI_PROVENANCE_TOKEN".') @router.get("") @@ -34,7 +33,7 @@ def get_deployments( vos: Union[Tuple, None] = Query(default=None), full_info: bool = Query(default=False), authorization=Depends(security), - ): +): """ Returns a list of all deployments belonging to a user. 
@@ -51,21 +50,21 @@ def get_deployments( # If no VOs, then retrieve jobs from all user VOs # Always remove VOs that do not belong to the project if not vos: - vos = auth_info['vos'] - vos = set(vos).intersection(set(papiconf.MAIN_CONF['auth']['VO'])) + vos = auth_info["vos"] + vos = set(vos).intersection(set(papiconf.MAIN_CONF["auth"]["VO"])) if not vos: raise HTTPException( status_code=401, - detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}." - ) + detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}.", + ) user_jobs = [] for vo in vos: # Retrieve all jobs in namespace jobs = nomad.get_deployments( - namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], - owner=auth_info['id'], - prefix='module', + namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + owner=auth_info["id"], + prefix="module", ) # Retrieve info for jobs in namespace @@ -73,7 +72,7 @@ def get_deployments( try: job_info = get_deployment( vo=vo, - deployment_uuid=j['ID'], + deployment_uuid=j["ID"], full_info=full_info, authorization=types.SimpleNamespace( credentials=authorization.credentials # token @@ -82,12 +81,12 @@ def get_deployments( except HTTPException: # not a module continue except Exception as e: # unexpected error - raise(e) + raise (e) user_jobs.append(job_info) # Sort deployments by creation date - seq = [j['submit_time'] for j in user_jobs] + seq = [j["submit_time"] for j in user_jobs] args = sorted(range(len(seq)), key=seq.__getitem__)[::-1] sorted_jobs = [user_jobs[i] for i in args] @@ -100,7 +99,7 @@ def get_deployment( deployment_uuid: str, full_info: bool = Query(default=True), authorization=Depends(security), - ): +): """ Retrieve the info of a specific deployment. Format outputs to a Nomad-independent format to be used by the Dashboard @@ -119,24 +118,24 @@ def get_deployment( # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Retrieve the associated namespace to that VO - namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] + namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo] job = nomad.get_deployment( deployment_uuid=deployment_uuid, namespace=namespace, - owner=auth_info['id'], + owner=auth_info["id"], full_info=full_info, ) # Check the deployment is indeed a module - if not job['name'].startswith('module'): + if not job["name"].startswith("module"): raise HTTPException( status_code=400, detail="This deployment is not a module.", - ) + ) return job @@ -146,7 +145,7 @@ def create_deployment( vo: str, conf: Union[dict, None] = None, authorization=Depends(security), - ): +): """ Submit a deployment to Nomad. @@ -172,11 +171,11 @@ def create_deployment( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Load module configuration - nomad_conf = deepcopy(papiconf.MODULES['nomad']) - user_conf = deepcopy(papiconf.MODULES['user']['values']) + nomad_conf = deepcopy(papiconf.MODULES["nomad"]) + user_conf = deepcopy(papiconf.MODULES["user"]["values"]) # Update values conf in case we received a submitted conf if conf is not None: @@ -210,128 +209,132 @@ def create_deployment( job_uuid = uuid.uuid1() # Jobs from tutorial users should have low priority (ie. 
can be displaced if needed) - if vo == 'training.egi.eu': + if vo == "training.egi.eu": priority = 25 else: priority = 50 - base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] + base_domain = papiconf.MAIN_CONF["lb"]["domain"][vo] # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( { - 'JOB_UUID': job_uuid, - 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo], - 'PRIORITY': priority, - 'OWNER': auth_info['id'], - 'OWNER_NAME': auth_info['name'], - 'OWNER_EMAIL': auth_info['email'], - 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters - 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'BASE_DOMAIN': base_domain, - 'HOSTNAME': job_uuid, - 'DOCKER_IMAGE': user_conf['general']['docker_image'], - 'DOCKER_TAG': user_conf['general']['docker_tag'], - 'SERVICE': user_conf['general']['service'], - 'CPU_NUM': user_conf['hardware']['cpu_num'], - 'RAM': user_conf['hardware']['ram'], - 'DISK': user_conf['hardware']['disk'], - 'SHARED_MEMORY': user_conf['hardware']['ram'] * 10**6 * 0.5, + "JOB_UUID": job_uuid, + "NAMESPACE": papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + "PRIORITY": priority, + "OWNER": auth_info["id"], + "OWNER_NAME": auth_info["name"], + "OWNER_EMAIL": auth_info["email"], + "TITLE": user_conf["general"]["title"][ + :45 + ], # keep only 45 first characters + "DESCRIPTION": user_conf["general"]["desc"][ + :1000 + ], # limit to 1K characters + "BASE_DOMAIN": base_domain, + "HOSTNAME": job_uuid, + "DOCKER_IMAGE": user_conf["general"]["docker_image"], + "DOCKER_TAG": user_conf["general"]["docker_tag"], + "SERVICE": user_conf["general"]["service"], + "CPU_NUM": user_conf["hardware"]["cpu_num"], + "RAM": user_conf["hardware"]["ram"], + "DISK": user_conf["hardware"]["disk"], + "SHARED_MEMORY": user_conf["hardware"]["ram"] * 10**6 * 0.5, # Limit at 50% of RAM memory, in bytes - 'GPU_NUM': user_conf['hardware']['gpu_num'], - 'GPU_MODELNAME': user_conf['hardware']['gpu_type'], - 'JUPYTER_PASSWORD': user_conf['general']['jupyter_password'], - 'RCLONE_CONFIG_RSHARE_URL': user_conf['storage']['rclone_url'], - 'RCLONE_CONFIG_RSHARE_VENDOR': user_conf['storage']['rclone_vendor'], - 'RCLONE_CONFIG_RSHARE_USER': user_conf['storage']['rclone_user'], - 'RCLONE_CONFIG_RSHARE_PASS': user_conf['storage']['rclone_password'], - 'RCLONE_CONFIG': user_conf['storage']['rclone_conf'], - 'MAILING_TOKEN': os.getenv("MAILING_TOKEN", default=""), - 'PROJECT_NAME': papiconf.MAIN_CONF['nomad']['namespaces'][vo].upper(), - 'TODAY': str(datetime.date.today()), + "GPU_NUM": user_conf["hardware"]["gpu_num"], + "GPU_MODELNAME": user_conf["hardware"]["gpu_type"], + "JUPYTER_PASSWORD": user_conf["general"]["jupyter_password"], + "RCLONE_CONFIG_RSHARE_URL": user_conf["storage"]["rclone_url"], + "RCLONE_CONFIG_RSHARE_VENDOR": user_conf["storage"]["rclone_vendor"], + "RCLONE_CONFIG_RSHARE_USER": user_conf["storage"]["rclone_user"], + "RCLONE_CONFIG_RSHARE_PASS": user_conf["storage"]["rclone_password"], + "RCLONE_CONFIG": user_conf["storage"]["rclone_conf"], + "MAILING_TOKEN": os.getenv("MAILING_TOKEN", default=""), + "PROJECT_NAME": papiconf.MAIN_CONF["nomad"]["namespaces"][vo].upper(), + "TODAY": str(datetime.date.today()), } ) # Convert template to Nomad conf nomad_conf = nomad.load_job_conf(nomad_conf) - tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='main'][0] + tasks = nomad_conf["TaskGroups"][0]["Tasks"] + usertask = [t for t in tasks if t["Name"] == "main"][0] # Apply patches if needed 
usertask = module_patches.patch_nextcloud_mount( - user_conf['general']['docker_image'], - usertask + user_conf["general"]["docker_image"], usertask ) # Modify the GPU section - if user_conf['hardware']['gpu_num'] <= 0: + if user_conf["hardware"]["gpu_num"] <= 0: # Delete GPU section in CPU deployments - usertask['Resources']['Devices'] = None + usertask["Resources"]["Devices"] = None else: # If gpu_type not provided, remove constraint to GPU model - if not user_conf['hardware']['gpu_type']: - usertask['Resources']['Devices'][0]['Constraints'] = None + if not user_conf["hardware"]["gpu_type"]: + usertask["Resources"]["Devices"][0]["Constraints"] = None # If the image belong to Harbor, then it's a user snapshot - docker_image = user_conf['general']['docker_image'] - if docker_image.split('/')[0] == "registry.services.ai4os.eu": - + docker_image = user_conf["general"]["docker_image"] + if docker_image.split("/")[0] == "registry.services.ai4os.eu": # Check the user is the owner of the image - if docker_image.split('/')[-1] != auth_info['id'].replace('@', '_at_'): + if docker_image.split("/")[-1] != auth_info["id"].replace("@", "_at_"): raise HTTPException( status_code=401, detail="You are not the owner of the Harbor image.", - ) + ) # Check the snapshot indeed exists user_snapshots = v1.snapshots.get_harbor_snapshots( - owner=auth_info['id'], + owner=auth_info["id"], vo=vo, ) - snapshot_ids = [s['snapshot_ID'] for s in user_snapshots] - if user_conf['general']['docker_tag'] not in snapshot_ids: + snapshot_ids = [s["snapshot_ID"] for s in user_snapshots] + if user_conf["general"]["docker_tag"] not in snapshot_ids: raise HTTPException( status_code=400, detail="The snapshot does not exist.", - ) + ) # Add Harbor authentication credentials to Nomad job - usertask['Config']['auth'] = [{ - 'username': papiconf.HARBOR_USER, - 'password': papiconf.HARBOR_PASS, - }] + usertask["Config"]["auth"] = [ + { + "username": papiconf.HARBOR_USER, + "password": papiconf.HARBOR_PASS, + } + ] # If storage credentials not provided, remove all storage-related tasks - rclone = {k: v for k, v in user_conf['storage'].items() if k.startswith('rclone')} + rclone = {k: v for k, v in user_conf["storage"].items() if k.startswith("rclone")} if not all(rclone.values()): - exclude_tasks = ['storage_mount', 'storage_cleanup', 'dataset_download'] + exclude_tasks = ["storage_mount", "storage_cleanup", "dataset_download"] else: # If datasets provided, replicate 'dataset_download' task as many times as needed - if user_conf['storage']['datasets']: - download_task = [t for t in tasks if t['Name'] == 'dataset_download'][0] - for i, dataset in enumerate(user_conf['storage']['datasets']): + if user_conf["storage"]["datasets"]: + download_task = [t for t in tasks if t["Name"] == "dataset_download"][0] + for i, dataset in enumerate(user_conf["storage"]["datasets"]): t = deepcopy(download_task) - t['Env']['DOI'] = dataset['doi'] - t['Env']['FORCE_PULL'] = dataset['doi'] - t['Name'] = f'dataset_download_{i}' + t["Env"]["DOI"] = dataset["doi"] + t["Env"]["FORCE_PULL"] = dataset["doi"] + t["Name"] = f"dataset_download_{i}" tasks.append(t) # Always exclude initial 'dataset_download' task, as it is used as template - exclude_tasks = ['dataset_download'] + exclude_tasks = ["dataset_download"] # If DEEPaaS was not launched, do not launch UI because it will fail - if user_conf['general']['service'] != 'deepaas': - exclude_tasks.append('ui') + if user_conf["general"]["service"] != "deepaas": + exclude_tasks.append("ui") - tasks[:] = [t for t in 
tasks if t['Name'] not in exclude_tasks] + tasks[:] = [t for t in tasks if t["Name"] not in exclude_tasks] # Remove appropriate Traefik domains in each case (no need to remove the ports) - services = nomad_conf['TaskGroups'][0]['Services'] - if user_conf['general']['service'] == 'deepaas': - exclude_services = ['ide'] + services = nomad_conf["TaskGroups"][0]["Services"] + if user_conf["general"]["service"] == "deepaas": + exclude_services = ["ide"] else: - exclude_services = ['ui'] - services[:] = [s for s in services if s['PortLabel'] not in exclude_services] + exclude_services = ["ui"] + services[:] = [s for s in services if s["PortLabel"] not in exclude_services] # Submit job r = nomad.create_deployment(nomad_conf) @@ -344,7 +347,7 @@ def delete_deployment( vo: str, deployment_uuid: str, authorization=Depends(security), - ): +): """ Delete a deployment. Users can only delete their own deployments. @@ -356,13 +359,13 @@ def delete_deployment( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Delete deployment r = nomad.delete_deployment( deployment_uuid=deployment_uuid, - namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], - owner=auth_info['id'], + namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + owner=auth_info["id"], ) return r diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py index 49cf252..2c7fb85 100644 --- a/ai4papi/routers/v1/deployments/tools.py +++ b/ai4papi/routers/v1/deployments/tools.py @@ -1,5 +1,4 @@ from copy import deepcopy -from datetime import datetime import re import secrets import types @@ -30,7 +29,7 @@ def get_deployments( vos: Union[Tuple, None] = Query(default=None), full_info: bool = Query(default=False), authorization=Depends(security), - ): +): """ Returns a list of all deployments belonging to a user. @@ -47,21 +46,21 @@ def get_deployments( # If no VOs, then retrieve jobs from all user VOs # Always remove VOs that do not belong to the project if not vos: - vos = auth_info['vos'] - vos = set(vos).intersection(set(papiconf.MAIN_CONF['auth']['VO'])) + vos = auth_info["vos"] + vos = set(vos).intersection(set(papiconf.MAIN_CONF["auth"]["VO"])) if not vos: raise HTTPException( status_code=401, - detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}." 
- ) + detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}.", + ) user_jobs = [] for vo in vos: # Retrieve all jobs in namespace jobs = nomad.get_deployments( - namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], - owner=auth_info['id'], - prefix='tool', + namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + owner=auth_info["id"], + prefix="tool", ) # Retrieve info for jobs in namespace @@ -69,7 +68,7 @@ def get_deployments( try: job_info = get_deployment( vo=vo, - deployment_uuid=j['ID'], + deployment_uuid=j["ID"], full_info=full_info, authorization=types.SimpleNamespace( credentials=authorization.credentials # token @@ -78,12 +77,12 @@ def get_deployments( except HTTPException: # not a tool continue except Exception as e: # unexpected error - raise(e) + raise (e) user_jobs.append(job_info) # Sort deployments by creation date - seq = [j['submit_time'] for j in user_jobs] + seq = [j["submit_time"] for j in user_jobs] args = sorted(range(len(seq)), key=seq.__getitem__)[::-1] sorted_jobs = [user_jobs[i] for i in args] @@ -96,7 +95,7 @@ def get_deployment( deployment_uuid: str, full_info: bool = Query(default=True), authorization=Depends(security), - ): +): """ Retrieve the info of a specific deployment. Format outputs to a Nomad-independent format to be used by the Dashboard @@ -111,39 +110,43 @@ def get_deployment( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Retrieve the associated namespace to that VO - namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] + namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo] job = nomad.get_deployment( deployment_uuid=deployment_uuid, namespace=namespace, - owner=auth_info['id'], + owner=auth_info["id"], full_info=full_info, ) # Check the deployment is indeed a tool - if not job['name'].startswith('tool'): + if not job["name"].startswith("tool"): raise HTTPException( status_code=400, detail="This deployment is not a tool.", - ) + ) # Add an additional field with the tool type # We map name from Nomad job to tool ID - match = re.search(r'tool-(.*?)-[a-f0-9-]{36}', job['name']) - nomad_name = match.group(1) if match else '' - tool_id = papiconf.tools_nomad2id.get(nomad_name, '') - job['tool_name'] = tool_id + match = re.search(r"tool-(.*?)-[a-f0-9-]{36}", job["name"]) + nomad_name = match.group(1) if match else "" + tool_id = papiconf.tools_nomad2id.get(nomad_name, "") + job["tool_name"] = tool_id # Additional checks - if tool_id == 'ai4os-cvat': + if tool_id == "ai4os-cvat": # Remove useless endpoints (they all point to same url) - ignore = ['server', 'grafana'] - job['endpoints'] = {k: v for k, v in job['endpoints'].items() if k not in ignore} - if job['active_endpoints']: - job['active_endpoints'] = [k for k in job['active_endpoints'] if k not in ignore] + ignore = ["server", "grafana"] + job["endpoints"] = { + k: v for k, v in job["endpoints"].items() if k not in ignore + } + if job["active_endpoints"]: + job["active_endpoints"] = [ + k for k in job["active_endpoints"] if k not in ignore + ] return job @@ -154,7 +157,7 @@ def create_deployment( tool_name: str, conf: Union[dict, None] = None, authorization=Depends(security), - ): +): """ Submit a deployment to Nomad. 
@@ -180,18 +183,18 @@ def create_deployment( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Check tool_ID if tool_name not in Tools_catalog.get_items().keys(): raise HTTPException( status_code=400, detail="This ID does not correspond to an available tool.", - ) + ) # Load tool configuration - nomad_conf = deepcopy(papiconf.TOOLS[tool_name]['nomad']) - user_conf = deepcopy(papiconf.TOOLS[tool_name]['user']['values']) + nomad_conf = deepcopy(papiconf.TOOLS[tool_name]["nomad"]) + user_conf = deepcopy(papiconf.TOOLS[tool_name]["user"]["values"]) # Update values conf in case we received a submitted conf if conf is not None: @@ -205,7 +208,7 @@ def create_deployment( # Check if the provided configuration is within the job quotas # Skip this check with CVAT because it does not have a "hardware" section in the conf - if tool_name not in ['ai4os-cvat']: + if tool_name not in ["ai4os-cvat"]: quotas.check_jobwise( conf=user_conf, vo=vo, @@ -215,21 +218,20 @@ def create_deployment( job_uuid = uuid.uuid1() # Jobs from tutorial users should have low priority (ie. can be displaced if needed) - if vo == 'training.egi.eu': + if vo == "training.egi.eu": priority = 25 else: priority = 50 - base_domain = papiconf.MAIN_CONF['lb']['domain'][vo] + base_domain = papiconf.MAIN_CONF["lb"]["domain"][vo] # Deploy a Federated server - if tool_name == 'ai4os-federated-server': - + if tool_name == "ai4os-federated-server": # Create a default secret for the Federated Server _ = ai4secrets.create_secret( vo=vo, secret_path=f"deployments/{job_uuid}/federated/default", - secret_data={'token': secrets.token_hex()}, + secret_data={"token": secrets.token_hex()}, authorization=SimpleNamespace( credentials=authorization.credentials, ), @@ -238,107 +240,124 @@ def create_deployment( # Create a Vault token so that the deployment can access the Federated secret vault_token = ai4secrets.create_vault_token( jwt=authorization.credentials, - issuer=auth_info['issuer'], - ttl='365d', # 1 year expiration date + issuer=auth_info["issuer"], + ttl="365d", # 1 year expiration date ) # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( { - 'JOB_UUID': job_uuid, - 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo], - 'PRIORITY': priority, - 'OWNER': auth_info['id'], - 'OWNER_NAME': auth_info['name'], - 'OWNER_EMAIL': auth_info['email'], - 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters - 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'BASE_DOMAIN': base_domain, - 'HOSTNAME': job_uuid, - 'DOCKER_IMAGE': user_conf['general']['docker_image'], - 'DOCKER_TAG': user_conf['general']['docker_tag'], - 'CPU_NUM': user_conf['hardware']['cpu_num'], - 'RAM': user_conf['hardware']['ram'], - 'DISK': user_conf['hardware']['disk'], - 'SHARED_MEMORY': user_conf['hardware']['ram'] * 10**6 * 0.5, + "JOB_UUID": job_uuid, + "NAMESPACE": papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + "PRIORITY": priority, + "OWNER": auth_info["id"], + "OWNER_NAME": auth_info["name"], + "OWNER_EMAIL": auth_info["email"], + "TITLE": user_conf["general"]["title"][ + :45 + ], # keep only 45 first characters + "DESCRIPTION": user_conf["general"]["desc"][ + :1000 + ], # limit to 1K characters + "BASE_DOMAIN": base_domain, + "HOSTNAME": job_uuid, + "DOCKER_IMAGE": user_conf["general"]["docker_image"], + "DOCKER_TAG": 
user_conf["general"]["docker_tag"], + "CPU_NUM": user_conf["hardware"]["cpu_num"], + "RAM": user_conf["hardware"]["ram"], + "DISK": user_conf["hardware"]["disk"], + "SHARED_MEMORY": user_conf["hardware"]["ram"] * 10**6 * 0.5, # Limit at 50% of RAM memory, in bytes - 'JUPYTER_PASSWORD': user_conf['general']['jupyter_password'], - 'VAULT_TOKEN': vault_token, - 'FEDERATED_ROUNDS': user_conf['configuration']['rounds'], - 'FEDERATED_METRIC': user_conf['configuration']['metric'], - 'FEDERATED_MIN_FIT_CLIENTS': user_conf['configuration']['min_fit_clients'], - 'FEDERATED_MIN_AVAILABLE_CLIENTS': user_conf['configuration']['min_available_clients'], - 'FEDERATED_STRATEGY': user_conf['configuration']['strategy'], - 'MU_FEDPROX': user_conf['configuration']['mu'], - 'FEDAVGM_SERVER_FL' : user_conf['configuration']['fl'], - 'FEDAVGM_SERVER_MOMENTUM': user_conf['configuration']['momentum'], - 'DP': user_conf['configuration']['dp'], - 'NOISE_MULT': user_conf['configuration']['noise_mult'], - 'SAMPLED_CLIENTS': user_conf['configuration']['sampled_clients'], - 'CLIP_NORM': user_conf['configuration']['clip_norm'] + "JUPYTER_PASSWORD": user_conf["general"]["jupyter_password"], + "VAULT_TOKEN": vault_token, + "FEDERATED_ROUNDS": user_conf["configuration"]["rounds"], + "FEDERATED_METRIC": user_conf["configuration"]["metric"], + "FEDERATED_MIN_FIT_CLIENTS": user_conf["configuration"][ + "min_fit_clients" + ], + "FEDERATED_MIN_AVAILABLE_CLIENTS": user_conf["configuration"][ + "min_available_clients" + ], + "FEDERATED_STRATEGY": user_conf["configuration"]["strategy"], + "MU_FEDPROX": user_conf["configuration"]["mu"], + "FEDAVGM_SERVER_FL": user_conf["configuration"]["fl"], + "FEDAVGM_SERVER_MOMENTUM": user_conf["configuration"]["momentum"], + "DP": user_conf["configuration"]["dp"], + "NOISE_MULT": user_conf["configuration"]["noise_mult"], + "SAMPLED_CLIENTS": user_conf["configuration"]["sampled_clients"], + "CLIP_NORM": user_conf["configuration"]["clip_norm"], } ) # Convert template to Nomad conf nomad_conf = nomad.load_job_conf(nomad_conf) - tasks = nomad_conf['TaskGroups'][0]['Tasks'] - usertask = [t for t in tasks if t['Name']=='main'][0] + tasks = nomad_conf["TaskGroups"][0]["Tasks"] + usertask = [t for t in tasks if t["Name"] == "main"][0] # Launch `deep-start` compatible service if needed - service = user_conf['general']['service'] - if service in ['deepaas', 'jupyter', 'vscode']: - usertask['Config']['command'] = 'deep-start' - usertask['Config']['args'] = [f'--{service}'] + service = user_conf["general"]["service"] + if service in ["deepaas", "jupyter", "vscode"]: + usertask["Config"]["command"] = "deep-start" + usertask["Config"]["args"] = [f"--{service}"] # Deploy a CVAT tool - elif tool_name == 'ai4os-cvat': - + elif tool_name == "ai4os-cvat": # Enforce defining CVAT username and password - cvat = {k: v for k, v in user_conf['general'].items() if k in ['cvat_username', 'cvat_password']} + cvat = { + k: v + for k, v in user_conf["general"].items() + if k in ["cvat_username", "cvat_password"] + } if not all(cvat.values()): raise HTTPException( status_code=400, detail="You must fill all CVAT-related variables.", - ) + ) # Enforce all rclone vars are defined - rclone = {k: v for k, v in user_conf['storage'].items() if k.startswith('rclone')} + rclone = { + k: v for k, v in user_conf["storage"].items() if k.startswith("rclone") + } if not all(rclone.values()): raise HTTPException( status_code=400, detail="You must fill all RCLONE-related variables.", - ) + ) # Replace the Nomad job template job_title = 
re.sub( r'[<>:"/\\|?* ]', - '_', - user_conf['general']['title'][:45], - ) # make title foldername-friendly + "_", + user_conf["general"]["title"][:45], + ) # make title foldername-friendly nomad_conf = nomad_conf.safe_substitute( { - 'JOB_UUID': job_uuid, - 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo], - 'PRIORITY': priority, - 'OWNER': auth_info['id'], - 'OWNER_NAME': auth_info['name'], - 'OWNER_EMAIL': auth_info['email'], - 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters - 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters - 'BASE_DOMAIN': base_domain, - 'HOSTNAME': job_uuid, - 'CVAT_USERNAME': user_conf['general']['cvat_username'], - 'CVAT_PASSWORD': user_conf['general']['cvat_password'], - 'RESTORE_FROM': user_conf['storage']['cvat_backup'], - 'BACKUP_NAME': f'{job_title}', - 'RCLONE_CONFIG_RSHARE_URL': user_conf['storage']['rclone_url'], - 'RCLONE_CONFIG_RSHARE_VENDOR': user_conf['storage']['rclone_vendor'], - 'RCLONE_CONFIG_RSHARE_USER': user_conf['storage']['rclone_user'], - 'RCLONE_CONFIG_RSHARE_PASS': user_conf['storage']['rclone_password'], - 'RCLONE_CONFIG': user_conf['storage']['rclone_conf'], - } + "JOB_UUID": job_uuid, + "NAMESPACE": papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + "PRIORITY": priority, + "OWNER": auth_info["id"], + "OWNER_NAME": auth_info["name"], + "OWNER_EMAIL": auth_info["email"], + "TITLE": user_conf["general"]["title"][ + :45 + ], # keep only 45 first characters + "DESCRIPTION": user_conf["general"]["desc"][ + :1000 + ], # limit to 1K characters + "BASE_DOMAIN": base_domain, + "HOSTNAME": job_uuid, + "CVAT_USERNAME": user_conf["general"]["cvat_username"], + "CVAT_PASSWORD": user_conf["general"]["cvat_password"], + "RESTORE_FROM": user_conf["storage"]["cvat_backup"], + "BACKUP_NAME": f"{job_title}", + "RCLONE_CONFIG_RSHARE_URL": user_conf["storage"]["rclone_url"], + "RCLONE_CONFIG_RSHARE_VENDOR": user_conf["storage"]["rclone_vendor"], + "RCLONE_CONFIG_RSHARE_USER": user_conf["storage"]["rclone_user"], + "RCLONE_CONFIG_RSHARE_PASS": user_conf["storage"]["rclone_password"], + "RCLONE_CONFIG": user_conf["storage"]["rclone_conf"], + } ) # Convert template to Nomad conf @@ -355,7 +374,7 @@ def delete_deployment( vo: str, deployment_uuid: str, authorization=Depends(security), - ): +): """ Delete a deployment. Users can only delete their own deployments. 
@@ -367,13 +386,13 @@ def delete_deployment( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Delete deployment r = nomad.delete_deployment( deployment_uuid=deployment_uuid, - namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo], - owner=auth_info['id'], + namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo], + owner=auth_info["id"], ) # Remove Vault secrets belonging to that deployment diff --git a/ai4papi/routers/v1/inference/__init__.py b/ai4papi/routers/v1/inference/__init__.py index c30d6e7..2c6eacc 100644 --- a/ai4papi/routers/v1/inference/__init__.py +++ b/ai4papi/routers/v1/inference/__init__.py @@ -6,5 +6,5 @@ router = fastapi.APIRouter() router.include_router( router=oscar.router, - prefix='/inference', - ) + prefix="/inference", +) diff --git a/ai4papi/routers/v1/inference/oscar.py b/ai4papi/routers/v1/inference/oscar.py index 527dd9d..d436925 100644 --- a/ai4papi/routers/v1/inference/oscar.py +++ b/ai4papi/routers/v1/inference/oscar.py @@ -1,6 +1,7 @@ """ Manage OSCAR clusters to create and execute services. """ + from copy import deepcopy from datetime import datetime from functools import wraps @@ -25,15 +26,16 @@ responses={404: {"description": "Inference not found"}}, ) + class Service(BaseModel): image: str cpu: NonNegativeInt = 2 memory: NonNegativeInt = 3000 allowed_users: List[str] = [] # no additional users by default - title: str = '' + title: str = "" # Not configurable - _name: str = '' # filled by PAPI with UUID + _name: str = "" # filled by PAPI with UUID model_config = { "json_schema_extra": { @@ -43,12 +45,13 @@ class Service(BaseModel): "image": "deephdc/deep-oc-image-classification-tf", "cpu": 2, "memory": 3000, - "allowed_users": [] + "allowed_users": [], } ] } } + security = HTTPBearer() @@ -56,9 +59,9 @@ def raise_for_status(func): """ Raise HTML error if the response of OSCAR functions has status!=2**. """ + @wraps(func) def wrapper(*args, **kwargs): - # Catch first errors happening internally try: r = func(*args, **kwargs) @@ -66,12 +69,12 @@ def wrapper(*args, **kwargs): raise HTTPException( status_code=400, detail=e, - ) + ) except requests.exceptions.HTTPError as e: raise HTTPException( status_code=500, detail=e, - ) + ) # Catch errors when the function itself does not raise errors but the response # has a non-successful code @@ -81,7 +84,7 @@ def wrapper(*args, **kwargs): raise HTTPException( status_code=r.status_code, detail=r.text, - ) + ) return wrapper @@ -91,11 +94,11 @@ def get_client_from_auth(token, vo): Retrieve authenticated user info and init OSCAR client. 
""" client_options = { - 'cluster_id': MAIN_CONF["oscar"]["clusters"][vo]['cluster_id'], - 'endpoint': MAIN_CONF["oscar"]["clusters"][vo]['endpoint'], - 'oidc_token': token, - 'ssl': 'true', - } + "cluster_id": MAIN_CONF["oscar"]["clusters"][vo]["cluster_id"], + "endpoint": MAIN_CONF["oscar"]["clusters"][vo]["endpoint"], + "oidc_token": token, + "ssl": "true", + } try: client = Client(client_options) @@ -115,22 +118,21 @@ def get_client_from_auth(token, vo): def make_service_definition(svc_conf, vo): - # Create service definition service = deepcopy(OSCAR_TMPL) # init from template service = service.safe_substitute( { - 'CLUSTER_ID': MAIN_CONF["oscar"]["clusters"][vo]["cluster_id"], - 'NAME': svc_conf._name, - 'IMAGE': svc_conf.image, - 'CPU': svc_conf.cpu, - 'MEMORY': svc_conf.memory, - 'ALLOWED_USERS': svc_conf.allowed_users, - 'VO': vo, - 'ENV_VARS': { - 'Variables':{ - 'PAPI_TITLE': svc_conf.title, - 'PAPI_CREATED': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + "CLUSTER_ID": MAIN_CONF["oscar"]["clusters"][vo]["cluster_id"], + "NAME": svc_conf._name, + "IMAGE": svc_conf.image, + "CPU": svc_conf.cpu, + "MEMORY": svc_conf.memory, + "ALLOWED_USERS": svc_conf.allowed_users, + "VO": vo, + "ENV_VARS": { + "Variables": { + "PAPI_TITLE": svc_conf.title, + "PAPI_CREATED": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), }, }, } @@ -144,14 +146,14 @@ def make_service_definition(svc_conf, vo): def get_cluster_info( vo: str, authorization=Depends(security), - ): +): """ Gets information about the cluster. - Returns a JSON with the cluster information. """ # Retrieve authenticated user info auth_info = auth.get_user_info(authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Get cluster info client = get_client_from_auth(authorization.credentials, vo) @@ -165,7 +167,7 @@ def get_services_list( vo: str, public: bool = Query(default=False), authorization=Depends(security), - ): +): """ Retrieves a list of all the deployed services of the cluster. 
@@ -177,7 +179,7 @@ def get_services_list( """ # Retrieve authenticated user info auth_info = auth.get_user_info(authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Get services list client = get_client_from_auth(authorization.credentials, vo) @@ -186,27 +188,26 @@ def get_services_list( # Filter services services = [] for s in json.loads(r.text): - # Filter out public services, if requested - if not (s.get('allowed_users', None) or public): + if not (s.get("allowed_users", None) or public): continue # Retrieve only services launched by PAPI - if not s.get('name', '').startswith('ai4papi-'): + if not s.get("name", "").startswith("ai4papi-"): continue # Keep only services that belong to vo - if vo not in s.get('vo', []): + if vo not in s.get("vo", []): continue # Add service endpoint cluster_endpoint = MAIN_CONF["oscar"]["clusters"][vo]["endpoint"] - s['endpoint'] = f"{cluster_endpoint}/run/{s['name']}" + s["endpoint"] = f"{cluster_endpoint}/run/{s['name']}" services.append(s) # Sort services by creation time, recent to old - dates = [s['environment']['Variables']['PAPI_CREATED'] for s in services] + dates = [s["environment"]["Variables"]["PAPI_CREATED"] for s in services] idxs = sorted(range(len(dates)), key=dates.__getitem__) # argsort sorted_services = [services[i] for i in idxs[::-1]] @@ -218,14 +219,14 @@ def get_service( vo: str, service_name: str, authorization=Depends(security), - ): +): """ Retrieves a specific service. - Returns a JSON with the cluster information. """ # Retrieve authenticated user info auth_info = auth.get_user_info(authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Get service client = get_client_from_auth(authorization.credentials, vo) @@ -234,7 +235,7 @@ def get_service( # Add service endpoint cluster_endpoint = MAIN_CONF["oscar"]["clusters"][vo]["endpoint"] - service['endpoint'] = f"{cluster_endpoint}/run/{service_name}" + service["endpoint"] = f"{cluster_endpoint}/run/{service_name}" return service @@ -244,25 +245,25 @@ def create_service( vo: str, svc_conf: Service, authorization=Depends(security), - ): +): """ Creates a new inference service for an AI pre-trained model on a specific cluster. """ # Retrieve authenticated user info auth_info = auth.get_user_info(authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Assign random UUID to service to avoid clashes # We clip it because OSCAR only seems to support names smaller than 39 characters - svc_conf._name = f'ai4papi-{uuid.uuid1()}'[:39] + svc_conf._name = f"ai4papi-{uuid.uuid1()}"[:39] # Create service definition service_definition = make_service_definition(svc_conf, vo) - service_definition['allowed_users'] += [auth_info['id']] # add service owner + service_definition["allowed_users"] += [auth_info["id"]] # add service owner # Update service client = get_client_from_auth(authorization.credentials, vo) - r = client.create_service(service_definition) + _ = client.create_service(service_definition) return svc_conf._name @@ -273,23 +274,23 @@ def update_service( service_name: str, svc_conf: Service, authorization=Depends(security), - ): +): """ Updates service if it exists. The method needs all service parameters to be on the request. 
""" # Retrieve authenticated user info auth_info = auth.get_user_info(authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Create service definition svc_conf._name = service_name service_definition = make_service_definition(svc_conf, vo) - service_definition['allowed_users'] += [auth_info['id']] # add service owner + service_definition["allowed_users"] += [auth_info["id"]] # add service owner # Update service client = get_client_from_auth(authorization.credentials, vo) - r = client.update_service(svc_conf._name, service_definition) + _ = client.update_service(svc_conf._name, service_definition) return service_name @@ -299,17 +300,17 @@ def delete_service( vo: str, service_name: str, authorization=Depends(security), - ): +): """ Delete a specific service. Raises 500 if the service does not exists. """ # Retrieve authenticated user info auth_info = auth.get_user_info(authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Delete service client = get_client_from_auth(authorization.credentials, vo) - r = client.remove_service(service_name) + _ = client.remove_service(service_name) return service_name diff --git a/ai4papi/routers/v1/secrets.py b/ai4papi/routers/v1/secrets.py index 47c79c4..30f40e3 100644 --- a/ai4papi/routers/v1/secrets.py +++ b/ai4papi/routers/v1/secrets.py @@ -29,22 +29,22 @@ def vault_client(jwt, issuer): Common init steps of Vault client """ # Check we are using EGI Check-In prod - if issuer != 'https://aai.egi.eu/auth/realms/egi': + if issuer != "https://aai.egi.eu/auth/realms/egi": raise HTTPException( status_code=400, - detail="Secrets are only compatible with EGI Check-In Production OIDC " \ - "provider.", - ) + detail="Secrets are only compatible with EGI Check-In Production OIDC " + "provider.", + ) # Init the Vault client client = hvac.Client( url=VAULT_ADDR, - ) + ) client.auth.jwt.jwt_login( role=VAULT_ROLE, jwt=jwt, path=VAULT_AUTH_PATH, - ) + ) return client @@ -52,8 +52,8 @@ def vault_client(jwt, issuer): def create_vault_token( jwt, issuer, - ttl='1h', - ): + ttl="1h", +): """ Create a Vault token from a JWT. @@ -70,7 +70,7 @@ def create_vault_token( # So instead of creating a child token, we have to *extend* login token. client.auth.token.renew_self(increment=ttl) - #TODO: for extra security we should only allow reading/listing from a given subpath. + # TODO: for extra security we should only allow reading/listing from a given subpath. # - Restrict to read/list can be done with user roles # - Restricting subpaths might not be done because policies are static (and # deployment paths are dynamic). In addition only admins can create policies) @@ -86,12 +86,11 @@ def recursive_path_builder(client, kv_list): # if any list items end in '/' return 1 for li in kv_list[:]: - if li[-1] == '/': + if li[-1] == "/": r = client.secrets.kv.v1.list_secrets( - path=li, - mount_point=VAULT_MOUNT_POINT + path=li, mount_point=VAULT_MOUNT_POINT ) - append_list = r['data']['keys'] + append_list = r["data"]["keys"] for new_item in append_list: kv_list.append(li + new_item) # remove list item ending in '/' @@ -108,9 +107,9 @@ def recursive_path_builder(client, kv_list): @router.get("") def get_secrets( vo: str, - subpath: str = '', + subpath: str = "", authorization=Depends(security), - ): +): """ Returns a list of secrets belonging to a user. 
@@ -123,28 +122,27 @@ def get_secrets( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Init the Vault client client = vault_client( jwt=authorization.credentials, - issuer=auth_info['issuer'], + issuer=auth_info["issuer"], ) # Check subpath syntax - if not subpath.startswith('/'): - subpath = '/' + subpath - if not subpath.endswith('/'): - subpath += '/' + if not subpath.startswith("/"): + subpath = "/" + subpath + if not subpath.endswith("/"): + subpath += "/" # Retrieve initial level-0 secrets user_path = f"users/{auth_info['id']}/{vo}" try: r = client.secrets.kv.v1.list_secrets( - path = user_path + subpath, - mount_point=VAULT_MOUNT_POINT + path=user_path + subpath, mount_point=VAULT_MOUNT_POINT ) - seed_list = r['data']['keys'] + seed_list = r["data"]["keys"] except hvac.exceptions.InvalidPath: # InvalidPath is raised when there are no secrets available return {} @@ -163,8 +161,8 @@ def get_secrets( ) # Remove user-path prefix and save - secret_path = secret_path.replace(user_path, '') - out[secret_path] = r1['data'] + secret_path = secret_path.replace(user_path, "") + out[secret_path] = r1["data"] return out @@ -175,7 +173,7 @@ def create_secret( secret_path: str, secret_data: dict, authorization=Depends(security), - ): +): """ Creates a new secret or updates an existing one. @@ -191,22 +189,22 @@ def create_secret( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Init the Vault client client = vault_client( jwt=authorization.credentials, - issuer=auth_info['issuer'], + issuer=auth_info["issuer"], ) # Create secret client.secrets.kv.v1.create_or_update_secret( path=f"users/{auth_info['id']}/{vo}/{secret_path}", - mount_point='/secrets/', + mount_point="/secrets/", secret=secret_data, ) - return {'status': 'success'} + return {"status": "success"} @router.delete("") @@ -214,7 +212,7 @@ def delete_secret( vo: str, secret_path: str, authorization=Depends(security), - ): +): """ Delete a secret. 
@@ -227,12 +225,12 @@ def delete_secret( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Init the Vault client client = vault_client( jwt=authorization.credentials, - issuer=auth_info['issuer'], + issuer=auth_info["issuer"], ) # Delete secret @@ -241,4 +239,4 @@ def delete_secret( mount_point=VAULT_MOUNT_POINT, ) - return {'status': 'success'} + return {"status": "success"} diff --git a/ai4papi/routers/v1/snapshots.py b/ai4papi/routers/v1/snapshots.py index f6a0920..f1943d9 100644 --- a/ai4papi/routers/v1/snapshots.py +++ b/ai4papi/routers/v1/snapshots.py @@ -160,7 +160,7 @@ def create_snapshot( nomad_conf = nomad_common.load_job_conf(nomad_conf) # Submit job - r = nomad_common.create_deployment(nomad_conf) + _ = nomad_common.create_deployment(nomad_conf) return { "status": "success", @@ -318,7 +318,6 @@ def get_nomad_snapshots( # user_jobs = [] snapshots = [] for j in jobs: - # Get job to retrieve the metadata job_info = Nomad.job.get_job( id_=j["ID"], @@ -355,7 +354,9 @@ def get_nomad_snapshots( ][::-1] # more recent first # Retrieve tasks - tasks = allocs[0]["TaskStates"] if allocs else {} # if no allocations, use empty dict + tasks = ( + allocs[0]["TaskStates"] if allocs else {} + ) # if no allocations, use empty dict tasks = tasks or {} # if None, use empty dict client_status = allocs[0]["ClientStatus"] if allocs else None diff --git a/ai4papi/routers/v1/stats/__init__.py b/ai4papi/routers/v1/stats/__init__.py index df69733..54e786f 100644 --- a/ai4papi/routers/v1/stats/__init__.py +++ b/ai4papi/routers/v1/stats/__init__.py @@ -6,5 +6,5 @@ router = fastapi.APIRouter() router.include_router( router=deployments.router, - prefix='/deployments', - ) + prefix="/deployments", +) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index e399bc2..78c355e 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -29,63 +29,60 @@ main_dir = Path(__file__).resolve().parent Nomad = nomad.Nomad() -Nomad.job.get_allocations = types.MethodType( - npatches.get_allocations, - Nomad.job -) +Nomad.job.get_allocations = types.MethodType(npatches.get_allocations, Nomad.job) cluster_stats = None -@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) +@cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60)) def load_stats( namespace: str, - ): +): """ CSV reader and data filtering could be improved with Pandas, but that's a heavy dependency, so we're keeping it like this for the moment. 
""" - main_dir = os.environ.get('ACCOUNTING_PTH', None) + main_dir = os.environ.get("ACCOUNTING_PTH", None) if not main_dir: raise HTTPException( status_code=500, detail="Deployments stats information not available (no env var).", - ) + ) # Load all stats files stats = {} - for name in ['full-agg', 'timeseries', 'users-agg']: - pth = Path(main_dir) / 'summaries' / f'{namespace}-{name}.csv' + for name in ["full-agg", "timeseries", "users-agg"]: + pth = Path(main_dir) / "summaries" / f"{namespace}-{name}.csv" if not pth.is_file(): raise HTTPException( status_code=500, detail="Deployments stats information not available (missing file).", - ) + ) - with open(pth, 'r') as f: - reader = csv.DictReader(f, delimiter=';') + with open(pth, "r") as f: + reader = csv.DictReader(f, delimiter=";") stats[name] = {k: [] for k in reader.fieldnames} for row in reader: for k, v in row.items(): - if k not in ['date', 'owner']: - v= int(v) + if k not in ["date", "owner"]: + v = int(v) stats[name][k].append(v) # In VO timeseries, only return last three months threshold = datetime.now() - timedelta(days=90) threshold = str(threshold.date()) try: - idx = [i > threshold for i in stats['timeseries']['date']].index(True) + idx = [i > threshold for i in stats["timeseries"]["date"]].index(True) except Exception: # If there are no data in the last 90 days, then return last 90 dates idx = -90 - for k, v in stats['timeseries'].items(): - stats['timeseries'][k] = v[idx:] + for k, v in stats["timeseries"].items(): + stats["timeseries"][k] = v[idx:] # Namespace aggregates are not lists - stats['full-agg'] = {k: v[0] for k, v in stats['full-agg'].items()} + stats["full-agg"] = {k: v[0] for k, v in stats["full-agg"].items()} return stats @@ -94,7 +91,7 @@ def load_stats( def get_user_stats( vo: str, authorization=Depends(security), - ): +): """ Returns the following stats (per resource type): * the time-series usage of that VO @@ -107,10 +104,10 @@ def get_user_stats( # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Retrieve the associated namespace to that VO - namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] + namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo] # Load proper namespace stats full_stats = load_stats(namespace=namespace) @@ -118,63 +115,66 @@ def get_user_stats( # Keep only stats from the current user user_stats = copy.deepcopy(full_stats) try: - idx = full_stats['users-agg']['owner'].index(auth_info['id']) - user_stats['users-agg'] = {k: v[idx] for k, v in full_stats['users-agg'].items()} + idx = full_stats["users-agg"]["owner"].index(auth_info["id"]) + user_stats["users-agg"] = { + k: v[idx] for k, v in full_stats["users-agg"].items() + } except ValueError: # user has still no recorded stats - user_stats['users-agg'] = None + user_stats["users-agg"] = None return user_stats def get_proper_allocation(allocs): - - # Reorder allocations based on recency - dates = [a['CreateTime'] for a in allocs] - allocs = [x for _, x in sorted( + # Reorder allocations based on recency + dates = [a["CreateTime"] for a in allocs] + allocs = [ + x + for _, x in sorted( zip(dates, allocs), key=lambda pair: pair[0], - )][::-1] # more recent first - - # Select the proper allocation - statuses = [a['ClientStatus'] for a in allocs] - if 'unknown' in statuses: - # The node has lost connection. 
Avoid showing temporary reallocated job, - # to avoid confusions when the original allocation is restored back again. - idx = statuses.index('unknown') - elif 'running' in statuses: - # If an allocation is running, return that allocation - # It happens that after a network cut, when the network is restored, - # the temporary allocation created in the meantime (now with status - # 'complete') is more recent than the original allocation that we - # recovered (with status 'running'), so using only recency does not work. - idx = statuses.index('running') - else: - # Return most recent allocation - idx = 0 - - return allocs[idx]['ID'] - - -@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60)) + ) + ][::-1] # more recent first + + # Select the proper allocation + statuses = [a["ClientStatus"] for a in allocs] + if "unknown" in statuses: + # The node has lost connection. Avoid showing temporary reallocated job, + # to avoid confusions when the original allocation is restored back again. + idx = statuses.index("unknown") + elif "running" in statuses: + # If an allocation is running, return that allocation + # It happens that after a network cut, when the network is restored, + # the temporary allocation created in the meantime (now with status + # 'complete') is more recent than the original allocation that we + # recovered (with status 'running'), so using only recency does not work. + idx = statuses.index("running") + else: + # Return most recent allocation + idx = 0 + + return allocs[idx]["ID"] + + +@cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60)) def load_datacenters(): - # Check if datacenter info file is available - pth = papiconf.main_path.parent / 'var' / 'datacenters_info.csv' + pth = papiconf.main_path.parent / "var" / "datacenters_info.csv" if not pth.is_file(): return {} # Load datacenter info datacenters = {} - with open(pth, 'r') as f: - reader = csv.DictReader(f, delimiter=';') + with open(pth, "r") as f: + reader = csv.DictReader(f, delimiter=";") dc_keys = reader.fieldnames.copy() - dc_keys.remove('name') + dc_keys.remove("name") for row in reader: for k, v in row.items(): - if k == 'name': + if k == "name": name = v datacenters[name] = {k: 0 for k in dc_keys} - datacenters[name]['nodes'] = {} + datacenters[name]["nodes"] = {} else: datacenters[name][k] = float(v) @@ -185,7 +185,7 @@ def load_datacenters(): @cached(cache=TTLCache(maxsize=1024, ttl=30)) def get_cluster_stats( vo: str, - ): +): """ Returns the following stats of the nodes and the cluster (per resource type): * the aggregated usage @@ -201,45 +201,46 @@ def get_cluster_stats( cluster_stats = get_cluster_stats_bg() stats = copy.deepcopy(cluster_stats) - namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo] - - for k, v in stats['datacenters'].copy().items(): + namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo] + for k, v in stats["datacenters"].copy().items(): # Filter out nodes that do not support the given VO nodes = {} - for n_id, n_stats in v['nodes'].items(): - if namespace in n_stats['namespaces']: + for n_id, n_stats in v["nodes"].items(): + if namespace in n_stats["namespaces"]: nodes[n_id] = n_stats # Ignore datacenters with no nodes if not nodes: - del stats['datacenters'][k] + del stats["datacenters"][k] else: - stats['datacenters'][k]['nodes'] = nodes + stats["datacenters"][k]["nodes"] = nodes # Compute cluster stats after node filtering is done - for dc_stats in stats['datacenters'].values(): - for n_stats in dc_stats['nodes'].values(): + for dc_stats in stats["datacenters"].values(): + for 
n_stats in dc_stats["nodes"].values(): for k, v in n_stats.items(): - # Ignore keys - if k in ['name', 'namespaces', 'eligibility', 'status', 'tags']: + if k in ["name", "namespaces", "eligibility", "status", "tags"]: continue # Aggregate nested gpu_models dict - elif k == 'gpu_models': + elif k == "gpu_models": for k1, v1 in v.items(): - model_stats = stats['cluster']['gpu_models'].get( + model_stats = stats["cluster"]["gpu_models"].get( k1, - {'gpu_total': 0, 'gpu_used': 0,} # init value + { + "gpu_total": 0, + "gpu_used": 0, + }, # init value ) for k2, v2 in v1.items(): model_stats[k2] += v2 - stats['cluster']['gpu_models'][k1] = model_stats + stats["cluster"]["gpu_models"][k1] = model_stats # Aggregate other resources else: - stats['cluster'][k] += v + stats["cluster"][k] += v return stats @@ -253,132 +254,142 @@ def get_cluster_stats_bg(): """ resources = [ - 'jobs_num', - 'cpu_total', - 'cpu_used', - 'gpu_total', - 'gpu_used', - 'ram_total', - 'ram_used', - 'disk_total', - 'disk_used', + "jobs_num", + "cpu_total", + "cpu_used", + "gpu_total", + "gpu_used", + "ram_total", + "ram_used", + "disk_total", + "disk_used", ] - datacenters = load_datacenters() # available datacenters info + datacenters = load_datacenters() # available datacenters info stats = { - 'datacenters' : datacenters, # aggregated datacenter usage - 'cluster': {k: 0 for k in resources}, # aggregated cluster usage - } - stats['cluster']['gpu_models'] = {} + "datacenters": datacenters, # aggregated datacenter usage + "cluster": {k: 0 for k in resources}, # aggregated cluster usage + } + stats["cluster"]["gpu_models"] = {} # Load nodes nodes = Nomad.nodes.get_nodes(resources=True) - nodes_dc = {} # dict(node, datacenter) + nodes_dc = {} # dict(node, datacenter) # Get total stats for each node for n in nodes: - node = Nomad.node.get_node(n['ID']) + node = Nomad.node.get_node(n["ID"]) n_stats = {k: 0 for k in resources} - n_stats['name'] = node['Name'] - n_stats['eligibility'] = node['SchedulingEligibility'] - n_stats['cpu_total'] = int(node['Attributes']['cpu.numcores']) - n_stats['ram_total'] = int(node['Attributes']['memory.totalbytes']) / 2**20 - n_stats['disk_total'] = int(node['Attributes']['unique.storage.bytestotal']) / 2**20 - n_stats['gpu_models'] = {} - n_stats['namespaces'] = node['Meta'].get('namespace', '') - n_stats['status'] = node['Meta'].get('status', '') - n_stats['tags'] = node['Meta'].get('tags', '') - - if n['NodeResources']['Devices']: - for devices in n['NodeResources']['Devices']: - if devices['Type'] == 'gpu': - n_stats['gpu_total'] += len(devices['Instances']) + n_stats["name"] = node["Name"] + n_stats["eligibility"] = node["SchedulingEligibility"] + n_stats["cpu_total"] = int(node["Attributes"]["cpu.numcores"]) + n_stats["ram_total"] = int(node["Attributes"]["memory.totalbytes"]) / 2**20 + n_stats["disk_total"] = ( + int(node["Attributes"]["unique.storage.bytestotal"]) / 2**20 + ) + n_stats["gpu_models"] = {} + n_stats["namespaces"] = node["Meta"].get("namespace", "") + n_stats["status"] = node["Meta"].get("status", "") + n_stats["tags"] = node["Meta"].get("tags", "") + + if n["NodeResources"]["Devices"]: + for devices in n["NodeResources"]["Devices"]: + if devices["Type"] == "gpu": + n_stats["gpu_total"] += len(devices["Instances"]) # Track stats per GPU model type - if devices['Name'] not in n_stats['gpu_models'].keys(): - n_stats['gpu_models'][devices['Name']] = {'gpu_total': 0, 'gpu_used': 0} + if devices["Name"] not in n_stats["gpu_models"].keys(): + n_stats["gpu_models"][devices["Name"]] = 
{ + "gpu_total": 0, + "gpu_used": 0, + } - n_stats['gpu_models'][devices['Name']]['gpu_total'] += len(devices['Instances']) + n_stats["gpu_models"][devices["Name"]]["gpu_total"] += len( + devices["Instances"] + ) # If datacenter is not in csv, load default info - if n['Datacenter'] not in stats['datacenters']: - stats['datacenters'][n['Datacenter']] = {'lat':0, 'lon':0, 'PUE':0, 'energy_quality':0, 'nodes':{}} - - stats['datacenters'][n['Datacenter']]['nodes'][n['ID']] = n_stats - nodes_dc[n['ID']] = n['Datacenter'] + if n["Datacenter"] not in stats["datacenters"]: + stats["datacenters"][n["Datacenter"]] = { + "lat": 0, + "lon": 0, + "PUE": 0, + "energy_quality": 0, + "nodes": {}, + } + + stats["datacenters"][n["Datacenter"]]["nodes"][n["ID"]] = n_stats + nodes_dc[n["ID"]] = n["Datacenter"] # Get aggregated usage stats for each node - namespaces = ['default'] + list(papiconf.MAIN_CONF['nomad']['namespaces'].values()) + namespaces = ["default"] + list(papiconf.MAIN_CONF["nomad"]["namespaces"].values()) for namespace in namespaces: jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_='Status == "running"') for j in jobs: - # Retrieve full job for meta job = Nomad.job.get_job( - id_=j['ID'], + id_=j["ID"], namespace=namespace, - ) + ) allocs = Nomad.job.get_allocations( - id_=job['ID'], + id_=job["ID"], namespace=namespace, - ) + ) # Keep the proper allocation - a = Nomad.allocation.get_allocation( - get_proper_allocation(allocs) - ) + a = Nomad.allocation.get_allocation(get_proper_allocation(allocs)) # Add resources - datacenter = nodes_dc[a['NodeID']] - n_stats = stats['datacenters'][datacenter]['nodes'][a['NodeID']] + datacenter = nodes_dc[a["NodeID"]] + n_stats = stats["datacenters"][datacenter]["nodes"][a["NodeID"]] - #TODO: we are ignoring resources consumed by other jobs - if job['Name'].startswith('module') or job['Name'].startswith('tool'): - n_stats['jobs_num'] += 1 + # TODO: we are ignoring resources consumed by other jobs + if job["Name"].startswith("module") or job["Name"].startswith("tool"): + n_stats["jobs_num"] += 1 - #TODO: we are ignoring resources consumed by other tasks - if 'main' in a['AllocatedResources']['Tasks']: - res = a['AllocatedResources']['Tasks']['main'] + # TODO: we are ignoring resources consumed by other tasks + if "main" in a["AllocatedResources"]["Tasks"]: + res = a["AllocatedResources"]["Tasks"]["main"] # cpu - if res['Cpu']['ReservedCores']: - n_stats['cpu_used'] += len(res['Cpu']['ReservedCores']) + if res["Cpu"]["ReservedCores"]: + n_stats["cpu_used"] += len(res["Cpu"]["ReservedCores"]) # ram - n_stats['ram_used'] += res['Memory']['MemoryMB'] + n_stats["ram_used"] += res["Memory"]["MemoryMB"] # disk # Note: In theory we can get the total disk used in a node looking at the # metadata (ie. "unique.storage.bytesfree"). But that gave us the disk that # is actually used. But we are instead interested on the disk that is reserved # by users (regardless of whether they are actually using it). - n_stats['disk_used'] += a['AllocatedResources']['Shared']['DiskMB'] + n_stats["disk_used"] += a["AllocatedResources"]["Shared"]["DiskMB"] # gpu - if res['Devices']: - gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] - gpu_num = len(gpu['DeviceIDs']) if gpu else 0 + if res["Devices"]: + gpu = [d for d in res["Devices"] if d["Type"] == "gpu"][0] + gpu_num = len(gpu["DeviceIDs"]) if gpu else 0 # Sometimes the node fails and GPUs are not detected [1]. # In that case, avoid counting that GPU in the stats. 
# [1]: https://docs.ai4os.eu/en/latest/user/others/faq.html#my-gpu-just-disappeared-from-my-deployment - if n_stats['gpu_models']: - n_stats['gpu_used'] += gpu_num - n_stats['gpu_models'][gpu['Name']]['gpu_used'] += gpu_num + if n_stats["gpu_models"]: + n_stats["gpu_used"] += gpu_num + n_stats["gpu_models"][gpu["Name"]]["gpu_used"] += gpu_num else: continue # Keep ineligible nodes, but set (used=total) for all resources # We don't remove the node altogether because jobs might still be running there # and we want to show them in the stats - for datacenter in stats['datacenters'].values(): - for n_stats in datacenter['nodes'].values(): - if n_stats['eligibility'] == 'ineligible': - for r in ['cpu', 'gpu', 'ram', 'disk']: - n_stats[f'{r}_total'] = n_stats[f'{r}_used'] - for g_stats in n_stats['gpu_models'].values(): - g_stats['gpu_total'] = n_stats['gpu_used'] + for datacenter in stats["datacenters"].values(): + for n_stats in datacenter["nodes"].values(): + if n_stats["eligibility"] == "ineligible": + for r in ["cpu", "gpu", "ram", "disk"]: + n_stats[f"{r}_total"] = n_stats[f"{r}_used"] + for g_stats in n_stats["gpu_models"].values(): + g_stats["gpu_total"] = n_stats["gpu_used"] # Set the new shared variable global cluster_stats diff --git a/ai4papi/routers/v1/storage.py b/ai4papi/routers/v1/storage.py index 23a2b12..a067ac2 100644 --- a/ai4papi/routers/v1/storage.py +++ b/ai4papi/routers/v1/storage.py @@ -25,9 +25,9 @@ def storage_ls( vo: str, storage_name: str, - subpath: str = '', + subpath: str = "", authorization=Depends(security), - ): +): """ Returns a list of files/folders inside a given subpath of the specified storage. It is using RCLONE under-the-hood. @@ -41,19 +41,19 @@ def storage_ls( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Retrieve storage credentials if storage_name: # Retrieve the rclone credentials secrets = ai4secrets.get_secrets( vo=vo, - subpath='/services/storage/', + subpath="/services/storage/", authorization=types.SimpleNamespace( credentials=authorization.credentials, ), ) - storage = secrets[f'/services/storage/{storage_name}'] + storage = secrets[f"/services/storage/{storage_name}"] if not storage: raise HTTPException( status_code=401, @@ -61,21 +61,22 @@ def storage_ls( ) # Use rclone to list content of subpath - result = subprocess.run([ - f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && " - f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && " - "export RCLONE_CONFIG_RSHARE_TYPE=webdav && " - f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && " - f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && " - "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && " - f"rclone lsjson rshare:/{subpath} ;" - "status=$? 
;" # we want to return the status code of the rclone purge command - "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;" - "exit $status" + result = subprocess.run( + [ + f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && " + f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && " + "export RCLONE_CONFIG_RSHARE_TYPE=webdav && " + f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && " + f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && " + "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && " + f"rclone lsjson rshare:/{subpath} ;" + "status=$? ;" # we want to return the status code of the rclone purge command + "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;" + "exit $status" ], shell=True, capture_output=True, - text=True + text=True, ) # Check for possible errors @@ -102,7 +103,7 @@ def storage_rm( storage_name: str, subpath: str, authorization=Depends(security), - ): +): """ Deletes the files/folders inside a given subpath of the specified storage. It is using RCLONE under-the-hood. @@ -116,26 +117,26 @@ def storage_rm( """ # Retrieve authenticated user info auth_info = auth.get_user_info(token=authorization.credentials) - auth.check_vo_membership(vo, auth_info['vos']) + auth.check_vo_membership(vo, auth_info["vos"]) # Do not allow to delete root folder to prevent accidents - if not subpath.strip('/'): + if not subpath.strip("/"): raise HTTPException( status_code=400, detail="You cannot delete the root folder for security reasons.", - ) + ) # Retrieve storage credentials if storage_name: # Retrieve the rclone credentials secrets = ai4secrets.get_secrets( vo=vo, - subpath='/services/storage/', + subpath="/services/storage/", authorization=types.SimpleNamespace( credentials=authorization.credentials, ), ) - storage = secrets[f'/services/storage/{storage_name}'] + storage = secrets[f"/services/storage/{storage_name}"] if not storage: raise HTTPException( status_code=401, @@ -143,21 +144,22 @@ def storage_rm( ) # Use rclone to delete the subpath - result = subprocess.run([ - f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && " - f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && " - "export RCLONE_CONFIG_RSHARE_TYPE=webdav && " - f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && " - f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && " - "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && " - f"rclone purge rshare:/{subpath} ;" - "status=$? ;" # we want to return the status code of the rclone purge command - "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;" - "exit $status" + result = subprocess.run( + [ + f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && " + f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && " + "export RCLONE_CONFIG_RSHARE_TYPE=webdav && " + f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && " + f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && " + "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && " + f"rclone purge rshare:/{subpath} ;" + "status=$? 
;" # we want to return the status code of the rclone purge command + "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;" + "exit $status" ], shell=True, capture_output=True, - text=True + text=True, ) # Check for possible errors @@ -167,4 +169,4 @@ def storage_rm( detail=f"Error deleting the selected subpath from storage. \n\n {result.stderr}", ) - return {'status': 'success'} + return {"status": "success"} diff --git a/ai4papi/routers/v1/try_me/__init__.py b/ai4papi/routers/v1/try_me/__init__.py index 18169cb..7c6da08 100644 --- a/ai4papi/routers/v1/try_me/__init__.py +++ b/ai4papi/routers/v1/try_me/__init__.py @@ -6,5 +6,5 @@ router = fastapi.APIRouter() router.include_router( router=nomad.router, - prefix='/try_me', - ) + prefix="/try_me", +) diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py index 2d267fa..c79f41c 100644 --- a/ai4papi/routers/v1/try_me/nomad.py +++ b/ai4papi/routers/v1/try_me/nomad.py @@ -26,14 +26,14 @@ # (!) try-me jobs are always deployed in AI4EOSC VO = "vo.ai4eosc.eu" -NAMESPACE = papiconf.MAIN_CONF['nomad']['namespaces'][VO] +NAMESPACE = papiconf.MAIN_CONF["nomad"]["namespaces"][VO] @router.get("") def get_deployments( full_info: bool = Query(default=False), authorization=Depends(security), - ): +): """ Returns a list of all deployments belonging to a user. @@ -50,14 +50,14 @@ def get_deployments( # Retrieve all jobs in namespace jobs = nomad.get_deployments( namespace=NAMESPACE, - owner=auth_info['id'], - prefix='try', + owner=auth_info["id"], + prefix="try", ) user_jobs = [] for j in jobs: try: job_info = get_deployment( - deployment_uuid=j['ID'], + deployment_uuid=j["ID"], full_info=full_info, authorization=types.SimpleNamespace( credentials=authorization.credentials # token @@ -66,12 +66,12 @@ def get_deployments( except HTTPException: # not a try-me continue except Exception as e: # unexpected error - raise(e) + raise (e) user_jobs.append(job_info) # Sort deployments by creation date - seq = [j['submit_time'] for j in user_jobs] + seq = [j["submit_time"] for j in user_jobs] args = sorted(range(len(seq)), key=seq.__getitem__)[::-1] sorted_jobs = [user_jobs[i] for i in args] @@ -83,7 +83,7 @@ def get_deployment( deployment_uuid: str, full_info: bool = Query(default=True), authorization=Depends(security), - ): +): """ This function is used mainly to be able to retrieve the endpoint of the try_me job. We cannot return the endpoint when creating the job, because the final endpoint will @@ -100,12 +100,12 @@ def get_deployment( job = nomad.get_deployment( deployment_uuid=deployment_uuid, namespace=NAMESPACE, - owner=auth_info['id'], + owner=auth_info["id"], full_info=full_info, ) # Rewrite main endpoint, otherwise it automatically selects DEEPaaS API - job['main_endpoint'] = 'ui' + job["main_endpoint"] = "ui" return job @@ -115,7 +115,7 @@ def create_deployment( module_name: str, title: str = Query(default=""), authorization=Depends(security), - ): +): """ Submit a try-me deployment to Nomad. The deployment will automatically kill himself after a short amount of time. 
@@ -127,11 +127,11 @@ def create_deployment( # Retrieve docker_image from module_name meta = Modules.get_metadata(module_name) - registry = meta['links']['docker_image'] - docker_image = '/'.join(registry.split('/')[-2:]) + registry = meta["links"]["docker_image"] + docker_image = "/".join(registry.split("/")[-2:]) # Load module configuration - nomad_conf = deepcopy(papiconf.TRY_ME['nomad']) + nomad_conf = deepcopy(papiconf.TRY_ME["nomad"]) # Generate UUID from (MAC address+timestamp) so it's unique job_uuid = uuid.uuid1() @@ -139,15 +139,15 @@ def create_deployment( # Replace the Nomad job template nomad_conf = nomad_conf.safe_substitute( { - 'JOB_UUID': job_uuid, - 'NAMESPACE': NAMESPACE, - 'TITLE': title[:45], - 'OWNER': auth_info['id'], - 'OWNER_NAME': auth_info['name'], - 'OWNER_EMAIL': auth_info['email'], - 'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain'][VO], - 'HOSTNAME': job_uuid, - 'DOCKER_IMAGE': docker_image, + "JOB_UUID": job_uuid, + "NAMESPACE": NAMESPACE, + "TITLE": title[:45], + "OWNER": auth_info["id"], + "OWNER_NAME": auth_info["name"], + "OWNER_EMAIL": auth_info["email"], + "BASE_DOMAIN": papiconf.MAIN_CONF["lb"]["domain"][VO], + "HOSTNAME": job_uuid, + "DOCKER_IMAGE": docker_image, } ) @@ -158,39 +158,42 @@ def create_deployment( # these jobs cannot be left queueing # We check for every resource metric (cpu, disk, ram) stats = get_cluster_stats(vo=VO) - resources = ['cpu', 'ram', 'disk'] + resources = ["cpu", "ram", "disk"] keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources] status = {k: 0 for k in keys} - for _, datacenter in stats['datacenters'].items(): - for _, node in datacenter['nodes'].items(): - if 'tryme' in node['tags'] and node['status'] == 'ready': + for _, datacenter in stats["datacenters"].items(): + for _, node in datacenter["nodes"].items(): + if "tryme" in node["tags"] and node["status"] == "ready": for k in keys: status[k] += node[k] for r in resources: - if status[f"{r}_total"] == 0 or status[f"{r}_used"] / status[f"{r}_total"] > 0.85: + if ( + status[f"{r}_total"] == 0 + or status[f"{r}_used"] / status[f"{r}_total"] > 0.85 + ): # We cut of somehow earlier than 100% because we are only accounting for # cores consumed in "main" task. But UI task is also consuming resources. raise HTTPException( status_code=503, - detail="Sorry, but there seem to be no resources available right " \ - "now to test the module. Please try later.", - ) + detail="Sorry, but there seem to be no resources available right " + "now to test the module. Please try later.", + ) # Check that the user hasn't too many "try-me" jobs currently running jobs = nomad.get_deployments( namespace=NAMESPACE, - owner=auth_info['id'], + owner=auth_info["id"], prefix="try", ) if len(jobs) >= 3: raise HTTPException( status_code=503, - detail="Sorry, but you seem to be currently running 3 `try-me` environments already. " \ - "Before launching a new one, you will need to wait till one of your " \ - "existing environments gets automatically deleted (ca. 10 min) or delete it manually " \ - "in the Dashboard." - ) + detail="Sorry, but you seem to be currently running 3 `try-me` environments already. " + "Before launching a new one, you will need to wait till one of your " + "existing environments gets automatically deleted (ca. 
10 min) or delete it manually " + "in the Dashboard.", + ) # Submit job r = nomad.create_deployment(nomad_conf) @@ -202,7 +205,7 @@ def create_deployment( def delete_deployment( deployment_uuid: str, authorization=Depends(security), - ): +): """ Delete a deployment. Users can only delete their own deployments. @@ -219,7 +222,7 @@ def delete_deployment( r = nomad.delete_deployment( deployment_uuid=deployment_uuid, namespace=NAMESPACE, - owner=auth_info['id'], + owner=auth_info["id"], ) return r diff --git a/ai4papi/utils.py b/ai4papi/utils.py index 563ab20..2f2a46d 100644 --- a/ai4papi/utils.py +++ b/ai4papi/utils.py @@ -1,6 +1,7 @@ """ Miscellaneous utils """ + from datetime import datetime import json from pathlib import Path @@ -18,7 +19,7 @@ session = requests.Session() # Retrieve tokens for better rate limit -github_token = os.environ.get('PAPI_GITHUB_TOKEN', None) +github_token = os.environ.get("PAPI_GITHUB_TOKEN", None) def update_values_conf(submitted, reference): @@ -27,13 +28,11 @@ def update_values_conf(submitted, reference): We also check that the submitted conf has the appropriate keys. """ for k in submitted.keys(): - # Check level 1 keys if k not in reference.keys(): raise HTTPException( - status_code=400, - detail=f"The key `{k}` in not a valid parameter." - ) + status_code=400, detail=f"The key `{k}` in not a valid parameter." + ) # Check level 2 keys s1 = set(submitted[k].keys()) @@ -41,9 +40,8 @@ def update_values_conf(submitted, reference): subs = s1.difference(s2) if subs: raise HTTPException( - status_code=400, - detail=f"The keys `{subs}` are not a valid parameters." - ) + status_code=400, detail=f"The keys `{subs}` are not a valid parameters." + ) # Update with user values reference[k].update(submitted[k]) @@ -57,42 +55,44 @@ def validate_conf(conf): """ # Check that the Dockerhub image belongs either to "deephdc" or "ai4oshub" # or that it points to our Harbor instance (eg. CVAT) - image = conf.get('general', {}).get('docker_image') + image = conf.get("general", {}).get("docker_image") if image: - if image.split('/')[0] not in ["deephdc", "ai4oshub", "registry.services.ai4os.eu"]: + if image.split("/")[0] not in [ + "deephdc", + "ai4oshub", + "registry.services.ai4os.eu", + ]: raise HTTPException( status_code=400, detail="The docker image should belong to either 'deephdc' or 'ai4oshub' \ - DockerHub organizations or be hosted in the project's Harbor." - ) + DockerHub organizations or be hosted in the project's Harbor.", + ) # Check datasets_info list - datasets = conf.get('storage', {}).get('datasets') + datasets = conf.get("storage", {}).get("datasets") if datasets: for d in datasets: - # Validate DOI and URL # ref: https://stackoverflow.com/a/48524047/18471590 doiPattern = r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$" urlPattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" - if not (re.match(doiPattern, d['doi'], re.IGNORECASE) or re.match(urlPattern, d['doi'], re.IGNORECASE)): - raise HTTPException( - status_code=400, - detail="Invalid DOI or URL." - ) + if not ( + re.match(doiPattern, d["doi"], re.IGNORECASE) + or re.match(urlPattern, d["doi"], re.IGNORECASE) + ): + raise HTTPException(status_code=400, detail="Invalid DOI or URL.") # Check force pull parameter - if not isinstance(d['force_pull'], bool): + if not isinstance(d["force_pull"], bool): raise HTTPException( - status_code=400, - detail="Force pull should be bool." - ) + status_code=400, detail="Force pull should be bool." 
+ ) return conf -#TODO: temporarily parse every 24hrs (instead of 6hrs) to reduce a bit the latency -@cached(cache=TTLCache(maxsize=1024, ttl=24*60*60)) +# TODO: temporarily parse every 24hrs (instead of 6hrs) to reduce a bit the latency +@cached(cache=TTLCache(maxsize=1024, ttl=24 * 60 * 60)) def get_github_info(owner, repo): """ Retrieve information from a Github repo @@ -100,31 +100,39 @@ def get_github_info(owner, repo): # Avoid running this function if were are doing local development, because # repeatedly calling the Github API will otherwise get you blocked if papiconf.IS_DEV: - print('[info] Skipping Github API info fetching (development).') + print("[info] Skipping Github API info fetching (development).") return {} # Retrieve information from Github API url = f"https://api.github.com/repos/{owner}/{repo}" - headers = {'Authorization': f'token {github_token}'} if github_token else {} + headers = {"Authorization": f"token {github_token}"} if github_token else {} r = session.get(url, headers=headers) # Parse the information out = {} if r.ok: repo_data = r.json() - out['created'] = datetime.strptime( - repo_data['created_at'], - "%Y-%m-%dT%H:%M:%SZ", - ).date().strftime("%Y-%m-%d") # keep only the date - out['updated'] = datetime.strptime( - repo_data['updated_at'], - "%Y-%m-%dT%H:%M:%SZ", - ).date().strftime("%Y-%m-%d") - out['license'] = (repo_data['license'] or {}).get('spdx_id', '') + out["created"] = ( + datetime.strptime( + repo_data["created_at"], + "%Y-%m-%dT%H:%M:%SZ", + ) + .date() + .strftime("%Y-%m-%d") + ) # keep only the date + out["updated"] = ( + datetime.strptime( + repo_data["updated_at"], + "%Y-%m-%dT%H:%M:%SZ", + ) + .date() + .strftime("%Y-%m-%d") + ) + out["license"] = (repo_data["license"] or {}).get("spdx_id", "") # out['stars'] = repo_data['stargazers_count'] else: msg = "API rate limit exceeded" if r.status_code == 403 else "" - print(f' [Error] Failed to parse Github repo info: {msg}') + print(f" [Error] Failed to parse Github repo info: {msg}") return out @@ -132,7 +140,7 @@ def get_github_info(owner, repo): @cached(cache=LRUCache(maxsize=20)) def retrieve_from_snapshots( deployment_uuid: str, - ): +): """ Retrieve the deployment info from Nomad periodic snapshots. @@ -143,31 +151,31 @@ def retrieve_from_snapshots( Anyway, not a big concern because this function is not meant to be called very frequently and latency from reading JSONs is very small. 
""" - main_dir = os.environ.get('ACCOUNTING_PTH', None) + main_dir = os.environ.get("ACCOUNTING_PTH", None) if not main_dir: raise HTTPException( status_code=500, detail="Accounting repo with snapshots not available.", - ) - snapshot_dir = Path(main_dir) / 'snapshots' + ) + snapshot_dir = Path(main_dir) / "snapshots" # Iterate over snapshots, from recent to old - for snapshot_pth in sorted(snapshot_dir.glob('**/*.json'))[::-1]: - + for snapshot_pth in sorted(snapshot_dir.glob("**/*.json"))[::-1]: # Load the snapshot - with open(snapshot_pth, 'r') as f: + with open(snapshot_pth, "r") as f: snapshot = json.load(f) # Iterate over deployments until we find the correct one for namespace, jobs in snapshot.items(): for job in jobs: - if (job['job_ID'] == deployment_uuid) and (job['status'] == 'running'): - job['namespace'] = namespace - job['alloc_end'] = f'{snapshot_pth.stem}0000Z' # the end date is approximate (true value lies between this snapshot date and next one) + if (job["job_ID"] == deployment_uuid) and (job["status"] == "running"): + job["namespace"] = namespace + job["alloc_end"] = ( + f"{snapshot_pth.stem}0000Z" # the end date is approximate (true value lies between this snapshot date and next one) + ) return job # If no deployment found, show error raise HTTPException( - status_code=404, - detail="Could not find the deployment in the database." - ) + status_code=404, detail="Could not find the deployment in the database." + ) diff --git a/requirements.txt b/requirements.txt index 18bf410..f15d0ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ pydantic >= 2.5.2, <= 2.9.2 natsort >= 8.1.0, < 9.0 ai4_metadata >= 2.0.2, < 3.0 harborapi == 0.25.3 +pre-commit >= 4.0.1, <= 5.0 diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..efffbb7 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,8 @@ +line-length = 88 + +[format] +quote-style = "double" + +[lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] +"tests/main.py" = ["E402", "F401"] diff --git a/setup.py b/setup.py index f3e0709..10256ae 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,4 @@ import setuptools -setuptools.setup( - setup_requires=['pbr>=5.3.0'], - pbr=True) +setuptools.setup(setup_requires=["pbr>=5.3.0"], pbr=True) diff --git a/tests/catalog/modules.py b/tests/catalog/modules.py index 83f15e0..702bb20 100644 --- a/tests/catalog/modules.py +++ b/tests/catalog/modules.py @@ -8,18 +8,18 @@ modules_list = list(Modules.get_items().keys()) assert isinstance(modules_list, list) -assert 'dogs-breed-detector' in modules_list -assert 'ai4os-federated-server' not in modules_list +assert "dogs-breed-detector" in modules_list +assert "ai4os-federated-server" not in modules_list # List filtered modules modules_list2 = Modules.get_filtered_list( - tags=('development',), + tags=("development",), tags_any=None, not_tags=None, not_tags_any=None, ) assert isinstance(modules_list2, list) -assert 'ai4os-dev-env' in modules_list2 +assert "ai4os-dev-env" in modules_list2 # Get modules summaries modules_sum = Modules.get_summary( @@ -41,28 +41,28 @@ # Get module config module_conf = Modules.get_config( item_name=module_name, - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", ) assert isinstance(module_conf, dict) -assert 'general' in module_conf.keys() +assert "general" in module_conf.keys() # Get module metadata module_meta = Modules.get_metadata( item_name=module_name, ) assert isinstance(module_meta, dict) -assert 'title' in module_meta.keys() +assert "title" in module_meta.keys() # Refresh metadata cache 
-common.JENKINS_TOKEN = '1234' +common.JENKINS_TOKEN = "1234" module_meta = Modules.refresh_metadata_cache_entry( item_name=module_name, authorization=SimpleNamespace( - credentials='1234', + credentials="1234", ), ) assert isinstance(module_meta, dict) -#TODO: we should not be able to get config or metadata for a tool_name +# TODO: we should not be able to get config or metadata for a tool_name -print('Catalog (modules) tests passed!') +print("Catalog (modules) tests passed!") diff --git a/tests/catalog/tools.py b/tests/catalog/tools.py index 0666c05..d50ff6f 100644 --- a/tests/catalog/tools.py +++ b/tests/catalog/tools.py @@ -7,32 +7,32 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! \n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) # List tools tools_list = list(Tools.get_items().keys()) assert isinstance(tools_list, list) -assert 'ai4os-federated-server' in tools_list -assert 'dogs-breed-detector' not in tools_list +assert "ai4os-federated-server" in tools_list +assert "dogs-breed-detector" not in tools_list # List filtered tools tools_list2 = Tools.get_filtered_list( - tags=('docker',), + tags=("docker",), tags_any=None, not_tags=None, not_tags_any=None, ) assert isinstance(tools_list2, list) -assert 'ai4os-federated-server' in tools_list +assert "ai4os-federated-server" in tools_list # Get tools summaries tools_sum = Tools.get_summary( @@ -52,34 +52,33 @@ # Contrary than for modules, we do this for all tools because tool configurations are # particular for each tool for tool_name in tools_list: - - print(f' - Testing {tool_name}') + print(f" - Testing {tool_name}") # Get tool config tool_conf = Tools.get_config( item_name=tool_name, - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", ) assert isinstance(tool_conf, dict) - assert 'general' in tool_conf.keys() + assert "general" in tool_conf.keys() # Get tool metadata tool_meta = Tools.get_metadata( item_name=tool_name, ) assert isinstance(tool_meta, dict) - assert 'title' in tool_meta.keys() + assert "title" in tool_meta.keys() # Refresh metadata cache -common.JENKINS_TOKEN = '1234' +common.JENKINS_TOKEN = "1234" module_meta = Tools.refresh_metadata_cache_entry( item_name=tool_name, authorization=SimpleNamespace( - credentials='1234', + credentials="1234", ), ) assert isinstance(module_meta, dict) -#TODO: we should not be able to get config or metadata for a module_name +# TODO: we should not be able to get config or metadata for a module_name -print('Catalog (tools) tests passed!') +print("Catalog (tools) tests passed!") diff --git a/tests/deployments/modules.py b/tests/deployments/modules.py index 1f192f3..af21813 100644 --- a/tests/deployments/modules.py +++ b/tests/deployments/modules.py @@ -8,52 +8,46 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! 
\n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) # Create module rcreate = modules.create_deployment( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", conf={}, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rcreate, dict) -assert 'job_ID' in rcreate.keys() +assert "job_ID" in rcreate.keys() time.sleep(0.2) # Nomad takes some time to allocate deployment # Retrieve that module rdep = modules.get_deployment( - vo='vo.ai4eosc.eu', - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdep, dict) -assert 'job_ID' in rdep.keys() -assert rdep['job_ID']==rcreate['job_ID'] -assert rdep['status']!='error' +assert "job_ID" in rdep.keys() +assert rdep["job_ID"] == rcreate["job_ID"] +assert rdep["status"] != "error" # Retrieve all modules rdeps = modules.get_deployments( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdeps, list) -assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps]) -assert all([d['job_ID']!='error' for d in rdeps]) +assert any([d["job_ID"] == rcreate["job_ID"] for d in rdeps]) +assert all([d["job_ID"] != "error" for d in rdeps]) # Check that we cannot retrieve that module from tools # This should break! @@ -67,44 +61,36 @@ # Check that we cannot retrieve that module from tools list rdeps2 = tools.get_deployments( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdeps2, list) -assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps2]) +assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps2]) # Delete module rdel = modules.delete_deployment( - vo='vo.ai4eosc.eu', - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdel, dict) -assert 'status' in rdel.keys() +assert "status" in rdel.keys() time.sleep(3) # Nomad takes some time to delete # Check module no longer exists rdeps3 = modules.get_deployments( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) -assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3]) +assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps3]) # Check that we are able to retrieve info from Nomad snapshots (provenance) -modules.provenance_token = '1234' +modules.provenance_token = "1234" r_prov = modules.get_deployment( - vo='', - deployment_uuid='de0599d6-a1b9-11ef-b98d-0242ac120005', - authorization=SimpleNamespace( - credentials='1234' - ), + vo="", + deployment_uuid="de0599d6-a1b9-11ef-b98d-0242ac120005", + authorization=SimpleNamespace(credentials="1234"), ) -print('Deployments (modules) tests passed!') +print("Deployments (modules) tests passed!") diff --git a/tests/deployments/tools.py b/tests/deployments/tools.py index ade3da3..30bf052 100644 --- a/tests/deployments/tools.py +++ b/tests/deployments/tools.py 
@@ -8,55 +8,49 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! \n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) -print(' Testing FL server') +print(" Testing FL server") # Create tool rcreate = tools.create_deployment( - vo='vo.ai4eosc.eu', - tool_name='ai4os-federated-server', + vo="vo.ai4eosc.eu", + tool_name="ai4os-federated-server", conf={}, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rcreate, dict) -assert 'job_ID' in rcreate.keys() +assert "job_ID" in rcreate.keys() time.sleep(0.2) # Nomad takes some time to allocate deployment # Retrieve that tool rdep = tools.get_deployment( - vo='vo.ai4eosc.eu', - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdep, dict) -assert 'job_ID' in rdep.keys() -assert rdep['job_ID']==rcreate['job_ID'] -assert rdep['status']!='error' +assert "job_ID" in rdep.keys() +assert rdep["job_ID"] == rcreate["job_ID"] +assert rdep["status"] != "error" # Retrieve all tools rdeps = tools.get_deployments( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdeps, list) -assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps]) -assert all([d['job_ID']!='error' for d in rdeps]) +assert any([d["job_ID"] == rcreate["job_ID"] for d in rdeps]) +assert all([d["job_ID"] != "error" for d in rdeps]) # Check that we cannot retrieve that tool from modules # This should break! 
@@ -70,79 +64,69 @@ # Check that we cannot retrieve that tool from modules list rdeps2 = modules.get_deployments( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdeps2, list) -assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps2]) +assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps2]) # Delete tool rdel = tools.delete_deployment( - vo='vo.ai4eosc.eu', - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdel, dict) -assert 'status' in rdel.keys() +assert "status" in rdel.keys() time.sleep(3) # Nomad takes some time to delete # Check tool no longer exists rdeps3 = tools.get_deployments( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) -assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3]) +assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps3]) ############################################################ # Additionally test simply the creation of the other tools # ############################################################ -print(' Testing CVAT') +print(" Testing CVAT") # Create tool rcreate = tools.create_deployment( - vo='vo.ai4eosc.eu', - tool_name='ai4os-cvat', + vo="vo.ai4eosc.eu", + tool_name="ai4os-cvat", conf={ - 'general':{ - 'title': 'CVAT test', - 'cvat_username': 'mock_user', - 'cvat_password': 'mock_password', + "general": { + "title": "CVAT test", + "cvat_username": "mock_user", + "cvat_password": "mock_password", + }, + "storage": { + "rclone_conf": "/srv/.rclone/rclone.conf", + "rclone_url": "https://share.services.ai4os.eu/remote.php/webdav", + "rclone_vendor": "nextcloud", + "rclone_user": "mock_user", + "rclone_password": "mock_password", }, - 'storage': { - 'rclone_conf': '/srv/.rclone/rclone.conf', - 'rclone_url': 'https://share.services.ai4os.eu/remote.php/webdav', - 'rclone_vendor': 'nextcloud', - 'rclone_user': 'mock_user', - 'rclone_password': 'mock_password', - } }, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rcreate, dict) -assert 'job_ID' in rcreate.keys() -assert rdep['status']!='error' +assert "job_ID" in rcreate.keys() +assert rdep["status"] != "error" time.sleep(0.2) # Nomad takes some time to allocate deployment # Delete tool rdel = tools.delete_deployment( - vo='vo.ai4eosc.eu', - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdel, dict) -assert 'status' in rdel.keys() +assert "status" in rdel.keys() -print('Deployments (tools) tests passed!') +print("Deployments (tools) tests passed!") diff --git a/tests/inference/oscar.py b/tests/inference/oscar.py index 17373b7..4f02b3c 100644 --- a/tests/inference/oscar.py +++ b/tests/inference/oscar.py @@ -5,69 +5,59 @@ # Retrieve EGI token (not generated on the fly in case the are rate limitng issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please 
remember to set a token as ENV variable before executing \ the tests! \n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin-demo)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) # Test service service = oscar.Service( - image='deephdc/deep-oc-image-classification-tf', + image="deephdc/deep-oc-image-classification-tf", cpu=2, ) # Create service sname = oscar.create_service( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", svc_conf=service, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) # Check service exists slist = oscar.get_services_list( - vo='vo.ai4eosc.eu', - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + authorization=SimpleNamespace(credentials=token), ) -names = [s['name'] for s in slist] +names = [s["name"] for s in slist] assert sname in names, "Service does not exist" # Update service service.cpu = 1 oscar.update_service( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", service_name=sname, svc_conf=service, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) # Delete the service oscar.delete_service( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", service_name=sname, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) # Check service does not longer exist slist = oscar.get_services_list( - vo='vo.ai4eosc.eu', - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + authorization=SimpleNamespace(credentials=token), ) -names = [s['name'] for s in slist] +names = [s["name"] for s in slist] assert sname not in names, "Service exists" -print('Inference (OSCAR) tests passed!') +print("Inference (OSCAR) tests passed!") diff --git a/tests/main.py b/tests/main.py index 3605e32..538d685 100644 --- a/tests/main.py +++ b/tests/main.py @@ -7,9 +7,9 @@ Nomad (ie. 
after launching) """ -#TODO: move to proper testing package -#TODO: rename test script: modules --> test_modules -#TODO: add spinners +# TODO: move to proper testing package +# TODO: rename test script: modules --> test_modules +# TODO: add spinners import ai4papi.conf as papiconf diff --git a/tests/routes.py b/tests/routes.py index 4635316..f0cccf7 100644 --- a/tests/routes.py +++ b/tests/routes.py @@ -8,41 +8,43 @@ # Check routes routes = [(r.path, r.methods) for r in app.routes] -for collection in ['modules', 'tools']: - - assert (f'/v1/catalog/{collection}', {'GET'}) in routes - assert (f'/v1/catalog/{collection}/detail', {'GET'}) in routes - assert (f'/v1/catalog/{collection}/tags', {'GET'}) in routes - assert (f'/v1/catalog/{collection}/' + '{item_name}/config', {'GET'}) in routes - assert (f'/v1/catalog/{collection}/' + '{item_name}/metadata', {'GET'}) in routes - - assert (f'/v1/deployments/{collection}', {'GET'}) in routes - assert (f'/v1/deployments/{collection}', {'POST'}) in routes - assert (f'/v1/deployments/{collection}/' + '{deployment_uuid}', {'GET'}) in routes - assert (f'/v1/deployments/{collection}/' + '{deployment_uuid}', {'DELETE'}) in routes - - -assert ('/v1/datasets/zenodo', {'POST'}) in routes - -assert ('/v1/inference/oscar/cluster', {'GET'}) in routes -assert ('/v1/inference/oscar/services', {'GET'}) in routes -assert ('/v1/inference/oscar/services', {'POST'}) in routes -assert ('/v1/inference/oscar/services/{service_name}', {'GET'}) in routes -assert ('/v1/inference/oscar/services/{service_name}', {'PUT'}) in routes -assert ('/v1/inference/oscar/services/{service_name}', {'DELETE'}) in routes - -assert ('/v1/secrets', {'GET'}) in routes -assert ('/v1/secrets', {'POST'}) in routes -assert ('/v1/secrets', {'DELETE'}) in routes - -assert ('/v1/deployments/stats/user', {'GET'}) in routes -assert ('/v1/deployments/stats/cluster', {'GET'}) in routes - -assert ('/v1/try_me/nomad', {'POST'}) in routes -assert ('/v1/try_me/nomad', {'GET'}) in routes -assert ('/v1/try_me/nomad/{deployment_uuid}', {'GET'}) in routes -assert ('/v1/try_me/nomad/{deployment_uuid}', {'DELETE'}) in routes - -assert ('/v1/storage/{storage_name}/ls', {'GET'}) in routes - -print('Checks for API routes passed!') +for collection in ["modules", "tools"]: + assert (f"/v1/catalog/{collection}", {"GET"}) in routes + assert (f"/v1/catalog/{collection}/detail", {"GET"}) in routes + assert (f"/v1/catalog/{collection}/tags", {"GET"}) in routes + assert (f"/v1/catalog/{collection}/" + "{item_name}/config", {"GET"}) in routes + assert (f"/v1/catalog/{collection}/" + "{item_name}/metadata", {"GET"}) in routes + + assert (f"/v1/deployments/{collection}", {"GET"}) in routes + assert (f"/v1/deployments/{collection}", {"POST"}) in routes + assert (f"/v1/deployments/{collection}/" + "{deployment_uuid}", {"GET"}) in routes + assert ( + f"/v1/deployments/{collection}/" + "{deployment_uuid}", + {"DELETE"}, + ) in routes + + +assert ("/v1/datasets/zenodo", {"POST"}) in routes + +assert ("/v1/inference/oscar/cluster", {"GET"}) in routes +assert ("/v1/inference/oscar/services", {"GET"}) in routes +assert ("/v1/inference/oscar/services", {"POST"}) in routes +assert ("/v1/inference/oscar/services/{service_name}", {"GET"}) in routes +assert ("/v1/inference/oscar/services/{service_name}", {"PUT"}) in routes +assert ("/v1/inference/oscar/services/{service_name}", {"DELETE"}) in routes + +assert ("/v1/secrets", {"GET"}) in routes +assert ("/v1/secrets", {"POST"}) in routes +assert ("/v1/secrets", {"DELETE"}) in routes + 
+assert ("/v1/deployments/stats/user", {"GET"}) in routes +assert ("/v1/deployments/stats/cluster", {"GET"}) in routes + +assert ("/v1/try_me/nomad", {"POST"}) in routes +assert ("/v1/try_me/nomad", {"GET"}) in routes +assert ("/v1/try_me/nomad/{deployment_uuid}", {"GET"}) in routes +assert ("/v1/try_me/nomad/{deployment_uuid}", {"DELETE"}) in routes + +assert ("/v1/storage/{storage_name}/ls", {"GET"}) in routes + +print("Checks for API routes passed!") diff --git a/tests/test_launch.py b/tests/test_launch.py index 6331730..ad17b0e 100644 --- a/tests/test_launch.py +++ b/tests/test_launch.py @@ -11,10 +11,10 @@ server_process = subprocess.Popen( - ['uvicorn', 'ai4papi.main:app', '--host', '0.0.0.0', '--port', '8080'], + ["uvicorn", "ai4papi.main:app", "--host", "0.0.0.0", "--port", "8080"], stdout=subprocess.DEVNULL, - stderr = subprocess.DEVNULL, - ) + stderr=subprocess.DEVNULL, +) time.sleep(15) # wait for PAPI to start try: diff --git a/tests/test_secrets.py b/tests/test_secrets.py index f3ea026..48e12ff 100644 --- a/tests/test_secrets.py +++ b/tests/test_secrets.py @@ -6,55 +6,47 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! \n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) -SECRET_PATH = '/demo-papi-tests/demo-secret' -SECRET_DATA = {'pwd': 12345} +SECRET_PATH = "/demo-papi-tests/demo-secret" +SECRET_DATA = {"pwd": 12345} # Create secret r = secrets.create_secret( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", secret_path=SECRET_PATH, secret_data=SECRET_DATA, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) # Check that secret is in list r = secrets.get_secrets( - vo='vo.ai4eosc.eu', - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + authorization=SimpleNamespace(credentials=token), ) assert SECRET_PATH in r.keys() assert r[SECRET_PATH] == SECRET_DATA # Delete r = secrets.delete_secret( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", secret_path=SECRET_PATH, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) # Check that secret is no longer in list r = secrets.get_secrets( - vo='vo.ai4eosc.eu', - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + authorization=SimpleNamespace(credentials=token), ) assert SECRET_PATH not in r.keys() -print('Secrets tests passed!') +print("Secrets tests passed!") diff --git a/tests/test_snapshots.py b/tests/test_snapshots.py index 0d71c1b..84fa5c0 100644 --- a/tests/test_snapshots.py +++ b/tests/test_snapshots.py @@ -8,85 +8,73 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! 
\n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) # Create Nomad deployment njob = modules.create_deployment( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", conf={}, - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) assert isinstance(njob, dict) -assert 'job_ID' in njob.keys() +assert "job_ID" in njob.keys() time.sleep(60) # Make snapshot of that module created = snapshots.create_snapshot( - vo='vo.ai4eosc.eu', - deployment_uuid=njob['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=njob["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(created, dict) -assert 'snapshot_ID' in created.keys() +assert "snapshot_ID" in created.keys() time.sleep(10) # Retrieve all snapshots retrieved = snapshots.get_snapshots( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(retrieved, list) -assert any([d['snapshot_ID']==created['snapshot_ID'] for d in retrieved]) -#TODO: waiting 10s the snapshot is still probably queued in Nomad, we should wait more if we want to test also Harbor +assert any([d["snapshot_ID"] == created["snapshot_ID"] for d in retrieved]) +# TODO: waiting 10s the snapshot is still probably queued in Nomad, we should wait more if we want to test also Harbor # Delete snapshot deleted = snapshots.delete_snapshot( - vo='vo.ai4eosc.eu', - snapshot_uuid=created['snapshot_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + snapshot_uuid=created["snapshot_ID"], + authorization=SimpleNamespace(credentials=token), ) time.sleep(10) # it takes some time to delete assert isinstance(deleted, dict) -assert 'status' in deleted.keys() +assert "status" in deleted.keys() # Check snapshot no longer exists retrieved2 = snapshots.get_snapshots( - vos=['vo.ai4eosc.eu'], - authorization=SimpleNamespace( - credentials=token - ), + vos=["vo.ai4eosc.eu"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(retrieved, list) -assert not any([d['snapshot_ID']==created['snapshot_ID'] for d in retrieved2]) +assert not any([d["snapshot_ID"] == created["snapshot_ID"] for d in retrieved2]) # Delete deployment ndel = modules.delete_deployment( - vo='vo.ai4eosc.eu', - deployment_uuid=njob['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + deployment_uuid=njob["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(ndel, dict) -assert 'status' in ndel.keys() +assert "status" in ndel.keys() -print('Snapshot tests passed!') +print("Snapshot tests passed!") diff --git a/tests/test_stats.py b/tests/test_stats.py index 43ad934..e8d20e8 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -6,33 +6,31 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! 
\n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) -SECRET_PATH = '/demo-papi-tests/demo-secret' -SECRET_DATA = {'pwd': 12345} +SECRET_PATH = "/demo-papi-tests/demo-secret" +SECRET_DATA = {"pwd": 12345} # Retrieve user stats r = stats.deployments.get_user_stats( - vo='vo.ai4eosc.eu', - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + authorization=SimpleNamespace(credentials=token), ) -assert r, 'User stats dict is empty' +assert r, "User stats dict is empty" # Retrieve cluster stats _ = stats.deployments.get_cluster_stats_bg() r = stats.deployments.get_cluster_stats( - vo='vo.ai4eosc.eu', + vo="vo.ai4eosc.eu", ) -assert r, 'Cluster stats dict is empty' +assert r, "Cluster stats dict is empty" -print('Stats tests passed!') +print("Stats tests passed!") diff --git a/tests/test_storage.py b/tests/test_storage.py index 11c8e28..48523ac 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -6,23 +6,21 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! \n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' - ) + ) r = storage.storage_ls( - vo='vo.ai4eosc.eu', - storage_name='share.services.ai4os.eu', - subpath='ai4os-storage', - authorization=SimpleNamespace( - credentials=token - ), + vo="vo.ai4eosc.eu", + storage_name="share.services.ai4os.eu", + subpath="ai4os-storage", + authorization=SimpleNamespace(credentials=token), ) -print('Storage tests passed!') +print("Storage tests passed!") diff --git a/tests/try_me/test_nomad.py b/tests/try_me/test_nomad.py index 65d3a07..b492e55 100644 --- a/tests/try_me/test_nomad.py +++ b/tests/try_me/test_nomad.py @@ -7,64 +7,54 @@ # Retrieve EGI token (not generated on the fly in case the are rate limiting issues # if too many queries) -token = os.getenv('TMP_EGI_TOKEN') +token = os.getenv("TMP_EGI_TOKEN") if not token: raise Exception( -'Please remember to set a token as ENV variable before executing \ + 'Please remember to set a token as ENV variable before executing \ the tests! \n\n \ export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \ If running from VScode make sure to launch `code` from that terminal so it can access \ that ENV variable.' 
- ) + ) # Create deployment rcreate = nomad.create_deployment( module_name="ai4os-demo-app", title="PAPI tests", - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rcreate, dict) -assert 'job_ID' in rcreate.keys() +assert "job_ID" in rcreate.keys() # Retrieve that deployment rdep = nomad.get_deployment( - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdep, dict) -assert 'job_ID' in rdep.keys() -assert rdep['job_ID']==rcreate['job_ID'] +assert "job_ID" in rdep.keys() +assert rdep["job_ID"] == rcreate["job_ID"] # Retrieve all deployments rdeps = nomad.get_deployments( - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) assert isinstance(rdeps, list) -assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps]) +assert any([d["job_ID"] == rcreate["job_ID"] for d in rdeps]) # Delete deployment rdel = nomad.delete_deployment( - deployment_uuid=rcreate['job_ID'], - authorization=SimpleNamespace( - credentials=token - ), + deployment_uuid=rcreate["job_ID"], + authorization=SimpleNamespace(credentials=token), ) time.sleep(3) # Nomad takes some time to delete assert isinstance(rdel, dict) -assert 'status' in rdel.keys() +assert "status" in rdel.keys() # Check module no longer exists rdeps3 = nomad.get_deployments( - authorization=SimpleNamespace( - credentials=token - ), + authorization=SimpleNamespace(credentials=token), ) -assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3]) +assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps3]) -print('Try-me (nomad) tests passed!') +print("Try-me (nomad) tests passed!")
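
A note on the test scripts touched by this patch: they all share the same calling convention, invoking the PAPI router functions directly and passing the bearer token wrapped in a `SimpleNamespace` instead of going through HTTP. The sketch below is only an illustration of that pattern, not an additional test; it assumes the import path `ai4papi.routers.v1.deployments` (which is what the test files appear to use) and requires a valid EGI Check-in token exported as `TMP_EGI_TOKEN`.

    import os
    from types import SimpleNamespace

    # Assumed import path, mirroring tests/deployments/modules.py
    from ai4papi.routers.v1.deployments import modules

    # The test scripts expect the token in an env variable:
    #   export TMP_EGI_TOKEN="$(oidc-token egi-checkin)"
    token = os.getenv("TMP_EGI_TOKEN")

    # Router functions are called directly (no HTTP layer); the authorization
    # object only needs a `.credentials` attribute, hence SimpleNamespace.
    deps = modules.get_deployments(
        vos=["vo.ai4eosc.eu"],
        authorization=SimpleNamespace(credentials=token),
    )
    print(f"{len(deps)} deployments found")

Calling the router functions directly keeps the scripts fast and free of an HTTP client dependency, at the cost of skipping FastAPI's dependency injection and request validation, which is presumably why tests/test_launch.py still boots uvicorn in a subprocess to check that the app starts at all.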