diff --git a/.gitignore b/.gitignore
index 5eb1071..94c93b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,7 +67,7 @@ target/
.idea
# VS Code
-.vscode/
+# .vscode/
# Spyder
.spyproject/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..5df93ce
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,16 @@
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v2.3.0
+ hooks:
+ - id: check-yaml
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ # Ruff version.
+ rev: v0.8.0
+ hooks:
+ # Run the linter.
+ - id: ruff
+ types_or: [ python, pyi ]
+ args: [ --fix ]
+ # Run the formatter.
+ - id: ruff-format
+ types_or: [ python, pyi ]
\ No newline at end of file
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..cb75462
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+ "recommendations": [
+ "ms-python.python",
+ "charliermarsh.ruff",
+ "bdsoftware.format-on-auto-save"
+ ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..ebc63ac
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,10 @@
+{
+ "[python]": {
+ "editor.defaultFormatter": "charliermarsh.ruff",
+ "editor.formatOnSave": true
+ },
+ "files.autoSave": "afterDelay",
+ "editor.rulers": [
+ 88
+ ],
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 65936f4..899f96f 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,9 @@
# AI4EOSC - Platform API
[![Conventional Commits](https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white)](https://conventionalcommits.org)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+[![Build Docker](https://github.com/ai4os/ai4-papi/actions/workflows/build-docker-prod.yml/badge.svg)](https://github.com/ai4os/ai4-papi/actions/workflows/build-docker-prod.yml)
+[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/ai4os/ai4-papi/master.svg)](https://results.pre-commit.ci/latest/github/ai4os/ai4-papi/master)
[//]: # ([![GitHub license](https://img.shields.io/github/license/ai4papi/ai4papi.svg)](https://github.com/ai4papi/ai4papi/blob/master/LICENSE))
[//]: # ([![GitHub release](https://img.shields.io/github/release/ai4papi/ai4papi.svg)](https://github.com/ai4papi/ai4papi/releases))
@@ -271,3 +274,19 @@ The pattern for the subfolders follows:
- `user.yaml`: user customizable configuration to make a deployment in Nomad.
Also contains the generic quotas for hardware (see `range` parameter).
- `nomad.hcl`: additional non-customizable values (eg. ports)
+
+### Implementation notes
+
+This repository is formatted with [Ruff](https://docs.astral.sh/ruff/).
+We use the [Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) and [FormatOnSave](https://marketplace.visualstudio.com/items?itemName=BdSoftware.format-on-auto-save) ([issue](https://github.com/microsoft/vscode/issues/45997#issuecomment-950405496)) VS Code extensions to make the development workflow smoother.
+
+We use [pre-commit](https://pre-commit.com/) locally to enforce formatting on commits, and [pre-commit.ci](https://pre-commit.ci/) to enforce it at the GitHub level.
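+
+For reference, a typical local setup could look like this (the exact commands are
+illustrative; adapt them to your environment):
+
+```bash
+pip install pre-commit       # install the hook manager
+pre-commit install           # register the hooks defined in .pre-commit-config.yaml
+pre-commit run --all-files   # run check-yaml, ruff and ruff-format on the whole repo
+```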
diff --git a/ai4papi/auth.py b/ai4papi/auth.py
index 44bc09e..5308002 100644
--- a/ai4papi/auth.py
+++ b/ai4papi/auth.py
@@ -35,27 +35,26 @@
def get_user_info(token):
-
try:
user_infos = flaat.get_user_infos_from_access_token(token)
except Exception as e:
raise HTTPException(
status_code=401,
detail=str(e),
- )
+ )
# Check output
if user_infos is None:
raise HTTPException(
status_code=401,
detail="Invalid token",
- )
+ )
# Retrieve VOs the user belongs to
# VOs can be empty if the user does not belong to any VO, or the
# 'eduperson_entitlement wasn't correctly retrieved from the token
vos = []
- for i in user_infos.get('eduperson_entitlement', []):
+ for i in user_infos.get("eduperson_entitlement", []):
# Parse Virtual Organizations manually from URNs
# If more complexity is need in the future, check https://github.com/oarepo/urnparse
ent_i = re.search(r"group:(.+?):", i)
@@ -63,18 +62,18 @@ def get_user_info(token):
vos.append(ent_i.group(1))
# Generate user info dict
- for k in ['sub', 'iss', 'name', 'email']:
+ for k in ["sub", "iss", "name", "email"]:
if user_infos.get(k) is None:
raise HTTPException(
status_code=401,
detail=f"You token should have scopes for {k}.",
- )
+ )
out = {
- 'id': user_infos.get('sub'), # subject, user-ID
- 'issuer': user_infos.get('iss'), # URL of the access token issuer
- 'name': user_infos.get('name'),
- 'email': user_infos.get('email'),
- 'vos': vos,
+ "id": user_infos.get("sub"), # subject, user-ID
+ "issuer": user_infos.get("iss"), # URL of the access token issuer
+ "name": user_infos.get("name"),
+ "email": user_infos.get("email"),
+ "vos": vos,
}
return out
@@ -90,5 +89,5 @@ def check_vo_membership(
if requested_vo not in user_vos:
raise HTTPException(
status_code=401,
- detail=f"The requested Virtual Organization ({requested_vo}) does not match with any of your available VOs: {user_vos}."
- )
+ detail=f"The requested Virtual Organization ({requested_vo}) does not match with any of your available VOs: {user_vos}.",
+ )
diff --git a/ai4papi/conf.py b/ai4papi/conf.py
index 5ed1867..a1cdc3d 100644
--- a/ai4papi/conf.py
+++ b/ai4papi/conf.py
@@ -14,17 +14,19 @@
# intensive (eg. disables calls to Github API)
# The variable 'FORWARDED_ALLOW_IPS' serves as a proxy for this, as it is only defined
# when running from the Docker container
-IS_DEV = False if os.getenv('FORWARDED_ALLOW_IPS') else True
+IS_DEV = False if os.getenv("FORWARDED_ALLOW_IPS") else True
# Harbor token is kind of mandatory in production, otherwise snapshots won't work.
HARBOR_USER = "robot$user-snapshots+snapshot-api"
-HARBOR_PASS = os.environ.get('HARBOR_ROBOT_PASSWORD')
+HARBOR_PASS = os.environ.get("HARBOR_ROBOT_PASSWORD")
if not HARBOR_PASS:
if IS_DEV:
# Not enforce this for developers
- print("You should define the variable \"HARBOR_ROBOT_PASSWORD\" to use the \"/snapshots\" endpoint.")
+ print(
+ 'You should define the variable "HARBOR_ROBOT_PASSWORD" to use the "/snapshots" endpoint.'
+ )
else:
- raise Exception("You need to define the variable \"HARBOR_ROBOT_PASSWORD\".")
+ raise Exception('You need to define the variable "HARBOR_ROBOT_PASSWORD".')
# Paths
main_path = Path(__file__).parent.absolute()
@@ -34,7 +36,7 @@
}
# Load main API configuration
-with open(paths['conf'] / 'main.yaml', 'r') as f:
+with open(paths["conf"] / "main.yaml", "r") as f:
MAIN_CONF = yaml.safe_load(f)
@@ -42,7 +44,7 @@ def load_nomad_job(fpath):
"""
Load default Nomad job configuration
"""
- with open(fpath, 'r') as f:
+ with open(fpath, "r") as f:
raw_job = f.read()
job_template = Template(raw_job)
return job_template
@@ -52,84 +54,84 @@ def load_yaml_conf(fpath):
"""
Load user customizable parameters
"""
- with open(fpath, 'r') as f:
+ with open(fpath, "r") as f:
conf_full = yaml.safe_load(f)
conf_values = {}
for group_name, params in conf_full.items():
conf_values[group_name] = {}
for k, v in params.items():
- if 'name' not in v.keys():
+ if "name" not in v.keys():
raise Exception(f"Parameter {k} needs to have a name.")
- if 'value' not in v.keys():
+ if "value" not in v.keys():
raise Exception(f"Parameter {k} needs to have a value.")
- conf_values[group_name][k] = v['value']
+ conf_values[group_name][k] = v["value"]
return conf_full, conf_values
# Standard modules
-nmd = load_nomad_job(paths['conf'] / 'modules' / 'nomad.hcl')
-yml = load_yaml_conf(paths['conf'] / 'modules' / 'user.yaml')
+nmd = load_nomad_job(paths["conf"] / "modules" / "nomad.hcl")
+yml = load_yaml_conf(paths["conf"] / "modules" / "user.yaml")
MODULES = {
- 'nomad': nmd,
- 'user': {
- 'full': yml[0],
- 'values': yml[1],
+ "nomad": nmd,
+ "user": {
+ "full": yml[0],
+ "values": yml[1],
},
}
# Tools
-tool_dir = paths['conf'] / 'tools'
+tool_dir = paths["conf"] / "tools"
tool_list = [f for f in tool_dir.iterdir() if f.is_dir()]
TOOLS = {}
for tool_path in tool_list:
- nmd = load_nomad_job(tool_path / 'nomad.hcl')
- yml = load_yaml_conf(tool_path / 'user.yaml')
+ nmd = load_nomad_job(tool_path / "nomad.hcl")
+ yml = load_yaml_conf(tool_path / "user.yaml")
TOOLS[tool_path.name] = {
- 'nomad': nmd,
- 'user': {
- 'full': yml[0],
- 'values': yml[1],
+ "nomad": nmd,
+ "user": {
+ "full": yml[0],
+ "values": yml[1],
},
}
# For tools, map the Nomad job name prefixes to tool IDs
tools_nomad2id = {
- 'fl': 'ai4os-federated-server',
- 'cvat': 'ai4os-cvat',
+ "fl": "ai4os-federated-server",
+ "cvat": "ai4os-cvat",
}
for tool in TOOLS.keys():
if tool not in tools_nomad2id.values():
raise Exception(f"The tool {tool} is missing from the mapping dictionary.")
# OSCAR template
-with open(paths['conf'] / 'oscar.yaml', 'r') as f:
+with open(paths["conf"] / "oscar.yaml", "r") as f:
OSCAR_TMPL = Template(f.read())
# Try-me endpoints
-nmd = load_nomad_job(paths['conf'] / 'try_me' / 'nomad.hcl')
+nmd = load_nomad_job(paths["conf"] / "try_me" / "nomad.hcl")
TRY_ME = {
- 'nomad': nmd,
+ "nomad": nmd,
}
# Snapshot endpoints
-nmd = load_nomad_job(paths['conf'] / 'snapshots' / 'nomad.hcl')
+nmd = load_nomad_job(paths["conf"] / "snapshots" / "nomad.hcl")
SNAPSHOTS = {
- 'nomad': nmd,
+ "nomad": nmd,
}
# Retrieve git info from PAPI, to show current version in the docs
papi_commit = subprocess.run(
- ['git', 'log', '-1', '--format=%H'],
+ ["git", "log", "-1", "--format=%H"],
stdout=subprocess.PIPE,
text=True,
cwd=main_path,
).stdout.strip()
papi_branch = subprocess.run(
- ['git', 'rev-parse', '--abbrev-ref', '--symbolic-full-name', '@{u}'],
+ ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{u}"],
stdout=subprocess.PIPE,
text=True,
cwd=main_path,
).stdout.strip()
-papi_branch = papi_branch.split('/')[-1] # remove the "origin/" part
+papi_branch = papi_branch.split("/")[-1] # remove the "origin/" part
diff --git a/ai4papi/main.py b/ai4papi/main.py
index d101a5f..404450f 100644
--- a/ai4papi/main.py
+++ b/ai4papi/main.py
@@ -19,33 +19,29 @@
" src='https://ai4eosc.eu/wp-content/uploads/sites/10/2023/01/horizontal-bg-dark.png'"
" width=200 alt='' />"
"
"
-
"This is the Platform API for interacting with the AI4EOSC services. "
"It aims at providing a stable UI, effectively decoupling the services offered by "
"the project from the underlying tools we use to provide them (ie. Nomad)."
"
"
-
"You can also access the functionalities of the API through our dashboards:
"
"- [AIEOSC Dashboard](https://dashboard.cloud.ai4eosc.eu/)
"
"- [iMagine Dashboard](https://dashboard.cloud.imagine-ai.eu/)"
"
"
-
"For more information, please visit:
"
"- [AI4EOSC Homepage](https://ai4eosc.eu)
"
"- [API Github repository](https://github.com/AI4EOSC/ai4-papi)"
"
"
-
"**Acknowledgements**
"
"This work is co-funded by [AI4EOSC](https://ai4eosc.eu/) project that has "
"received funding from the European Union's Horizon Europe 2022 research and "
"innovation programme under agreement No 101058593"
"
"
-
"PAPI version:"
f"[`ai4-papi/{papi_branch}@{papi_commit[:5]}`]"
f"(https://github.com/ai4os/ai4-papi/tree/{papi_commit})"
)
+
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
# on startup
@@ -114,11 +110,11 @@ async def favicon():
def run(
- host:str = "0.0.0.0",
- port:int = 8080,
- ssl_keyfile:str = None,
- ssl_certfile:str = None,
- ):
+ host: str = "0.0.0.0",
+ port: int = 8080,
+ ssl_keyfile: str = None,
+ ssl_certfile: str = None,
+):
uvicorn.run(
app,
host=host,
diff --git a/ai4papi/module_patches.py b/ai4papi/module_patches.py
index 718cb7f..2c1dfed 100644
--- a/ai4papi/module_patches.py
+++ b/ai4papi/module_patches.py
@@ -3,10 +3,11 @@
fix/rebuild them.
"""
+
def patch_nextcloud_mount(
docker_image: str,
task: dict,
- ):
+):
"""
Some module are blocked when running deepaas.
@@ -37,10 +38,10 @@ def patch_nextcloud_mount(
"DEEP-OC-image-classification-tf-dicom",
"DEEP-OC-speech-to-text-tf",
]
- modules = [f'deephdc/{m.lower()}' for m in modules]
+ modules = [f"deephdc/{m.lower()}" for m in modules]
# TODO: this will need to be updated to ai4os-hub
if docker_image in modules:
- task['Env']['RCLONE_CONTIMEOUT'] = '1s'
+ task["Env"]["RCLONE_CONTIMEOUT"] = "1s"
return task
diff --git a/ai4papi/nomad/common.py b/ai4papi/nomad/common.py
index 0f59662..9631bcc 100644
--- a/ai4papi/nomad/common.py
+++ b/ai4papi/nomad/common.py
@@ -23,18 +23,9 @@
Nomad = nomad.Nomad()
# TODO: Remove monkey-patches when the code is merged to python-nomad Pypi package
-Nomad.job.deregister_job = types.MethodType(
- nomad_patches.deregister_job,
- Nomad.job
- )
-Nomad.job.get_allocations = types.MethodType(
- nomad_patches.get_allocations,
- Nomad.job
- )
-Nomad.job.get_evaluations = types.MethodType(
- nomad_patches.get_allocations,
- Nomad.job
- )
+Nomad.job.deregister_job = types.MethodType(nomad_patches.deregister_job, Nomad.job)
+Nomad.job.get_allocations = types.MethodType(nomad_patches.get_allocations, Nomad.job)
+Nomad.job.get_evaluations = types.MethodType(nomad_patches.get_allocations, Nomad.job)
# Persistent requests session for faster requests
session = requests.Session()
@@ -44,15 +35,16 @@ def get_deployments(
namespace: str,
owner: str,
prefix: str = "",
- ):
+):
"""
Returns a list of all deployments belonging to a user, in a given namespace.
"""
- job_filter = \
- 'Status != "dead" and ' + \
- f'Name matches "^{prefix}" and ' + \
- 'Meta is not empty and ' + \
- f'Meta.owner == "{owner}"'
+ job_filter = (
+ 'Status != "dead" and '
+ + f'Name matches "^{prefix}" and '
+ + "Meta is not empty and "
+ + f'Meta.owner == "{owner}"'
+ )
jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_=job_filter)
return jobs
@@ -62,7 +54,7 @@ def get_deployment(
namespace: str,
owner: str,
full_info: True,
- ):
+):
"""
Retrieve the info of a specific deployment.
Format outputs to a Nomad-independent format to be used by the Dashboard
@@ -80,60 +72,60 @@ def get_deployment(
j = Nomad.job.get_job(
id_=deployment_uuid,
namespace=namespace,
- )
+ )
except exceptions.URLNotFoundNomadException:
raise HTTPException(
status_code=400,
detail="No deployment exists with this uuid.",
- )
+ )
# Check job does belong to owner
- if j['Meta'] and owner != j['Meta'].get('owner', ''):
+ if j["Meta"] and owner != j["Meta"].get("owner", ""):
raise HTTPException(
status_code=400,
detail="You are not the owner of that deployment.",
- )
+ )
# Create job info dict
info = {
- 'job_ID': j['ID'],
- 'name': j['Name'],
- 'status': '', # do not use j['Status'] as misleading
- 'owner': j['Meta']['owner'],
- 'title': j['Meta']['title'],
- 'description': j['Meta']['description'],
- 'docker_image': None,
- 'docker_command': None,
- 'submit_time': datetime.fromtimestamp(
- j['SubmitTime'] // 1000000000
- ).strftime('%Y-%m-%d %H:%M:%S'), # nanoseconds to timestamp
- 'resources': {},
- 'endpoints': {},
- 'active_endpoints': None,
- 'main_endpoint': None,
- 'alloc_ID': None,
- 'datacenter': None,
+ "job_ID": j["ID"],
+ "name": j["Name"],
+ "status": "", # do not use j['Status'] as misleading
+ "owner": j["Meta"]["owner"],
+ "title": j["Meta"]["title"],
+ "description": j["Meta"]["description"],
+ "docker_image": None,
+ "docker_command": None,
+ "submit_time": datetime.fromtimestamp(j["SubmitTime"] // 1000000000).strftime(
+ "%Y-%m-%d %H:%M:%S"
+ ), # nanoseconds to timestamp
+ "resources": {},
+ "endpoints": {},
+ "active_endpoints": None,
+ "main_endpoint": None,
+ "alloc_ID": None,
+ "datacenter": None,
}
# Retrieve tasks
- tasks = j['TaskGroups'][0]['Tasks']
- usertask = [t for t in tasks if t['Name'] == 'main'][0]
+ tasks = j["TaskGroups"][0]["Tasks"]
+ usertask = [t for t in tasks if t["Name"] == "main"][0]
# Retrieve Docker image
- info['docker_image'] = usertask['Config']['image']
- command = usertask['Config'].get('command', '')
- args = usertask['Config'].get('args', [])
- info['docker_command'] = f"{command} {' '.join(args)}".strip()
+ info["docker_image"] = usertask["Config"]["image"]
+ command = usertask["Config"].get("command", "")
+ args = usertask["Config"].get("args", [])
+ info["docker_command"] = f"{command} {' '.join(args)}".strip()
# Add endpoints
- info['endpoints'] = {}
- for s in j['TaskGroups'][0]['Services']:
- label = s['PortLabel']
+ info["endpoints"] = {}
+ for s in j["TaskGroups"][0]["Services"]:
+ label = s["PortLabel"]
# Iterate through tags to find `Host` tag
- for t in s['Tags']:
+ for t in s["Tags"]:
try:
- url = re.search(r'Host\(`(.+?)`', t).group(1)
+ url = re.search(r"Host\(`(.+?)`", t).group(1)
break
except Exception:
url = "missing-endpoint"
@@ -141,126 +133,139 @@ def get_deployment(
# Old deployments had network ports with names [deepaas, ide, monitor]
# instead of [api, ide, monitor] so we have to manually replace them
# see: https://github.com/AI4EOSC/ai4-papi/issues/22
- if label == 'deepaas':
- label = 'api'
+ if label == "deepaas":
+ label = "api"
- info['endpoints'][label] = f"http://{url}"
+ info["endpoints"][label] = f"http://{url}"
# Add '/ui' to deepaas endpoint
# If in the future we support other APIs, this will have to be removed.
- if 'api' in info['endpoints'].keys():
- info['endpoints']['api'] += '/ui'
+ if "api" in info["endpoints"].keys():
+ info["endpoints"]["api"] += "/ui"
# Add quick-access (main endpoint) + customize endpoints
service2endpoint = {
- 'deepaas': 'api',
- 'jupyter': 'ide',
- 'vscode': 'ide',
+ "deepaas": "api",
+ "jupyter": "ide",
+ "vscode": "ide",
}
try: # deep-start compatible service
service = re.search(
- 'deep-start --(.*)$',
- info['docker_command'],
- ).group(1)
+ "deep-start --(.*)$",
+ info["docker_command"],
+ ).group(1)
- info['main_endpoint'] = service2endpoint[service]
+ info["main_endpoint"] = service2endpoint[service]
except Exception: # return first endpoint
- info['main_endpoint'] = list(info['endpoints'].keys())[0]
+ info["main_endpoint"] = list(info["endpoints"].keys())[0]
# Only fill resources if the job is allocated
allocs = Nomad.job.get_allocations(
- id_=j['ID'],
+ id_=j["ID"],
namespace=namespace,
- )
+ )
evals = Nomad.job.get_evaluations(
- id_=j['ID'],
+ id_=j["ID"],
namespace=namespace,
- )
+ )
if allocs:
-
# Reorder allocations based on recency
- dates = [a['CreateTime'] for a in allocs]
- allocs = [x for _, x in sorted(
- zip(dates, allocs),
- key=lambda pair: pair[0],
- )][::-1] # more recent first
+ dates = [a["CreateTime"] for a in allocs]
+ allocs = [
+ x
+ for _, x in sorted(
+ zip(dates, allocs),
+ key=lambda pair: pair[0],
+ )
+ ][::-1] # more recent first
# Select the proper allocation
- statuses = [a['ClientStatus'] for a in allocs]
- if 'unknown' in statuses:
+ statuses = [a["ClientStatus"] for a in allocs]
+ if "unknown" in statuses:
# The node has lost connection. Avoid showing temporary reallocated job,
# to avoid confusions when the original allocation is restored back again.
- idx = statuses.index('unknown')
- elif 'running' in statuses:
+ idx = statuses.index("unknown")
+ elif "running" in statuses:
# If an allocation is running, return that allocation
# It happens that after a network cut, when the network is restored,
# the temporary allocation created in the meantime (now with status
# 'complete') is more recent than the original allocation that we
# recovered (with status 'running'), so using only recency does not work.
- idx = statuses.index('running')
+ idx = statuses.index("running")
else:
# Return most recent allocation
idx = 0
- a = Nomad.allocation.get_allocation(allocs[idx]['ID'])
+ a = Nomad.allocation.get_allocation(allocs[idx]["ID"])
# Add ID
- info['alloc_ID'] = a['ID']
+ info["alloc_ID"] = a["ID"]
# Add datacenter
- info['datacenter'] = Nomad.node.get_node(a['NodeID'])['Datacenter']
+ info["datacenter"] = Nomad.node.get_node(a["NodeID"])["Datacenter"]
# Replace Nomad status with a more user-friendly status
# Final list includes: starting, down, running, complete, failed, ...
# We use the status of the "main" task because it isn more relevant the the
# status of the overall job (a['ClientStatus'])
- status = a['TaskStates']['main']['State'] if a.get('TaskStates') else 'queued'
+ status = a["TaskStates"]["main"]["State"] if a.get("TaskStates") else "queued"
status_map = { # nomad: papi
- 'pending': 'starting',
- 'unknown': 'down',
+ "pending": "starting",
+ "unknown": "down",
}
- info['status'] = status_map.get(status, status) # if not mapped, then return original status
+ info["status"] = status_map.get(
+ status, status
+ ) # if not mapped, then return original status
# Add error messages if needed
- if info['status'] == 'failed':
- info['error_msg'] = a['TaskStates']['main']['Events'][0]['Message']
+ if info["status"] == "failed":
+ info["error_msg"] = a["TaskStates"]["main"]["Events"][0]["Message"]
# Replace with clearer message
- if info['error_msg'] == 'Docker container exited with non-zero exit code: 1':
- info['error_msg'] = \
- "An error seems to appear when running this Docker container. " \
- "Try to run this Docker locally with the command " \
- f"`{info['docker_command']}` to find what is the error " \
+ if (
+ info["error_msg"]
+ == "Docker container exited with non-zero exit code: 1"
+ ):
+ info["error_msg"] = (
+ "An error seems to appear when running this Docker container. "
+ "Try to run this Docker locally with the command "
+ f"`{info['docker_command']}` to find what is the error "
"or contact the module owner."
+ )
- elif info['status'] == 'down':
- info['error_msg'] = \
- "There seems to be network issues in the cluster. Please wait until " \
- "the network is restored and you should be able to fully recover " \
+ elif info["status"] == "down":
+ info["error_msg"] = (
+ "There seems to be network issues in the cluster. Please wait until "
+ "the network is restored and you should be able to fully recover "
"your deployment."
+ )
# Add resources
- res = a['AllocatedResources']['Tasks']['main']
- gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0] if res['Devices'] else None
- cpu_cores = res['Cpu']['ReservedCores']
- info['resources'] = {
- 'cpu_num': len(cpu_cores) if cpu_cores else 0,
- 'cpu_MHz': res['Cpu']['CpuShares'],
- 'gpu_num': len(gpu['DeviceIDs']) if gpu else 0,
- 'memory_MB': res['Memory']['MemoryMB'],
- 'disk_MB': a['AllocatedResources']['Shared']['DiskMB'],
+ res = a["AllocatedResources"]["Tasks"]["main"]
+ gpu = (
+ [d for d in res["Devices"] if d["Type"] == "gpu"][0]
+ if res["Devices"]
+ else None
+ )
+ cpu_cores = res["Cpu"]["ReservedCores"]
+ info["resources"] = {
+ "cpu_num": len(cpu_cores) if cpu_cores else 0,
+ "cpu_MHz": res["Cpu"]["CpuShares"],
+ "gpu_num": len(gpu["DeviceIDs"]) if gpu else 0,
+ "memory_MB": res["Memory"]["MemoryMB"],
+ "disk_MB": a["AllocatedResources"]["Shared"]["DiskMB"],
}
# Retrieve the node the jobs landed at in order to properly fill the endpoints
- n = Nomad.node.get_node(a['NodeID'])
- for k, v in info['endpoints'].items():
- info['endpoints'][k] = v.replace('${meta.domain}', n['Meta']['domain'])
+ n = Nomad.node.get_node(a["NodeID"])
+ for k, v in info["endpoints"].items():
+ info["endpoints"][k] = v.replace("${meta.domain}", n["Meta"]["domain"])
# Add active endpoints
if full_info:
- info['active_endpoints'] = []
- for k, v in info['endpoints'].items():
+ info["active_endpoints"] = []
+ for k, v in info["endpoints"].items():
try:
# We use GET and not HEAD, because HEAD is not returning the correct status_codes (even with "allow_redirects=True")
# Anyway, both latencies are almost the same when using "allow_redirects=True"
@@ -269,33 +274,40 @@ def get_deployment(
# * Non existing domain: GET (404), HEAD (404) | latency: ~40 ms
r = session.get(v, timeout=2)
if r.ok:
- info['active_endpoints'].append(k)
- except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
+ info["active_endpoints"].append(k)
+ except (
+ requests.exceptions.Timeout,
+ requests.exceptions.ConnectionError,
+ ):
continue
# Disable access to endpoints if there is a network cut
- if info['status'] == 'down' and info['active_endpoints']:
- info['active_endpoints'] = []
+ if info["status"] == "down" and info["active_endpoints"]:
+ info["active_endpoints"] = []
elif evals:
# Something happened, job didn't deploy (eg. job needs port that's currently being used)
# We have to return `placement failures message`.
- info['status'] = 'error'
- info['error_msg'] = f"{evals[0].get('FailedTGAllocs', '')}"
+ info["status"] = "error"
+ info["error_msg"] = f"{evals[0].get('FailedTGAllocs', '')}"
else:
# info['error_msg'] = f"Job has not been yet evaluated. Contact with support sharing your job ID: {j['ID']}."
- info['status'] = 'queued'
+ info["status"] = "queued"
# Fill info with _requested_ resources instead
- res = usertask['Resources']
- gpu = [d for d in res['Devices'] if d['Name'] == 'gpu'][0] if res['Devices'] else None
- info['resources'] = {
- 'cpu_num': res['Cores'],
- 'cpu_MHz': 0, # not known before allocation
- 'gpu_num': gpu['Count'] if gpu else 0,
- 'memory_MB': res['MemoryMB'],
- 'disk_MB': j['TaskGroups'][0]['EphemeralDisk']['SizeMB'],
+ res = usertask["Resources"]
+ gpu = (
+ [d for d in res["Devices"] if d["Name"] == "gpu"][0]
+ if res["Devices"]
+ else None
+ )
+ info["resources"] = {
+ "cpu_num": res["Cores"],
+ "cpu_MHz": 0, # not known before allocation
+ "gpu_num": gpu["Count"] if gpu else 0,
+ "memory_MB": res["MemoryMB"],
+ "disk_MB": j["TaskGroups"][0]["EphemeralDisk"]["SizeMB"],
}
return info
@@ -303,7 +315,7 @@ def get_deployment(
def load_job_conf(
raw_str: str,
- ):
+):
"""
Transform raw hcl string to Nomad dict object
"""
@@ -312,21 +324,21 @@ def load_job_conf(
def create_deployment(
conf: dict,
- ):
+):
"""
Submit a deployment to Nomad.
"""
# Submit job
try:
- _ = Nomad.jobs.register_job({'Job': conf})
+ _ = Nomad.jobs.register_job({"Job": conf})
return {
- 'status': 'success',
- 'job_ID': conf['ID'],
+ "status": "success",
+ "job_ID": conf["ID"],
}
except Exception as e:
return {
- 'status': 'fail',
- 'error_msg': str(e),
+ "status": "fail",
+ "error_msg": str(e),
}
@@ -334,7 +346,7 @@ def delete_deployment(
deployment_uuid: str,
namespace: str,
owner: str,
- ):
+):
"""
Delete a deployment. Users can only delete their own deployments.
@@ -352,12 +364,12 @@ def delete_deployment(
namespace=namespace,
owner=owner,
full_info=False,
- )
+ )
# If job is in stuck status, allow deleting with purge.
# Most of the time, when a job is in this status, it is due to a platform error.
# It gets stuck and cannot be deleted without purge
- if info['status'] in ['queued', 'complete', 'failed', 'error', 'down'] :
+ if info["status"] in ["queued", "complete", "failed", "error", "down"]:
purge = True
else:
purge = False
@@ -367,12 +379,12 @@ def delete_deployment(
id_=deployment_uuid,
namespace=namespace,
purge=purge,
- )
+ )
- return {'status': 'success'}
+ return {"status": "success"}
-@cached(cache=TTLCache(maxsize=1024, ttl=1*60*60))
+@cached(cache=TTLCache(maxsize=1024, ttl=1 * 60 * 60))
def get_gpu_models(vo):
"""
Retrieve available GPU models in the cluster, filtering nodes by VO.
@@ -381,18 +393,18 @@ def get_gpu_models(vo):
nodes = Nomad.nodes.get_nodes(resources=True)
for node in nodes:
# Discard nodes that don't belong to the requested VO
- meta = Nomad.node.get_node(node['ID'])['Meta']
- if papiconf.MAIN_CONF['nomad']['namespaces'][vo] not in meta['namespace']:
+ meta = Nomad.node.get_node(node["ID"])["Meta"]
+ if papiconf.MAIN_CONF["nomad"]["namespaces"][vo] not in meta["namespace"]:
continue
# Discard GPU models of nodes that are not eligible
- if node['SchedulingEligibility'] != 'eligible':
+ if node["SchedulingEligibility"] != "eligible":
continue
# Retrieve GPU models of the node
- devices = node['NodeResources']['Devices']
- gpus = [d for d in devices if d['Type'] == 'gpu'] if devices else []
+ devices = node["NodeResources"]["Devices"]
+ gpus = [d for d in devices if d["Type"] == "gpu"] if devices else []
for gpu in gpus:
- gpu_models.add(gpu['Name'])
+ gpu_models.add(gpu["Name"])
return list(gpu_models)
diff --git a/ai4papi/nomad/patches.py b/ai4papi/nomad/patches.py
index 9154257..7488de6 100644
--- a/ai4papi/nomad/patches.py
+++ b/ai4papi/nomad/patches.py
@@ -2,6 +2,7 @@
Miscellaneous Nomad patches
#TODO: remove when new nomad-python release is launched.
"""
+
from typing import Union
import nomad
@@ -17,44 +18,44 @@ def deregister_job(
global_: Union[bool, None] = None,
namespace: Union[str, None] = None,
purge: Union[bool, None] = None,
- ):
- """ ================================================================================
- This is a monkey-patch of the default function in the python-nomad module,
- that did not support `namespace` as a parameter of the function.
-
- Remove when PR is merged:
- https://github.com/jrxFive/python-nomad/pull/153
-
- ================================================================================
-
- Deregisters a job, and stops all allocations part of it.
-
- https://www.nomadproject.io/docs/http/job.html
-
- arguments:
- - id
- - eval_priority (int) optional.
- Override the priority of the evaluations produced as a result
- of this job deregistration. By default, this is set to the
- priority of the job.
- - global (bool) optional.
- Stop a multi-region job in all its regions. By default, job
- stop will stop only a single region at a time. Ignored for
- single-region jobs.
- - purge (bool) optional.
- Specifies that the job should be stopped and purged immediately.
- This means the job will not be queryable after being stopped.
- If not set, the job will be purged by the garbage collector.
- - namespace (str) optional.
- Specifies the target namespace. If ACL is enabled, this value
- must match a namespace that the token is allowed to access.
- This is specified as a query string parameter.
-
- returns: dict
- raises:
- - nomad.api.exceptions.BaseNomadException
- - nomad.api.exceptions.URLNotFoundNomadException
- - nomad.api.exceptions.InvalidParameters
+):
+ """================================================================================
+ This is a monkey-patch of the default function in the python-nomad module,
+ that did not support `namespace` as a parameter of the function.
+
+ Remove when PR is merged:
+ https://github.com/jrxFive/python-nomad/pull/153
+
+ ================================================================================
+
+ Deregisters a job, and stops all allocations part of it.
+
+ https://www.nomadproject.io/docs/http/job.html
+
+ arguments:
+ - id
+ - eval_priority (int) optional.
+ Override the priority of the evaluations produced as a result
+ of this job deregistration. By default, this is set to the
+ priority of the job.
+ - global (bool) optional.
+ Stop a multi-region job in all its regions. By default, job
+ stop will stop only a single region at a time. Ignored for
+ single-region jobs.
+ - purge (bool) optional.
+ Specifies that the job should be stopped and purged immediately.
+ This means the job will not be queryable after being stopped.
+ If not set, the job will be purged by the garbage collector.
+ - namespace (str) optional.
+ Specifies the target namespace. If ACL is enabled, this value
+ must match a namespace that the token is allowed to access.
+ This is specified as a query string parameter.
+
+ returns: dict
+ raises:
+ - nomad.api.exceptions.BaseNomadException
+ - nomad.api.exceptions.URLNotFoundNomadException
+ - nomad.api.exceptions.InvalidParameters
"""
params = {
"eval_priority": eval_priority,
@@ -70,7 +71,7 @@ def get_allocations(
id_: str,
all_: Union[bool, None] = None,
namespace: Union[str, None] = None,
- ):
+):
"""Query the allocations belonging to a single job.
https://www.nomadproject.io/docs/http/job.html
@@ -98,7 +99,7 @@ def get_evaluations(
self,
id_: str,
namespace: Union[str, None] = None,
- ):
+):
"""Query the evaluations belonging to a single job.
https://www.nomadproject.io/docs/http/job.html
diff --git a/ai4papi/quotas.py b/ai4papi/quotas.py
index b1d0142..cc35c11 100644
--- a/ai4papi/quotas.py
+++ b/ai4papi/quotas.py
@@ -1,6 +1,7 @@
"""
Accounting of resources.
"""
+
from copy import deepcopy
from fastapi import HTTPException
@@ -11,88 +12,91 @@
def check_jobwise(
conf: dict,
vo: str,
- ):
+):
"""
Check the job configuration does not overflow the generic hardware limits.
"""
# Retrieve generic quotas (vo-dependent)
- item_name = conf['general']['docker_image'].split('/')[-1]
+ item_name = conf["general"]["docker_image"].split("/")[-1]
ref = limit_resources(
item_name=item_name,
vo=vo,
)
# Compare with user options
- user_conf = conf['hardware']
+ user_conf = conf["hardware"]
for k in ref.keys():
- if 'range' in ref[k].keys():
- if user_conf[k] < ref[k]['range'][0]:
+ if "range" in ref[k].keys():
+ if user_conf[k] < ref[k]["range"][0]:
raise HTTPException(
status_code=400,
- detail=f"The parameter {k} should bigger or equal to {ref[k]['range'][0]}."
- )
- if user_conf[k] > ref[k]['range'][1]:
+                    detail=f"The parameter {k} should be bigger or equal to {ref[k]['range'][0]}.",
+ )
+ if user_conf[k] > ref[k]["range"][1]:
raise HTTPException(
status_code=400,
- detail=f"The parameter {k} should smaller or equal to {ref[k]['range'][1]}."
- )
+                    detail=f"The parameter {k} should be smaller or equal to {ref[k]['range'][1]}.",
+ )
def check_userwise(
conf: dict,
deployments: dict,
- ):
+):
"""
Check the job configuration does not overflow the generic hardware limits.
For example, a user cannot have more than two GPUs running/queued.
"""
# Aggregate user resources
- user = {'gpu_num': 0}
+ user = {"gpu_num": 0}
for d in deployments:
- user['gpu_num'] += d['resources']['gpu_num']
+ user["gpu_num"] += d["resources"]["gpu_num"]
# Check if aggregate is within the limits
- threshold = {'gpu_num': 2}
- if (user['gpu_num'] + conf['hardware']['gpu_num']) > threshold['gpu_num'] and \
- conf['hardware']['gpu_num']:
+ threshold = {"gpu_num": 2}
+ if (user["gpu_num"] + conf["hardware"]["gpu_num"]) > threshold["gpu_num"] and conf[
+ "hardware"
+ ]["gpu_num"]:
# TODO: remove this last line ("and conf['hardware']['gpu_num']"") once everyone
# is within the quotas. For the time being this line is enabling users that have
# overpassed the quotas (*) to make CPU deployments.
# (*) before the quotas were in place
raise HTTPException(
status_code=400,
- detail="You already have at least 2 GPUs running and/or queued. " \
- "If you want to make a new GPU deployment please delete one of your " \
- "existing ones."
- )
+ detail="You already have at least 2 GPUs running and/or queued. "
+ "If you want to make a new GPU deployment please delete one of your "
+ "existing ones.",
+ )
def limit_resources(
item_name: str,
vo: str,
- ):
+):
"""
Implement hardware limits for specific users or VOs.
"""
# Select appropriate conf
if item_name in papiconf.TOOLS.keys():
- conf = deepcopy(papiconf.TOOLS[item_name]['user']['full'])
+ conf = deepcopy(papiconf.TOOLS[item_name]["user"]["full"])
else:
- conf = deepcopy(papiconf.MODULES['user']['full'])
- conf = conf['hardware']
+ conf = deepcopy(papiconf.MODULES["user"]["full"])
+ conf = conf["hardware"]
# Limit resources for tutorial users
- if vo == 'training.egi.eu':
- if 'cpu_num' in conf.keys():
+ if vo == "training.egi.eu":
+ if "cpu_num" in conf.keys():
conf["cpu_num"]["value"] = 2
conf["cpu_num"]["range"] = [2, 4]
- if 'gpu_num' in conf.keys():
+ if "gpu_num" in conf.keys():
conf["gpu_num"]["range"] = [0, 0]
- conf["gpu_num"]["description"] = "Tutorial users are not allowed to deploy on GPUs."
- if 'ram' in conf.keys():
+ conf["gpu_num"]["description"] = (
+ "Tutorial users are not allowed to deploy on GPUs."
+ )
+ if "ram" in conf.keys():
conf["ram"]["value"] = 2000
conf["ram"]["range"] = [2000, 4000]
- if 'disk' in conf.keys():
+ if "disk" in conf.keys():
conf["disk"]["value"] = 500
conf["disk"]["range"] = [300, 1000]
diff --git a/ai4papi/routers/__init__.py b/ai4papi/routers/__init__.py
index bbf8c7e..326cdf4 100644
--- a/ai4papi/routers/__init__.py
+++ b/ai4papi/routers/__init__.py
@@ -1 +1 @@
-from . import v1
\ No newline at end of file
+from . import v1
diff --git a/ai4papi/routers/v1/__init__.py b/ai4papi/routers/v1/__init__.py
index acce788..34383bd 100644
--- a/ai4papi/routers/v1/__init__.py
+++ b/ai4papi/routers/v1/__init__.py
@@ -1,6 +1,15 @@
import fastapi
-from . import catalog, deployments, inference, secrets, stats, storage, try_me, snapshots
+from . import (
+ catalog,
+ deployments,
+ inference,
+ secrets,
+ stats,
+ storage,
+ try_me,
+ snapshots,
+)
router = fastapi.APIRouter()
diff --git a/ai4papi/routers/v1/catalog/__init__.py b/ai4papi/routers/v1/catalog/__init__.py
index d98ecca..eb9c31d 100644
--- a/ai4papi/routers/v1/catalog/__init__.py
+++ b/ai4papi/routers/v1/catalog/__init__.py
@@ -6,13 +6,13 @@
router = fastapi.APIRouter()
router.include_router(
router=modules.router,
- prefix='/catalog',
- )
+ prefix="/catalog",
+)
router.include_router(
router=tools.router,
- prefix='/catalog',
- )
+ prefix="/catalog",
+)
router.include_router(
router=datasets.router,
- prefix='/datasets',
- )
+ prefix="/datasets",
+)
diff --git a/ai4papi/routers/v1/catalog/common.py b/ai4papi/routers/v1/catalog/common.py
index ae048e7..3d294fe 100644
--- a/ai4papi/routers/v1/catalog/common.py
+++ b/ai4papi/routers/v1/catalog/common.py
@@ -40,12 +40,11 @@
security = HTTPBearer()
-JENKINS_TOKEN = os.getenv('PAPI_JENKINS_TOKEN')
+JENKINS_TOKEN = os.getenv("PAPI_JENKINS_TOKEN")
class Catalog:
-
- def __init__(self, repo:str, item_type:str='item') -> None:
+ def __init__(self, repo: str, item_type: str = "item") -> None:
"""
Parameters:
* repo: Github repo where the catalog is hosted (via git submodules)
@@ -54,11 +53,10 @@ def __init__(self, repo:str, item_type:str='item') -> None:
self.repo = repo
self.item_type = item_type
-
- @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+ @cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60))
def get_items(
self,
- ):
+ ):
"""
Retrieve a dict of *all* items.
```
@@ -72,7 +70,9 @@ def get_items(
This is implemented in a separate function as many functions from this router
are using this function, so we need to avoid infinite recursions.
"""
- gitmodules_url = f"https://raw.githubusercontent.com/{self.repo}/master/.gitmodules"
+ gitmodules_url = (
+ f"https://raw.githubusercontent.com/{self.repo}/master/.gitmodules"
+ )
r = requests.get(gitmodules_url)
cfg = configparser.ConfigParser()
@@ -81,26 +81,26 @@ def get_items(
modules = {}
for section in cfg.sections():
items = dict(cfg.items(section))
- key = items.pop('path')
- items['url'] = items['url'].replace('.git', '') # remove `.git`, if present
+ key = items.pop("path")
+ items["url"] = items["url"].replace(".git", "") # remove `.git`, if present
modules[key] = items
# In the case of the tools repo, make sure to remove any tool that is not yet
# supported by PAPI (use the ^ operator to only keep common items)
- if 'tool' in self.repo:
+ if "tool" in self.repo:
for tool_name in papiconf.TOOLS.keys() ^ modules.keys():
_ = modules.pop(tool_name)
return modules
- @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+ @cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60))
def get_filtered_list(
self,
tags: Union[Tuple, None] = Query(default=None),
tags_any: Union[Tuple, None] = Query(default=None),
not_tags: Union[Tuple, None] = Query(default=None),
not_tags_any: Union[Tuple, None] = Query(default=None),
- ):
+ ):
"""
Retrieve a list of all items.
@@ -113,15 +113,14 @@ def get_filtered_list(
# ValueError: [ValueError('dictionary update sequence element #0 has length 1; 2 is required'), TypeError('vars() argument must have __dict__ attribute')]
return modules
-
- @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+ @cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60))
def get_summary(
self,
tags: Union[Tuple, None] = Query(default=None),
tags_any: Union[Tuple, None] = Query(default=None),
not_tags: Union[Tuple, None] = Query(default=None),
not_tags_any: Union[Tuple, None] = Query(default=None),
- ):
+ ):
"""
Retrieve a list of all items' basic metadata.
@@ -130,23 +129,22 @@ def get_summary(
"""
modules = self.get_filtered_list()
summary = []
- ignore = ['description', 'links'] # don't send this info to decrease latency
+ ignore = ["description", "links"] # don't send this info to decrease latency
for m in modules:
try:
meta1 = self.get_metadata(m)
except Exception:
# Avoid breaking the whole method if failing to retrieve a module
- print(f'Error retrieving metadata: {m}')
+ print(f"Error retrieving metadata: {m}")
continue
meta = {k: v for k, v in meta1.items() if k not in ignore} # filter keys
- meta['name'] = m
+ meta["name"] = m
summary.append(meta)
return summary
-
def get_tags(
self,
- ):
+ ):
"""
Retrieve a list of all the existing tags.
Now deprecated, kept to avoid breaking backward-compatibility.
@@ -154,12 +152,14 @@ def get_tags(
"""
return []
-
- @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60), key=lambda self, item_name: item_name,)
+ @cached(
+ cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60),
+ key=lambda self, item_name: item_name,
+ )
def get_metadata(
self,
item_name: str,
- ):
+ ):
"""
Get the item's full metadata.
"""
@@ -171,31 +171,33 @@ def get_metadata(
raise HTTPException(
status_code=404,
detail=f"Item {item_name} not in catalog: {list(items.keys())}",
- )
+ )
# Retrieve metadata from default branch
# Use try/except to avoid that a single module formatting error could take down
# all the Dashboard
branch = items[item_name].get("branch", "master")
- url = items[item_name]['url'].replace('github.com', 'raw.githubusercontent.com')
+ url = items[item_name]["url"].replace("github.com", "raw.githubusercontent.com")
metadata_url = f"{url}/{branch}/ai4-metadata.yml"
error = None
# Try to retrieve the metadata from Github
r = requests.get(metadata_url)
if not r.ok:
- error = \
- "The metadata of this module could not be retrieved because the " \
+ error = (
+ "The metadata of this module could not be retrieved because the "
"module is lacking a metadata file (`ai4-metadata.yml`)."
+ )
else:
# Try to load the YML file
try:
metadata = yaml.safe_load(r.text)
except Exception:
metadata = None
- error = \
- "The metadata of this module could not be retrieved because the " \
+ error = (
+ "The metadata of this module could not be retrieved because the "
"metadata file is badly formatted (`ai4-metadata.yml`)."
+ )
# Since we are loading the metadata directly from the repo main branch,
# we cannot know if they have successfully passed or not the Jenkins
@@ -205,25 +207,28 @@ def get_metadata(
schema = ai4_metadata.get_schema("2.0.0")
ai4_metadata.validate.validate(instance=metadata, schema=schema)
except Exception:
- error = \
- "The metadata of this module has failed to comply with the " \
- "specifications of the AI4EOSC Platform (see the " \
+ error = (
+ "The metadata of this module has failed to comply with the "
+ "specifications of the AI4EOSC Platform (see the "
"[metadata validator](https://github.com/ai4os/ai4-metadata))."
+ )
# Make sure the repo belongs to one of supported orgs
pattern = r"https?:\/\/(www\.)?github\.com\/([^\/]+)\/"
- match = re.search(pattern, metadata['links']['source_code'])
+ match = re.search(pattern, metadata["links"]["source_code"])
github_org = match.group(2) if match else None
if not github_org:
- error = \
- "This module does not seem to have a valid Github source code. " \
- "If you are the developer of this module, please check the " \
- "\"source_code\" link in your metadata."
- if github_org not in ['ai4os', 'ai4os-hub', 'deephdc']:
- error = \
- "This module belongs to a Github organization not supported by " \
- "the project. If you are the developer of this module, please " \
- "check the \"source_code\" link in your metadata."
+ error = (
+ "This module does not seem to have a valid Github source code. "
+ "If you are the developer of this module, please check the "
+ '"source_code" link in your metadata.'
+ )
+ if github_org not in ["ai4os", "ai4os-hub", "deephdc"]:
+ error = (
+ "This module belongs to a Github organization not supported by "
+ "the project. If you are the developer of this module, please "
+ 'check the "source_code" link in your metadata.'
+ )
# If any of the previous steps raised an error, load a metadata placeholder
if error:
@@ -238,7 +243,7 @@ def get_metadata(
"dates": {
"created": "",
"updated": "",
- },
+ },
"links": {
"documentation": "",
"source_code": "",
@@ -258,32 +263,37 @@ def get_metadata(
else:
# Replace some fields with the info gathered from Github
- pattern = r'github\.com/([^/]+)/([^/]+?)(?:\.git|/)?$'
- match = re.search(pattern, items[item_name]['url'])
+ pattern = r"github\.com/([^/]+)/([^/]+?)(?:\.git|/)?$"
+ match = re.search(pattern, items[item_name]["url"])
if match:
owner, repo = match.group(1), match.group(2)
gh_info = utils.get_github_info(owner, repo)
- metadata.setdefault('dates', {})
- metadata['dates']['created'] = gh_info.get('created', '')
- metadata['dates']['updated'] = gh_info.get('updated', '')
- metadata['license'] = gh_info.get('license', '')
+ metadata.setdefault("dates", {})
+ metadata["dates"]["created"] = gh_info.get("created", "")
+ metadata["dates"]["updated"] = gh_info.get("updated", "")
+ metadata["license"] = gh_info.get("license", "")
# Add Jenkins CI/CD links
- metadata['links']['cicd_url'] = f"https://jenkins.services.ai4os.eu/job/{github_org}/job/{item_name}/job/{branch}/"
- metadata['links']['cicd_badge'] = f"https://jenkins.services.ai4os.eu/buildStatus/icon?job={github_org}/{item_name}/{branch}"
+ metadata["links"]["cicd_url"] = (
+ f"https://jenkins.services.ai4os.eu/job/{github_org}/job/{item_name}/job/{branch}/"
+ )
+ metadata["links"]["cicd_badge"] = (
+ f"https://jenkins.services.ai4os.eu/buildStatus/icon?job={github_org}/{item_name}/{branch}"
+ )
# Add DockerHub
# TODO: when the migration is finished, we have to generate the url from the module name
# (ie. ignore the value coming from the metadata)
- metadata['links']['docker_image'] = metadata['links']['docker_image'].strip('/ ')
+ metadata["links"]["docker_image"] = metadata["links"]["docker_image"].strip(
+ "/ "
+ )
# Add the item name
- metadata['id'] = item_name
+ metadata["id"] = item_name
return metadata
-
def refresh_metadata_cache_entry(
self,
item_name: str,
@@ -318,10 +328,9 @@ def refresh_metadata_cache_entry(
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
-
def get_config(
self,
- ):
+ ):
"""
Returns the default configuration (dict) for creating a deployment
for a specific item. It is prefilled with the appropriate
@@ -332,8 +341,8 @@ def get_config(
def retrieve_docker_tags(
image: str,
- repo: str = 'ai4oshub',
- ):
+ repo: str = "ai4oshub",
+):
"""
Retrieve tags from Dockerhub image
"""
@@ -346,6 +355,6 @@ def retrieve_docker_tags(
raise HTTPException(
status_code=400,
detail=f"Could not retrieve Docker tags from {repo}/{image}.",
- )
+ )
tags = [i["name"] for i in r["results"]]
return tags
diff --git a/ai4papi/routers/v1/catalog/datasets/__init__.py b/ai4papi/routers/v1/catalog/datasets/__init__.py
index 1efe3c6..84bcee3 100644
--- a/ai4papi/routers/v1/catalog/datasets/__init__.py
+++ b/ai4papi/routers/v1/catalog/datasets/__init__.py
@@ -6,4 +6,4 @@
router = fastapi.APIRouter()
router.include_router(
router=zenodo.router,
- )
+)
diff --git a/ai4papi/routers/v1/catalog/datasets/zenodo.py b/ai4papi/routers/v1/catalog/datasets/zenodo.py
index 9916a36..ea20a90 100644
--- a/ai4papi/routers/v1/catalog/datasets/zenodo.py
+++ b/ai4papi/routers/v1/catalog/datasets/zenodo.py
@@ -29,20 +29,20 @@
# If available, authenticate the call to Zenodo to increase rate limit.
# https://developers.zenodo.org/#rate-limiting
-API_URL = 'https://zenodo.org'
+API_URL = "https://zenodo.org"
session = requests.Session()
-zenodo_token = os.environ.get('ZENODO_TOKEN', None)
+zenodo_token = os.environ.get("ZENODO_TOKEN", None)
if zenodo_token:
session.headers = {
- 'Authorization': f'Bearer {zenodo_token}',
+ "Authorization": f"Bearer {zenodo_token}",
}
-@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+@cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60))
def _zenodo_proxy(
api_route: str,
params: Union[frozenset, None] = None,
- ):
+):
"""
We use this hidden function to allow for caching responses.
Otherwise error will be raised, because "authorization" param cannot be cached
@@ -59,11 +59,11 @@ def _zenodo_proxy(
# To avoid security issues, only allow a subset of Zenodo API (to avoid users
# using *our* Zenodo token to update any record)
allowed_routes = [
- '^communities',
- '^communities/[a-zA-Z0-9-]+/records*$',
- '^records/[0-9]+',
- '^records/[0-9]+/versions*$',
- ]
+ "^communities",
+ "^communities/[a-zA-Z0-9-]+/records*$",
+ "^records/[0-9]+",
+ "^records/[0-9]+/versions*$",
+ ]
allowed = False
for i in allowed_routes:
if re.match(i, api_route):
@@ -72,21 +72,20 @@ def _zenodo_proxy(
if not allowed:
raise HTTPException(
status_code=400,
- detail="Zenodo API route not allowed." \
- f"Allowed routes: {allowed_routes}",
- )
+            detail="Zenodo API route not allowed. " f"Allowed routes: {allowed_routes}",
+ )
# Make the call
r = session.get(
f"{API_URL}/api/{api_route}",
params=params,
- )
+ )
if not r.ok:
raise HTTPException(
status_code=500,
detail="Failed to query Zenodo.",
- )
+ )
return r.json()
@@ -96,7 +95,7 @@ def zenodo_proxy(
api_route: str,
params: Union[dict, None] = None,
authorization=Depends(security),
- ):
+):
"""
Zenodo proxy
diff --git a/ai4papi/routers/v1/catalog/modules.py b/ai4papi/routers/v1/catalog/modules.py
index 20b5c58..57d343e 100644
--- a/ai4papi/routers/v1/catalog/modules.py
+++ b/ai4papi/routers/v1/catalog/modules.py
@@ -13,26 +13,26 @@ def get_config(
self,
item_name: str,
vo: str,
- ):
+):
# Check if module exists
modules = self.get_items()
if item_name not in modules.keys():
raise HTTPException(
status_code=400,
detail=f"{item_name} is not an available module.",
- )
+ )
# Retrieve module configuration
- conf = deepcopy(papiconf.MODULES['user']['full'])
+ conf = deepcopy(papiconf.MODULES["user"]["full"])
# Retrieve module metadata
metadata = self.get_metadata(item_name)
# Parse docker registry
- registry = metadata['links']['docker_image']
- repo, image = registry.split('/')[-2:]
- if repo not in ['deephdc', 'ai4oshub']:
- repo = 'ai4oshub'
+ registry = metadata["links"]["docker_image"]
+ repo, image = registry.split("/")[-2:]
+ if repo not in ["deephdc", "ai4oshub"]:
+ repo = "ai4oshub"
# Fill with correct Docker image
conf["general"]["docker_image"]["value"] = f"{repo}/{image}"
@@ -43,7 +43,7 @@ def get_config(
conf["general"]["docker_tag"]["value"] = tags[0]
# Custom conf for development environment
- if item_name == 'ai4os-dev-env':
+ if item_name == "ai4os-dev-env":
# For dev-env, order the tags in "Z-A" order instead of "newest"
# This is done because builds are done in parallel, so "newest" is meaningless
# (Z-A + natsort) allows to show more recent semver first
@@ -52,12 +52,14 @@ def get_config(
conf["general"]["docker_tag"]["value"] = tags[0]
# Use VS Code (Coder OSS) in the development container
- conf["general"]["service"]["value"] = 'vscode'
- conf["general"]["service"]["options"].insert(0, 'vscode')
- conf["general"]["service"]["options"].remove('deepaas') # no models installed in dev
+ conf["general"]["service"]["value"] = "vscode"
+ conf["general"]["service"]["options"].insert(0, "vscode")
+ conf["general"]["service"]["options"].remove(
+ "deepaas"
+ ) # no models installed in dev
# Modify the resources limits for a given user or VO
- conf['hardware'] = quotas.limit_resources(
+ conf["hardware"] = quotas.limit_resources(
item_name=item_name,
vo=vo,
)
@@ -71,8 +73,8 @@ def get_config(
Modules = Catalog(
- repo='ai4os-hub/modules-catalog',
- item_type='module',
+ repo="ai4os-hub/modules-catalog",
+ item_type="module",
)
Modules.get_config = types.MethodType(get_config, Modules)
@@ -86,28 +88,28 @@ def get_config(
"",
Modules.get_filtered_list,
methods=["GET"],
- )
+)
router.add_api_route(
"/detail",
Modules.get_summary,
methods=["GET"],
- )
+)
router.add_api_route(
"/tags",
Modules.get_tags,
methods=["GET"],
deprecated=True,
- )
+)
router.add_api_route(
"/{item_name}/metadata",
Modules.get_metadata,
methods=["GET"],
- )
+)
router.add_api_route(
"/{item_name}/config",
Modules.get_config,
methods=["GET"],
- )
+)
router.add_api_route(
"/{item_name}/refresh",
diff --git a/ai4papi/routers/v1/catalog/tools.py b/ai4papi/routers/v1/catalog/tools.py
index c5d5dab..cbcbeb1 100644
--- a/ai4papi/routers/v1/catalog/tools.py
+++ b/ai4papi/routers/v1/catalog/tools.py
@@ -16,7 +16,7 @@ def get_config(
self,
item_name: str,
vo: str,
- ):
+):
"""
Returns the default configuration (dict) for creating a deployment
for a specific item. It is prefilled with the appropriate
@@ -25,24 +25,24 @@ def get_config(
# Retrieve tool configuration
try:
- conf = deepcopy(papiconf.TOOLS[item_name]['user']['full'])
+ conf = deepcopy(papiconf.TOOLS[item_name]["user"]["full"])
except Exception:
raise HTTPException(
status_code=400,
detail=f"{item_name} is not an available tool.",
- )
+ )
# Retrieve tool metadata
metadata = self.get_metadata(item_name)
# Parse docker registry
- registry = metadata['links']['docker_image']
- repo, image = registry.split('/')[-2:]
- if repo not in ['deephdc', 'ai4oshub']:
- repo = 'ai4oshub'
+ registry = metadata["links"]["docker_image"]
+ repo, image = registry.split("/")[-2:]
+ if repo not in ["deephdc", "ai4oshub"]:
+ repo = "ai4oshub"
# Fill with correct Docker image and tags (not needed for CVAT because hardcoded)
- if item_name != 'ai4os-cvat':
+ if item_name != "ai4os-cvat":
conf["general"]["docker_image"]["value"] = f"{repo}/{image}"
tags = retrieve_docker_tags(image=image, repo=repo)
@@ -60,8 +60,8 @@ def get_config(
Tools = Catalog(
- repo='ai4os/tools-catalog',
- item_type='tool',
+ repo="ai4os/tools-catalog",
+ item_type="tool",
)
Tools.get_config = types.MethodType(get_config, Tools)
@@ -75,28 +75,28 @@ def get_config(
"",
Tools.get_filtered_list,
methods=["GET"],
- )
+)
router.add_api_route(
"/detail",
Tools.get_summary,
methods=["GET"],
- )
+)
router.add_api_route(
"/tags",
Tools.get_tags,
methods=["GET"],
deprecated=True,
- )
+)
router.add_api_route(
"/{item_name}/metadata",
Tools.get_metadata,
methods=["GET"],
- )
+)
router.add_api_route(
"/{item_name}/config",
Tools.get_config,
methods=["GET"],
- )
+)
router.add_api_route(
"/{item_name}/refresh",
Tools.refresh_metadata_cache_entry,
diff --git a/ai4papi/routers/v1/deployments/__init__.py b/ai4papi/routers/v1/deployments/__init__.py
index 7ae724f..e5a4de1 100644
--- a/ai4papi/routers/v1/deployments/__init__.py
+++ b/ai4papi/routers/v1/deployments/__init__.py
@@ -6,9 +6,9 @@
router = fastapi.APIRouter()
router.include_router(
router=modules.router,
- prefix='/deployments',
- )
+ prefix="/deployments",
+)
router.include_router(
router=tools.router,
- prefix='/deployments',
- )
+ prefix="/deployments",
+)
diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py
index 2c4fa39..9ee8091 100644
--- a/ai4papi/routers/v1/deployments/modules.py
+++ b/ai4papi/routers/v1/deployments/modules.py
@@ -1,7 +1,6 @@
from copy import deepcopy
import datetime
import os
-import re
import types
from typing import Tuple, Union
import uuid
@@ -24,9 +23,9 @@
# When deploying in production, force the definition of a provenance token
-provenance_token = os.environ.get('PAPI_PROVENANCE_TOKEN', None)
+provenance_token = os.environ.get("PAPI_PROVENANCE_TOKEN", None)
if not papiconf.IS_DEV and not provenance_token:
- raise Exception("You need to define the variable \"PAPI_PROVENANCE_TOKEN\".")
+ raise Exception('You need to define the variable "PAPI_PROVENANCE_TOKEN".')
@router.get("")
@@ -34,7 +33,7 @@ def get_deployments(
vos: Union[Tuple, None] = Query(default=None),
full_info: bool = Query(default=False),
authorization=Depends(security),
- ):
+):
"""
Returns a list of all deployments belonging to a user.
@@ -51,21 +50,21 @@ def get_deployments(
# If no VOs, then retrieve jobs from all user VOs
# Always remove VOs that do not belong to the project
if not vos:
- vos = auth_info['vos']
- vos = set(vos).intersection(set(papiconf.MAIN_CONF['auth']['VO']))
+ vos = auth_info["vos"]
+ vos = set(vos).intersection(set(papiconf.MAIN_CONF["auth"]["VO"]))
if not vos:
raise HTTPException(
status_code=401,
- detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}."
- )
+ detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}.",
+ )
user_jobs = []
for vo in vos:
# Retrieve all jobs in namespace
jobs = nomad.get_deployments(
- namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- owner=auth_info['id'],
- prefix='module',
+ namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ owner=auth_info["id"],
+ prefix="module",
)
# Retrieve info for jobs in namespace
@@ -73,7 +72,7 @@ def get_deployments(
try:
job_info = get_deployment(
vo=vo,
- deployment_uuid=j['ID'],
+ deployment_uuid=j["ID"],
full_info=full_info,
authorization=types.SimpleNamespace(
credentials=authorization.credentials # token
@@ -82,12 +81,12 @@ def get_deployments(
except HTTPException: # not a module
continue
except Exception as e: # unexpected error
- raise(e)
+ raise (e)
user_jobs.append(job_info)
# Sort deployments by creation date
- seq = [j['submit_time'] for j in user_jobs]
+ seq = [j["submit_time"] for j in user_jobs]
args = sorted(range(len(seq)), key=seq.__getitem__)[::-1]
sorted_jobs = [user_jobs[i] for i in args]
@@ -100,7 +99,7 @@ def get_deployment(
deployment_uuid: str,
full_info: bool = Query(default=True),
authorization=Depends(security),
- ):
+):
"""
Retrieve the info of a specific deployment.
Format outputs to a Nomad-independent format to be used by the Dashboard
@@ -119,24 +118,24 @@ def get_deployment(
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Retrieve the associated namespace to that VO
- namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo]
+ namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo]
job = nomad.get_deployment(
deployment_uuid=deployment_uuid,
namespace=namespace,
- owner=auth_info['id'],
+ owner=auth_info["id"],
full_info=full_info,
)
# Check the deployment is indeed a module
- if not job['name'].startswith('module'):
+ if not job["name"].startswith("module"):
raise HTTPException(
status_code=400,
detail="This deployment is not a module.",
- )
+ )
return job
@@ -146,7 +145,7 @@ def create_deployment(
vo: str,
conf: Union[dict, None] = None,
authorization=Depends(security),
- ):
+):
"""
Submit a deployment to Nomad.
@@ -172,11 +171,11 @@ def create_deployment(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Load module configuration
- nomad_conf = deepcopy(papiconf.MODULES['nomad'])
- user_conf = deepcopy(papiconf.MODULES['user']['values'])
+ nomad_conf = deepcopy(papiconf.MODULES["nomad"])
+ user_conf = deepcopy(papiconf.MODULES["user"]["values"])
# Update values conf in case we received a submitted conf
if conf is not None:
@@ -210,128 +209,132 @@ def create_deployment(
job_uuid = uuid.uuid1()
# Jobs from tutorial users should have low priority (ie. can be displaced if needed)
- if vo == 'training.egi.eu':
+ if vo == "training.egi.eu":
priority = 25
else:
priority = 50
- base_domain = papiconf.MAIN_CONF['lb']['domain'][vo]
+ base_domain = papiconf.MAIN_CONF["lb"]["domain"][vo]
# Replace the Nomad job template
nomad_conf = nomad_conf.safe_substitute(
{
- 'JOB_UUID': job_uuid,
- 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- 'PRIORITY': priority,
- 'OWNER': auth_info['id'],
- 'OWNER_NAME': auth_info['name'],
- 'OWNER_EMAIL': auth_info['email'],
- 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters
- 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters
- 'BASE_DOMAIN': base_domain,
- 'HOSTNAME': job_uuid,
- 'DOCKER_IMAGE': user_conf['general']['docker_image'],
- 'DOCKER_TAG': user_conf['general']['docker_tag'],
- 'SERVICE': user_conf['general']['service'],
- 'CPU_NUM': user_conf['hardware']['cpu_num'],
- 'RAM': user_conf['hardware']['ram'],
- 'DISK': user_conf['hardware']['disk'],
- 'SHARED_MEMORY': user_conf['hardware']['ram'] * 10**6 * 0.5,
+ "JOB_UUID": job_uuid,
+ "NAMESPACE": papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ "PRIORITY": priority,
+ "OWNER": auth_info["id"],
+ "OWNER_NAME": auth_info["name"],
+ "OWNER_EMAIL": auth_info["email"],
+ "TITLE": user_conf["general"]["title"][
+ :45
+ ], # keep only 45 first characters
+ "DESCRIPTION": user_conf["general"]["desc"][
+ :1000
+ ], # limit to 1K characters
+ "BASE_DOMAIN": base_domain,
+ "HOSTNAME": job_uuid,
+ "DOCKER_IMAGE": user_conf["general"]["docker_image"],
+ "DOCKER_TAG": user_conf["general"]["docker_tag"],
+ "SERVICE": user_conf["general"]["service"],
+ "CPU_NUM": user_conf["hardware"]["cpu_num"],
+ "RAM": user_conf["hardware"]["ram"],
+ "DISK": user_conf["hardware"]["disk"],
+ "SHARED_MEMORY": user_conf["hardware"]["ram"] * 10**6 * 0.5,
# Limit at 50% of RAM memory, in bytes
- 'GPU_NUM': user_conf['hardware']['gpu_num'],
- 'GPU_MODELNAME': user_conf['hardware']['gpu_type'],
- 'JUPYTER_PASSWORD': user_conf['general']['jupyter_password'],
- 'RCLONE_CONFIG_RSHARE_URL': user_conf['storage']['rclone_url'],
- 'RCLONE_CONFIG_RSHARE_VENDOR': user_conf['storage']['rclone_vendor'],
- 'RCLONE_CONFIG_RSHARE_USER': user_conf['storage']['rclone_user'],
- 'RCLONE_CONFIG_RSHARE_PASS': user_conf['storage']['rclone_password'],
- 'RCLONE_CONFIG': user_conf['storage']['rclone_conf'],
- 'MAILING_TOKEN': os.getenv("MAILING_TOKEN", default=""),
- 'PROJECT_NAME': papiconf.MAIN_CONF['nomad']['namespaces'][vo].upper(),
- 'TODAY': str(datetime.date.today()),
+ "GPU_NUM": user_conf["hardware"]["gpu_num"],
+ "GPU_MODELNAME": user_conf["hardware"]["gpu_type"],
+ "JUPYTER_PASSWORD": user_conf["general"]["jupyter_password"],
+ "RCLONE_CONFIG_RSHARE_URL": user_conf["storage"]["rclone_url"],
+ "RCLONE_CONFIG_RSHARE_VENDOR": user_conf["storage"]["rclone_vendor"],
+ "RCLONE_CONFIG_RSHARE_USER": user_conf["storage"]["rclone_user"],
+ "RCLONE_CONFIG_RSHARE_PASS": user_conf["storage"]["rclone_password"],
+ "RCLONE_CONFIG": user_conf["storage"]["rclone_conf"],
+ "MAILING_TOKEN": os.getenv("MAILING_TOKEN", default=""),
+ "PROJECT_NAME": papiconf.MAIN_CONF["nomad"]["namespaces"][vo].upper(),
+ "TODAY": str(datetime.date.today()),
}
)
# Convert template to Nomad conf
nomad_conf = nomad.load_job_conf(nomad_conf)
- tasks = nomad_conf['TaskGroups'][0]['Tasks']
- usertask = [t for t in tasks if t['Name']=='main'][0]
+ tasks = nomad_conf["TaskGroups"][0]["Tasks"]
+ usertask = [t for t in tasks if t["Name"] == "main"][0]
# Apply patches if needed
usertask = module_patches.patch_nextcloud_mount(
- user_conf['general']['docker_image'],
- usertask
+ user_conf["general"]["docker_image"], usertask
)
# Modify the GPU section
- if user_conf['hardware']['gpu_num'] <= 0:
+ if user_conf["hardware"]["gpu_num"] <= 0:
# Delete GPU section in CPU deployments
- usertask['Resources']['Devices'] = None
+ usertask["Resources"]["Devices"] = None
else:
# If gpu_type not provided, remove constraint to GPU model
- if not user_conf['hardware']['gpu_type']:
- usertask['Resources']['Devices'][0]['Constraints'] = None
+ if not user_conf["hardware"]["gpu_type"]:
+ usertask["Resources"]["Devices"][0]["Constraints"] = None
# If the image belongs to Harbor, then it's a user snapshot
- docker_image = user_conf['general']['docker_image']
- if docker_image.split('/')[0] == "registry.services.ai4os.eu":
-
+ docker_image = user_conf["general"]["docker_image"]
+ if docker_image.split("/")[0] == "registry.services.ai4os.eu":
# Check the user is the owner of the image
- if docker_image.split('/')[-1] != auth_info['id'].replace('@', '_at_'):
+ if docker_image.split("/")[-1] != auth_info["id"].replace("@", "_at_"):
raise HTTPException(
status_code=401,
detail="You are not the owner of the Harbor image.",
- )
+ )
# Check the snapshot indeed exists
user_snapshots = v1.snapshots.get_harbor_snapshots(
- owner=auth_info['id'],
+ owner=auth_info["id"],
vo=vo,
)
- snapshot_ids = [s['snapshot_ID'] for s in user_snapshots]
- if user_conf['general']['docker_tag'] not in snapshot_ids:
+ snapshot_ids = [s["snapshot_ID"] for s in user_snapshots]
+ if user_conf["general"]["docker_tag"] not in snapshot_ids:
raise HTTPException(
status_code=400,
detail="The snapshot does not exist.",
- )
+ )
# Add Harbor authentication credentials to Nomad job
- usertask['Config']['auth'] = [{
- 'username': papiconf.HARBOR_USER,
- 'password': papiconf.HARBOR_PASS,
- }]
+ usertask["Config"]["auth"] = [
+ {
+ "username": papiconf.HARBOR_USER,
+ "password": papiconf.HARBOR_PASS,
+ }
+ ]
# If storage credentials not provided, remove all storage-related tasks
- rclone = {k: v for k, v in user_conf['storage'].items() if k.startswith('rclone')}
+ rclone = {k: v for k, v in user_conf["storage"].items() if k.startswith("rclone")}
if not all(rclone.values()):
- exclude_tasks = ['storage_mount', 'storage_cleanup', 'dataset_download']
+ exclude_tasks = ["storage_mount", "storage_cleanup", "dataset_download"]
else:
# If datasets provided, replicate 'dataset_download' task as many times as needed
- if user_conf['storage']['datasets']:
- download_task = [t for t in tasks if t['Name'] == 'dataset_download'][0]
- for i, dataset in enumerate(user_conf['storage']['datasets']):
+ if user_conf["storage"]["datasets"]:
+ download_task = [t for t in tasks if t["Name"] == "dataset_download"][0]
+ for i, dataset in enumerate(user_conf["storage"]["datasets"]):
t = deepcopy(download_task)
- t['Env']['DOI'] = dataset['doi']
- t['Env']['FORCE_PULL'] = dataset['doi']
- t['Name'] = f'dataset_download_{i}'
+ t["Env"]["DOI"] = dataset["doi"]
+ t["Env"]["FORCE_PULL"] = dataset["doi"]
+ t["Name"] = f"dataset_download_{i}"
tasks.append(t)
# Always exclude initial 'dataset_download' task, as it is used as template
- exclude_tasks = ['dataset_download']
+ exclude_tasks = ["dataset_download"]
# If DEEPaaS was not launched, do not launch UI because it will fail
- if user_conf['general']['service'] != 'deepaas':
- exclude_tasks.append('ui')
+ if user_conf["general"]["service"] != "deepaas":
+ exclude_tasks.append("ui")
- tasks[:] = [t for t in tasks if t['Name'] not in exclude_tasks]
+ tasks[:] = [t for t in tasks if t["Name"] not in exclude_tasks]
# Remove appropriate Traefik domains in each case (no need to remove the ports)
- services = nomad_conf['TaskGroups'][0]['Services']
- if user_conf['general']['service'] == 'deepaas':
- exclude_services = ['ide']
+ services = nomad_conf["TaskGroups"][0]["Services"]
+ if user_conf["general"]["service"] == "deepaas":
+ exclude_services = ["ide"]
else:
- exclude_services = ['ui']
- services[:] = [s for s in services if s['PortLabel'] not in exclude_services]
+ exclude_services = ["ui"]
+ services[:] = [s for s in services if s["PortLabel"] not in exclude_services]
# Submit job
r = nomad.create_deployment(nomad_conf)
@@ -344,7 +347,7 @@ def delete_deployment(
vo: str,
deployment_uuid: str,
authorization=Depends(security),
- ):
+):
"""
Delete a deployment. Users can only delete their own deployments.
@@ -356,13 +359,13 @@ def delete_deployment(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Delete deployment
r = nomad.delete_deployment(
deployment_uuid=deployment_uuid,
- namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- owner=auth_info['id'],
+ namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ owner=auth_info["id"],
)
return r
diff --git a/ai4papi/routers/v1/deployments/tools.py b/ai4papi/routers/v1/deployments/tools.py
index 49cf252..2c7fb85 100644
--- a/ai4papi/routers/v1/deployments/tools.py
+++ b/ai4papi/routers/v1/deployments/tools.py
@@ -1,5 +1,4 @@
from copy import deepcopy
-from datetime import datetime
import re
import secrets
import types
@@ -30,7 +29,7 @@ def get_deployments(
vos: Union[Tuple, None] = Query(default=None),
full_info: bool = Query(default=False),
authorization=Depends(security),
- ):
+):
"""
Returns a list of all deployments belonging to a user.
@@ -47,21 +46,21 @@ def get_deployments(
# If no VOs, then retrieve jobs from all user VOs
# Always remove VOs that do not belong to the project
if not vos:
- vos = auth_info['vos']
- vos = set(vos).intersection(set(papiconf.MAIN_CONF['auth']['VO']))
+ vos = auth_info["vos"]
+ vos = set(vos).intersection(set(papiconf.MAIN_CONF["auth"]["VO"]))
if not vos:
raise HTTPException(
status_code=401,
- detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}."
- )
+ detail=f"The provided Virtual Organizations do not match with any of your available VOs: {auth_info['vos']}.",
+ )
user_jobs = []
for vo in vos:
# Retrieve all jobs in namespace
jobs = nomad.get_deployments(
- namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- owner=auth_info['id'],
- prefix='tool',
+ namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ owner=auth_info["id"],
+ prefix="tool",
)
# Retrieve info for jobs in namespace
@@ -69,7 +68,7 @@ def get_deployments(
try:
job_info = get_deployment(
vo=vo,
- deployment_uuid=j['ID'],
+ deployment_uuid=j["ID"],
full_info=full_info,
authorization=types.SimpleNamespace(
credentials=authorization.credentials # token
@@ -78,12 +77,12 @@ def get_deployments(
except HTTPException: # not a tool
continue
except Exception as e: # unexpected error
- raise(e)
+ raise (e)
user_jobs.append(job_info)
# Sort deployments by creation date
- seq = [j['submit_time'] for j in user_jobs]
+ seq = [j["submit_time"] for j in user_jobs]
args = sorted(range(len(seq)), key=seq.__getitem__)[::-1]
sorted_jobs = [user_jobs[i] for i in args]
@@ -96,7 +95,7 @@ def get_deployment(
deployment_uuid: str,
full_info: bool = Query(default=True),
authorization=Depends(security),
- ):
+):
"""
Retrieve the info of a specific deployment.
Format outputs to a Nomad-independent format to be used by the Dashboard
@@ -111,39 +110,43 @@ def get_deployment(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Retrieve the associated namespace to that VO
- namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo]
+ namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo]
job = nomad.get_deployment(
deployment_uuid=deployment_uuid,
namespace=namespace,
- owner=auth_info['id'],
+ owner=auth_info["id"],
full_info=full_info,
)
# Check the deployment is indeed a tool
- if not job['name'].startswith('tool'):
+ if not job["name"].startswith("tool"):
raise HTTPException(
status_code=400,
detail="This deployment is not a tool.",
- )
+ )
# Add an additional field with the tool type
# We map name from Nomad job to tool ID
- match = re.search(r'tool-(.*?)-[a-f0-9-]{36}', job['name'])
- nomad_name = match.group(1) if match else ''
- tool_id = papiconf.tools_nomad2id.get(nomad_name, '')
- job['tool_name'] = tool_id
+ match = re.search(r"tool-(.*?)-[a-f0-9-]{36}", job["name"])
+ nomad_name = match.group(1) if match else ""
+ tool_id = papiconf.tools_nomad2id.get(nomad_name, "")
+ job["tool_name"] = tool_id
# Additional checks
- if tool_id == 'ai4os-cvat':
+ if tool_id == "ai4os-cvat":
# Remove useless endpoints (they all point to same url)
- ignore = ['server', 'grafana']
- job['endpoints'] = {k: v for k, v in job['endpoints'].items() if k not in ignore}
- if job['active_endpoints']:
- job['active_endpoints'] = [k for k in job['active_endpoints'] if k not in ignore]
+ ignore = ["server", "grafana"]
+ job["endpoints"] = {
+ k: v for k, v in job["endpoints"].items() if k not in ignore
+ }
+ if job["active_endpoints"]:
+ job["active_endpoints"] = [
+ k for k in job["active_endpoints"] if k not in ignore
+ ]
return job
@@ -154,7 +157,7 @@ def create_deployment(
tool_name: str,
conf: Union[dict, None] = None,
authorization=Depends(security),
- ):
+):
"""
Submit a deployment to Nomad.
@@ -180,18 +183,18 @@ def create_deployment(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Check tool_ID
if tool_name not in Tools_catalog.get_items().keys():
raise HTTPException(
status_code=400,
detail="This ID does not correspond to an available tool.",
- )
+ )
# Load tool configuration
- nomad_conf = deepcopy(papiconf.TOOLS[tool_name]['nomad'])
- user_conf = deepcopy(papiconf.TOOLS[tool_name]['user']['values'])
+ nomad_conf = deepcopy(papiconf.TOOLS[tool_name]["nomad"])
+ user_conf = deepcopy(papiconf.TOOLS[tool_name]["user"]["values"])
# Update values conf in case we received a submitted conf
if conf is not None:
@@ -205,7 +208,7 @@ def create_deployment(
# Check if the provided configuration is within the job quotas
# Skip this check with CVAT because it does not have a "hardware" section in the conf
- if tool_name not in ['ai4os-cvat']:
+ if tool_name not in ["ai4os-cvat"]:
quotas.check_jobwise(
conf=user_conf,
vo=vo,
@@ -215,21 +218,20 @@ def create_deployment(
job_uuid = uuid.uuid1()
# Jobs from tutorial users should have low priority (ie. can be displaced if needed)
- if vo == 'training.egi.eu':
+ if vo == "training.egi.eu":
priority = 25
else:
priority = 50
- base_domain = papiconf.MAIN_CONF['lb']['domain'][vo]
+ base_domain = papiconf.MAIN_CONF["lb"]["domain"][vo]
# Deploy a Federated server
- if tool_name == 'ai4os-federated-server':
-
+ if tool_name == "ai4os-federated-server":
# Create a default secret for the Federated Server
_ = ai4secrets.create_secret(
vo=vo,
secret_path=f"deployments/{job_uuid}/federated/default",
- secret_data={'token': secrets.token_hex()},
+ secret_data={"token": secrets.token_hex()},
authorization=SimpleNamespace(
credentials=authorization.credentials,
),
@@ -238,107 +240,124 @@ def create_deployment(
# Create a Vault token so that the deployment can access the Federated secret
vault_token = ai4secrets.create_vault_token(
jwt=authorization.credentials,
- issuer=auth_info['issuer'],
- ttl='365d', # 1 year expiration date
+ issuer=auth_info["issuer"],
+ ttl="365d", # 1 year expiration date
)
# Replace the Nomad job template
nomad_conf = nomad_conf.safe_substitute(
{
- 'JOB_UUID': job_uuid,
- 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- 'PRIORITY': priority,
- 'OWNER': auth_info['id'],
- 'OWNER_NAME': auth_info['name'],
- 'OWNER_EMAIL': auth_info['email'],
- 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters
- 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters
- 'BASE_DOMAIN': base_domain,
- 'HOSTNAME': job_uuid,
- 'DOCKER_IMAGE': user_conf['general']['docker_image'],
- 'DOCKER_TAG': user_conf['general']['docker_tag'],
- 'CPU_NUM': user_conf['hardware']['cpu_num'],
- 'RAM': user_conf['hardware']['ram'],
- 'DISK': user_conf['hardware']['disk'],
- 'SHARED_MEMORY': user_conf['hardware']['ram'] * 10**6 * 0.5,
+ "JOB_UUID": job_uuid,
+ "NAMESPACE": papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ "PRIORITY": priority,
+ "OWNER": auth_info["id"],
+ "OWNER_NAME": auth_info["name"],
+ "OWNER_EMAIL": auth_info["email"],
+ "TITLE": user_conf["general"]["title"][
+ :45
+ ], # keep only 45 first characters
+ "DESCRIPTION": user_conf["general"]["desc"][
+ :1000
+ ], # limit to 1K characters
+ "BASE_DOMAIN": base_domain,
+ "HOSTNAME": job_uuid,
+ "DOCKER_IMAGE": user_conf["general"]["docker_image"],
+ "DOCKER_TAG": user_conf["general"]["docker_tag"],
+ "CPU_NUM": user_conf["hardware"]["cpu_num"],
+ "RAM": user_conf["hardware"]["ram"],
+ "DISK": user_conf["hardware"]["disk"],
+ "SHARED_MEMORY": user_conf["hardware"]["ram"] * 10**6 * 0.5,
# Limit at 50% of RAM memory, in bytes
- 'JUPYTER_PASSWORD': user_conf['general']['jupyter_password'],
- 'VAULT_TOKEN': vault_token,
- 'FEDERATED_ROUNDS': user_conf['configuration']['rounds'],
- 'FEDERATED_METRIC': user_conf['configuration']['metric'],
- 'FEDERATED_MIN_FIT_CLIENTS': user_conf['configuration']['min_fit_clients'],
- 'FEDERATED_MIN_AVAILABLE_CLIENTS': user_conf['configuration']['min_available_clients'],
- 'FEDERATED_STRATEGY': user_conf['configuration']['strategy'],
- 'MU_FEDPROX': user_conf['configuration']['mu'],
- 'FEDAVGM_SERVER_FL' : user_conf['configuration']['fl'],
- 'FEDAVGM_SERVER_MOMENTUM': user_conf['configuration']['momentum'],
- 'DP': user_conf['configuration']['dp'],
- 'NOISE_MULT': user_conf['configuration']['noise_mult'],
- 'SAMPLED_CLIENTS': user_conf['configuration']['sampled_clients'],
- 'CLIP_NORM': user_conf['configuration']['clip_norm']
+ "JUPYTER_PASSWORD": user_conf["general"]["jupyter_password"],
+ "VAULT_TOKEN": vault_token,
+ "FEDERATED_ROUNDS": user_conf["configuration"]["rounds"],
+ "FEDERATED_METRIC": user_conf["configuration"]["metric"],
+ "FEDERATED_MIN_FIT_CLIENTS": user_conf["configuration"][
+ "min_fit_clients"
+ ],
+ "FEDERATED_MIN_AVAILABLE_CLIENTS": user_conf["configuration"][
+ "min_available_clients"
+ ],
+ "FEDERATED_STRATEGY": user_conf["configuration"]["strategy"],
+ "MU_FEDPROX": user_conf["configuration"]["mu"],
+ "FEDAVGM_SERVER_FL": user_conf["configuration"]["fl"],
+ "FEDAVGM_SERVER_MOMENTUM": user_conf["configuration"]["momentum"],
+ "DP": user_conf["configuration"]["dp"],
+ "NOISE_MULT": user_conf["configuration"]["noise_mult"],
+ "SAMPLED_CLIENTS": user_conf["configuration"]["sampled_clients"],
+ "CLIP_NORM": user_conf["configuration"]["clip_norm"],
}
)
# Convert template to Nomad conf
nomad_conf = nomad.load_job_conf(nomad_conf)
- tasks = nomad_conf['TaskGroups'][0]['Tasks']
- usertask = [t for t in tasks if t['Name']=='main'][0]
+ tasks = nomad_conf["TaskGroups"][0]["Tasks"]
+ usertask = [t for t in tasks if t["Name"] == "main"][0]
# Launch `deep-start` compatible service if needed
- service = user_conf['general']['service']
- if service in ['deepaas', 'jupyter', 'vscode']:
- usertask['Config']['command'] = 'deep-start'
- usertask['Config']['args'] = [f'--{service}']
+ service = user_conf["general"]["service"]
+ if service in ["deepaas", "jupyter", "vscode"]:
+ usertask["Config"]["command"] = "deep-start"
+ usertask["Config"]["args"] = [f"--{service}"]
# Deploy a CVAT tool
- elif tool_name == 'ai4os-cvat':
-
+ elif tool_name == "ai4os-cvat":
# Enforce defining CVAT username and password
- cvat = {k: v for k, v in user_conf['general'].items() if k in ['cvat_username', 'cvat_password']}
+ cvat = {
+ k: v
+ for k, v in user_conf["general"].items()
+ if k in ["cvat_username", "cvat_password"]
+ }
if not all(cvat.values()):
raise HTTPException(
status_code=400,
detail="You must fill all CVAT-related variables.",
- )
+ )
# Enforce all rclone vars are defined
- rclone = {k: v for k, v in user_conf['storage'].items() if k.startswith('rclone')}
+ rclone = {
+ k: v for k, v in user_conf["storage"].items() if k.startswith("rclone")
+ }
if not all(rclone.values()):
raise HTTPException(
status_code=400,
detail="You must fill all RCLONE-related variables.",
- )
+ )
# Replace the Nomad job template
job_title = re.sub(
r'[<>:"/\\|?* ]',
- '_',
- user_conf['general']['title'][:45],
- ) # make title foldername-friendly
+ "_",
+ user_conf["general"]["title"][:45],
+ ) # make title foldername-friendly
nomad_conf = nomad_conf.safe_substitute(
{
- 'JOB_UUID': job_uuid,
- 'NAMESPACE': papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- 'PRIORITY': priority,
- 'OWNER': auth_info['id'],
- 'OWNER_NAME': auth_info['name'],
- 'OWNER_EMAIL': auth_info['email'],
- 'TITLE': user_conf['general']['title'][:45], # keep only 45 first characters
- 'DESCRIPTION': user_conf['general']['desc'][:1000], # limit to 1K characters
- 'BASE_DOMAIN': base_domain,
- 'HOSTNAME': job_uuid,
- 'CVAT_USERNAME': user_conf['general']['cvat_username'],
- 'CVAT_PASSWORD': user_conf['general']['cvat_password'],
- 'RESTORE_FROM': user_conf['storage']['cvat_backup'],
- 'BACKUP_NAME': f'{job_title}',
- 'RCLONE_CONFIG_RSHARE_URL': user_conf['storage']['rclone_url'],
- 'RCLONE_CONFIG_RSHARE_VENDOR': user_conf['storage']['rclone_vendor'],
- 'RCLONE_CONFIG_RSHARE_USER': user_conf['storage']['rclone_user'],
- 'RCLONE_CONFIG_RSHARE_PASS': user_conf['storage']['rclone_password'],
- 'RCLONE_CONFIG': user_conf['storage']['rclone_conf'],
- }
+ "JOB_UUID": job_uuid,
+ "NAMESPACE": papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ "PRIORITY": priority,
+ "OWNER": auth_info["id"],
+ "OWNER_NAME": auth_info["name"],
+ "OWNER_EMAIL": auth_info["email"],
+ "TITLE": user_conf["general"]["title"][
+ :45
+ ], # keep only 45 first characters
+ "DESCRIPTION": user_conf["general"]["desc"][
+ :1000
+ ], # limit to 1K characters
+ "BASE_DOMAIN": base_domain,
+ "HOSTNAME": job_uuid,
+ "CVAT_USERNAME": user_conf["general"]["cvat_username"],
+ "CVAT_PASSWORD": user_conf["general"]["cvat_password"],
+ "RESTORE_FROM": user_conf["storage"]["cvat_backup"],
+ "BACKUP_NAME": f"{job_title}",
+ "RCLONE_CONFIG_RSHARE_URL": user_conf["storage"]["rclone_url"],
+ "RCLONE_CONFIG_RSHARE_VENDOR": user_conf["storage"]["rclone_vendor"],
+ "RCLONE_CONFIG_RSHARE_USER": user_conf["storage"]["rclone_user"],
+ "RCLONE_CONFIG_RSHARE_PASS": user_conf["storage"]["rclone_password"],
+ "RCLONE_CONFIG": user_conf["storage"]["rclone_conf"],
+ }
)
# Convert template to Nomad conf
@@ -355,7 +374,7 @@ def delete_deployment(
vo: str,
deployment_uuid: str,
authorization=Depends(security),
- ):
+):
"""
Delete a deployment. Users can only delete their own deployments.
@@ -367,13 +386,13 @@ def delete_deployment(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Delete deployment
r = nomad.delete_deployment(
deployment_uuid=deployment_uuid,
- namespace=papiconf.MAIN_CONF['nomad']['namespaces'][vo],
- owner=auth_info['id'],
+ namespace=papiconf.MAIN_CONF["nomad"]["namespaces"][vo],
+ owner=auth_info["id"],
)
# Remove Vault secrets belonging to that deployment
diff --git a/ai4papi/routers/v1/inference/__init__.py b/ai4papi/routers/v1/inference/__init__.py
index c30d6e7..2c6eacc 100644
--- a/ai4papi/routers/v1/inference/__init__.py
+++ b/ai4papi/routers/v1/inference/__init__.py
@@ -6,5 +6,5 @@
router = fastapi.APIRouter()
router.include_router(
router=oscar.router,
- prefix='/inference',
- )
+ prefix="/inference",
+)
diff --git a/ai4papi/routers/v1/inference/oscar.py b/ai4papi/routers/v1/inference/oscar.py
index 527dd9d..d436925 100644
--- a/ai4papi/routers/v1/inference/oscar.py
+++ b/ai4papi/routers/v1/inference/oscar.py
@@ -1,6 +1,7 @@
"""
Manage OSCAR clusters to create and execute services.
"""
+
from copy import deepcopy
from datetime import datetime
from functools import wraps
@@ -25,15 +26,16 @@
responses={404: {"description": "Inference not found"}},
)
+
class Service(BaseModel):
image: str
cpu: NonNegativeInt = 2
memory: NonNegativeInt = 3000
allowed_users: List[str] = [] # no additional users by default
- title: str = ''
+ title: str = ""
# Not configurable
- _name: str = '' # filled by PAPI with UUID
+ _name: str = "" # filled by PAPI with UUID
model_config = {
"json_schema_extra": {
@@ -43,12 +45,13 @@ class Service(BaseModel):
"image": "deephdc/deep-oc-image-classification-tf",
"cpu": 2,
"memory": 3000,
- "allowed_users": []
+ "allowed_users": [],
}
]
}
}
+
security = HTTPBearer()
@@ -56,9 +59,9 @@ def raise_for_status(func):
"""
Raise HTML error if the response of OSCAR functions has status!=2**.
"""
+
@wraps(func)
def wrapper(*args, **kwargs):
-
# Catch first errors happening internally
try:
r = func(*args, **kwargs)
@@ -66,12 +69,12 @@ def wrapper(*args, **kwargs):
raise HTTPException(
status_code=400,
detail=e,
- )
+ )
except requests.exceptions.HTTPError as e:
raise HTTPException(
status_code=500,
detail=e,
- )
+ )
# Catch errors when the function itself does not raise errors but the response
# has a non-successful code
@@ -81,7 +84,7 @@ def wrapper(*args, **kwargs):
raise HTTPException(
status_code=r.status_code,
detail=r.text,
- )
+ )
return wrapper
@@ -91,11 +94,11 @@ def get_client_from_auth(token, vo):
Retrieve authenticated user info and init OSCAR client.
"""
client_options = {
- 'cluster_id': MAIN_CONF["oscar"]["clusters"][vo]['cluster_id'],
- 'endpoint': MAIN_CONF["oscar"]["clusters"][vo]['endpoint'],
- 'oidc_token': token,
- 'ssl': 'true',
- }
+ "cluster_id": MAIN_CONF["oscar"]["clusters"][vo]["cluster_id"],
+ "endpoint": MAIN_CONF["oscar"]["clusters"][vo]["endpoint"],
+ "oidc_token": token,
+ "ssl": "true",
+ }
try:
client = Client(client_options)
@@ -115,22 +118,21 @@ def get_client_from_auth(token, vo):
def make_service_definition(svc_conf, vo):
-
# Create service definition
service = deepcopy(OSCAR_TMPL) # init from template
service = service.safe_substitute(
{
- 'CLUSTER_ID': MAIN_CONF["oscar"]["clusters"][vo]["cluster_id"],
- 'NAME': svc_conf._name,
- 'IMAGE': svc_conf.image,
- 'CPU': svc_conf.cpu,
- 'MEMORY': svc_conf.memory,
- 'ALLOWED_USERS': svc_conf.allowed_users,
- 'VO': vo,
- 'ENV_VARS': {
- 'Variables':{
- 'PAPI_TITLE': svc_conf.title,
- 'PAPI_CREATED': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+ "CLUSTER_ID": MAIN_CONF["oscar"]["clusters"][vo]["cluster_id"],
+ "NAME": svc_conf._name,
+ "IMAGE": svc_conf.image,
+ "CPU": svc_conf.cpu,
+ "MEMORY": svc_conf.memory,
+ "ALLOWED_USERS": svc_conf.allowed_users,
+ "VO": vo,
+ "ENV_VARS": {
+ "Variables": {
+ "PAPI_TITLE": svc_conf.title,
+ "PAPI_CREATED": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
},
},
}
@@ -144,14 +146,14 @@ def make_service_definition(svc_conf, vo):
def get_cluster_info(
vo: str,
authorization=Depends(security),
- ):
+):
"""
Gets information about the cluster.
- Returns a JSON with the cluster information.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Get cluster info
client = get_client_from_auth(authorization.credentials, vo)
@@ -165,7 +167,7 @@ def get_services_list(
vo: str,
public: bool = Query(default=False),
authorization=Depends(security),
- ):
+):
"""
Retrieves a list of all the deployed services of the cluster.
@@ -177,7 +179,7 @@ def get_services_list(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Get services list
client = get_client_from_auth(authorization.credentials, vo)
@@ -186,27 +188,26 @@ def get_services_list(
# Filter services
services = []
for s in json.loads(r.text):
-
# Filter out public services, if requested
- if not (s.get('allowed_users', None) or public):
+ if not (s.get("allowed_users", None) or public):
continue
# Retrieve only services launched by PAPI
- if not s.get('name', '').startswith('ai4papi-'):
+ if not s.get("name", "").startswith("ai4papi-"):
continue
# Keep only services that belong to vo
- if vo not in s.get('vo', []):
+ if vo not in s.get("vo", []):
continue
# Add service endpoint
cluster_endpoint = MAIN_CONF["oscar"]["clusters"][vo]["endpoint"]
- s['endpoint'] = f"{cluster_endpoint}/run/{s['name']}"
+ s["endpoint"] = f"{cluster_endpoint}/run/{s['name']}"
services.append(s)
# Sort services by creation time, recent to old
- dates = [s['environment']['Variables']['PAPI_CREATED'] for s in services]
+ dates = [s["environment"]["Variables"]["PAPI_CREATED"] for s in services]
idxs = sorted(range(len(dates)), key=dates.__getitem__) # argsort
sorted_services = [services[i] for i in idxs[::-1]]
@@ -218,14 +219,14 @@ def get_service(
vo: str,
service_name: str,
authorization=Depends(security),
- ):
+):
"""
Retrieves a specific service.
- Returns a JSON with the cluster information.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Get service
client = get_client_from_auth(authorization.credentials, vo)
@@ -234,7 +235,7 @@ def get_service(
# Add service endpoint
cluster_endpoint = MAIN_CONF["oscar"]["clusters"][vo]["endpoint"]
- service['endpoint'] = f"{cluster_endpoint}/run/{service_name}"
+ service["endpoint"] = f"{cluster_endpoint}/run/{service_name}"
return service
@@ -244,25 +245,25 @@ def create_service(
vo: str,
svc_conf: Service,
authorization=Depends(security),
- ):
+):
"""
Creates a new inference service for an AI pre-trained model on a specific cluster.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Assign random UUID to service to avoid clashes
# We clip it because OSCAR only seems to support names smaller than 39 characters
- svc_conf._name = f'ai4papi-{uuid.uuid1()}'[:39]
+ svc_conf._name = f"ai4papi-{uuid.uuid1()}"[:39]
# Create service definition
service_definition = make_service_definition(svc_conf, vo)
- service_definition['allowed_users'] += [auth_info['id']] # add service owner
+ service_definition["allowed_users"] += [auth_info["id"]] # add service owner
# Update service
client = get_client_from_auth(authorization.credentials, vo)
- r = client.create_service(service_definition)
+ _ = client.create_service(service_definition)
return svc_conf._name
@@ -273,23 +274,23 @@ def update_service(
service_name: str,
svc_conf: Service,
authorization=Depends(security),
- ):
+):
"""
Updates service if it exists.
The method needs all service parameters to be on the request.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Create service definition
svc_conf._name = service_name
service_definition = make_service_definition(svc_conf, vo)
- service_definition['allowed_users'] += [auth_info['id']] # add service owner
+ service_definition["allowed_users"] += [auth_info["id"]] # add service owner
# Update service
client = get_client_from_auth(authorization.credentials, vo)
- r = client.update_service(svc_conf._name, service_definition)
+ _ = client.update_service(svc_conf._name, service_definition)
return service_name
@@ -299,17 +300,17 @@ def delete_service(
vo: str,
service_name: str,
authorization=Depends(security),
- ):
+):
"""
Delete a specific service.
Raises 500 if the service does not exist.
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Delete service
client = get_client_from_auth(authorization.credentials, vo)
- r = client.remove_service(service_name)
+ _ = client.remove_service(service_name)
return service_name
diff --git a/ai4papi/routers/v1/secrets.py b/ai4papi/routers/v1/secrets.py
index 47c79c4..30f40e3 100644
--- a/ai4papi/routers/v1/secrets.py
+++ b/ai4papi/routers/v1/secrets.py
@@ -29,22 +29,22 @@ def vault_client(jwt, issuer):
Common init steps of Vault client
"""
# Check we are using EGI Check-In prod
- if issuer != 'https://aai.egi.eu/auth/realms/egi':
+ if issuer != "https://aai.egi.eu/auth/realms/egi":
raise HTTPException(
status_code=400,
- detail="Secrets are only compatible with EGI Check-In Production OIDC " \
- "provider.",
- )
+ detail="Secrets are only compatible with EGI Check-In Production OIDC "
+ "provider.",
+ )
# Init the Vault client
client = hvac.Client(
url=VAULT_ADDR,
- )
+ )
client.auth.jwt.jwt_login(
role=VAULT_ROLE,
jwt=jwt,
path=VAULT_AUTH_PATH,
- )
+ )
return client
@@ -52,8 +52,8 @@ def vault_client(jwt, issuer):
def create_vault_token(
jwt,
issuer,
- ttl='1h',
- ):
+ ttl="1h",
+):
"""
Create a Vault token from a JWT.
@@ -70,7 +70,7 @@ def create_vault_token(
# So instead of creating a child token, we have to *extend* login token.
client.auth.token.renew_self(increment=ttl)
- #TODO: for extra security we should only allow reading/listing from a given subpath.
+ # TODO: for extra security we should only allow reading/listing from a given subpath.
# - Restrict to read/list can be done with user roles
# - Restricting subpaths might not be done because policies are static (and
# deployment paths are dynamic). In addition, only admins can create policies.
@@ -86,12 +86,11 @@ def recursive_path_builder(client, kv_list):
# if any list items end in '/' return 1
for li in kv_list[:]:
- if li[-1] == '/':
+ if li[-1] == "/":
r = client.secrets.kv.v1.list_secrets(
- path=li,
- mount_point=VAULT_MOUNT_POINT
+ path=li, mount_point=VAULT_MOUNT_POINT
)
- append_list = r['data']['keys']
+ append_list = r["data"]["keys"]
for new_item in append_list:
kv_list.append(li + new_item)
# remove list item ending in '/'
@@ -108,9 +107,9 @@ def recursive_path_builder(client, kv_list):
@router.get("")
def get_secrets(
vo: str,
- subpath: str = '',
+ subpath: str = "",
authorization=Depends(security),
- ):
+):
"""
Returns a list of secrets belonging to a user.
@@ -123,28 +122,27 @@ def get_secrets(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Init the Vault client
client = vault_client(
jwt=authorization.credentials,
- issuer=auth_info['issuer'],
+ issuer=auth_info["issuer"],
)
# Check subpath syntax
- if not subpath.startswith('/'):
- subpath = '/' + subpath
- if not subpath.endswith('/'):
- subpath += '/'
+ if not subpath.startswith("/"):
+ subpath = "/" + subpath
+ if not subpath.endswith("/"):
+ subpath += "/"
# Retrieve initial level-0 secrets
user_path = f"users/{auth_info['id']}/{vo}"
try:
r = client.secrets.kv.v1.list_secrets(
- path = user_path + subpath,
- mount_point=VAULT_MOUNT_POINT
+ path=user_path + subpath, mount_point=VAULT_MOUNT_POINT
)
- seed_list = r['data']['keys']
+ seed_list = r["data"]["keys"]
except hvac.exceptions.InvalidPath:
# InvalidPath is raised when there are no secrets available
return {}
@@ -163,8 +161,8 @@ def get_secrets(
)
# Remove user-path prefix and save
- secret_path = secret_path.replace(user_path, '')
- out[secret_path] = r1['data']
+ secret_path = secret_path.replace(user_path, "")
+ out[secret_path] = r1["data"]
return out
@@ -175,7 +173,7 @@ def create_secret(
secret_path: str,
secret_data: dict,
authorization=Depends(security),
- ):
+):
"""
Creates a new secret or updates an existing one.
@@ -191,22 +189,22 @@ def create_secret(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Init the Vault client
client = vault_client(
jwt=authorization.credentials,
- issuer=auth_info['issuer'],
+ issuer=auth_info["issuer"],
)
# Create secret
client.secrets.kv.v1.create_or_update_secret(
path=f"users/{auth_info['id']}/{vo}/{secret_path}",
- mount_point='/secrets/',
+ mount_point="/secrets/",
secret=secret_data,
)
- return {'status': 'success'}
+ return {"status": "success"}
@router.delete("")
@@ -214,7 +212,7 @@ def delete_secret(
vo: str,
secret_path: str,
authorization=Depends(security),
- ):
+):
"""
Delete a secret.
@@ -227,12 +225,12 @@ def delete_secret(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Init the Vault client
client = vault_client(
jwt=authorization.credentials,
- issuer=auth_info['issuer'],
+ issuer=auth_info["issuer"],
)
# Delete secret
@@ -241,4 +239,4 @@ def delete_secret(
mount_point=VAULT_MOUNT_POINT,
)
- return {'status': 'success'}
+ return {"status": "success"}
diff --git a/ai4papi/routers/v1/snapshots.py b/ai4papi/routers/v1/snapshots.py
index f6a0920..f1943d9 100644
--- a/ai4papi/routers/v1/snapshots.py
+++ b/ai4papi/routers/v1/snapshots.py
@@ -160,7 +160,7 @@ def create_snapshot(
nomad_conf = nomad_common.load_job_conf(nomad_conf)
# Submit job
- r = nomad_common.create_deployment(nomad_conf)
+ _ = nomad_common.create_deployment(nomad_conf)
return {
"status": "success",
@@ -318,7 +318,6 @@ def get_nomad_snapshots(
# user_jobs = []
snapshots = []
for j in jobs:
-
# Get job to retrieve the metadata
job_info = Nomad.job.get_job(
id_=j["ID"],
@@ -355,7 +354,9 @@ def get_nomad_snapshots(
][::-1] # more recent first
# Retrieve tasks
- tasks = allocs[0]["TaskStates"] if allocs else {} # if no allocations, use empty dict
+ tasks = (
+ allocs[0]["TaskStates"] if allocs else {}
+ ) # if no allocations, use empty dict
tasks = tasks or {} # if None, use empty dict
client_status = allocs[0]["ClientStatus"] if allocs else None
diff --git a/ai4papi/routers/v1/stats/__init__.py b/ai4papi/routers/v1/stats/__init__.py
index df69733..54e786f 100644
--- a/ai4papi/routers/v1/stats/__init__.py
+++ b/ai4papi/routers/v1/stats/__init__.py
@@ -6,5 +6,5 @@
router = fastapi.APIRouter()
router.include_router(
router=deployments.router,
- prefix='/deployments',
- )
+ prefix="/deployments",
+)
diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py
index e399bc2..78c355e 100644
--- a/ai4papi/routers/v1/stats/deployments.py
+++ b/ai4papi/routers/v1/stats/deployments.py
@@ -29,63 +29,60 @@
main_dir = Path(__file__).resolve().parent
Nomad = nomad.Nomad()
-Nomad.job.get_allocations = types.MethodType(
- npatches.get_allocations,
- Nomad.job
-)
+Nomad.job.get_allocations = types.MethodType(npatches.get_allocations, Nomad.job)
cluster_stats = None
-@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+@cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60))
def load_stats(
namespace: str,
- ):
+):
"""
CSV reader and data filtering could be improved with Pandas, but that's a heavy
dependency, so we're keeping it like this for the moment.
"""
- main_dir = os.environ.get('ACCOUNTING_PTH', None)
+ main_dir = os.environ.get("ACCOUNTING_PTH", None)
if not main_dir:
raise HTTPException(
status_code=500,
detail="Deployments stats information not available (no env var).",
- )
+ )
# Load all stats files
stats = {}
- for name in ['full-agg', 'timeseries', 'users-agg']:
- pth = Path(main_dir) / 'summaries' / f'{namespace}-{name}.csv'
+ for name in ["full-agg", "timeseries", "users-agg"]:
+ pth = Path(main_dir) / "summaries" / f"{namespace}-{name}.csv"
if not pth.is_file():
raise HTTPException(
status_code=500,
detail="Deployments stats information not available (missing file).",
- )
+ )
- with open(pth, 'r') as f:
- reader = csv.DictReader(f, delimiter=';')
+ with open(pth, "r") as f:
+ reader = csv.DictReader(f, delimiter=";")
stats[name] = {k: [] for k in reader.fieldnames}
for row in reader:
for k, v in row.items():
- if k not in ['date', 'owner']:
- v= int(v)
+ if k not in ["date", "owner"]:
+ v = int(v)
stats[name][k].append(v)
# In VO timeseries, only return last three months
threshold = datetime.now() - timedelta(days=90)
threshold = str(threshold.date())
try:
- idx = [i > threshold for i in stats['timeseries']['date']].index(True)
+ idx = [i > threshold for i in stats["timeseries"]["date"]].index(True)
except Exception:
# If there are no data in the last 90 days, then return last 90 dates
idx = -90
- for k, v in stats['timeseries'].items():
- stats['timeseries'][k] = v[idx:]
+ for k, v in stats["timeseries"].items():
+ stats["timeseries"][k] = v[idx:]
# Namespace aggregates are not lists
- stats['full-agg'] = {k: v[0] for k, v in stats['full-agg'].items()}
+ stats["full-agg"] = {k: v[0] for k, v in stats["full-agg"].items()}
return stats
@@ -94,7 +91,7 @@ def load_stats(
def get_user_stats(
vo: str,
authorization=Depends(security),
- ):
+):
"""
Returns the following stats (per resource type):
* the time-series usage of that VO
@@ -107,10 +104,10 @@ def get_user_stats(
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Retrieve the associated namespace to that VO
- namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo]
+ namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo]
# Load proper namespace stats
full_stats = load_stats(namespace=namespace)
@@ -118,63 +115,66 @@ def get_user_stats(
# Keep only stats from the current user
user_stats = copy.deepcopy(full_stats)
try:
- idx = full_stats['users-agg']['owner'].index(auth_info['id'])
- user_stats['users-agg'] = {k: v[idx] for k, v in full_stats['users-agg'].items()}
+ idx = full_stats["users-agg"]["owner"].index(auth_info["id"])
+ user_stats["users-agg"] = {
+ k: v[idx] for k, v in full_stats["users-agg"].items()
+ }
except ValueError: # user has still no recorded stats
- user_stats['users-agg'] = None
+ user_stats["users-agg"] = None
return user_stats
def get_proper_allocation(allocs):
-
- # Reorder allocations based on recency
- dates = [a['CreateTime'] for a in allocs]
- allocs = [x for _, x in sorted(
+ # Reorder allocations based on recency
+ dates = [a["CreateTime"] for a in allocs]
+ allocs = [
+ x
+ for _, x in sorted(
zip(dates, allocs),
key=lambda pair: pair[0],
- )][::-1] # more recent first
-
- # Select the proper allocation
- statuses = [a['ClientStatus'] for a in allocs]
- if 'unknown' in statuses:
- # The node has lost connection. Avoid showing temporary reallocated job,
- # to avoid confusions when the original allocation is restored back again.
- idx = statuses.index('unknown')
- elif 'running' in statuses:
- # If an allocation is running, return that allocation
- # It happens that after a network cut, when the network is restored,
- # the temporary allocation created in the meantime (now with status
- # 'complete') is more recent than the original allocation that we
- # recovered (with status 'running'), so using only recency does not work.
- idx = statuses.index('running')
- else:
- # Return most recent allocation
- idx = 0
-
- return allocs[idx]['ID']
-
-
-@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+ )
+ ][::-1] # more recent first
+
+ # Select the proper allocation
+ statuses = [a["ClientStatus"] for a in allocs]
+ if "unknown" in statuses:
+ # The node has lost connection. Avoid showing temporary reallocated job,
+ # to avoid confusions when the original allocation is restored back again.
+ idx = statuses.index("unknown")
+ elif "running" in statuses:
+ # If an allocation is running, return that allocation
+ # It happens that after a network cut, when the network is restored,
+ # the temporary allocation created in the meantime (now with status
+ # 'complete') is more recent than the original allocation that we
+ # recovered (with status 'running'), so using only recency does not work.
+ idx = statuses.index("running")
+ else:
+ # Return most recent allocation
+ idx = 0
+
+ return allocs[idx]["ID"]
+
+
+@cached(cache=TTLCache(maxsize=1024, ttl=6 * 60 * 60))
def load_datacenters():
-
# Check if datacenter info file is available
- pth = papiconf.main_path.parent / 'var' / 'datacenters_info.csv'
+ pth = papiconf.main_path.parent / "var" / "datacenters_info.csv"
if not pth.is_file():
return {}
# Load datacenter info
datacenters = {}
- with open(pth, 'r') as f:
- reader = csv.DictReader(f, delimiter=';')
+ with open(pth, "r") as f:
+ reader = csv.DictReader(f, delimiter=";")
dc_keys = reader.fieldnames.copy()
- dc_keys.remove('name')
+ dc_keys.remove("name")
for row in reader:
for k, v in row.items():
- if k == 'name':
+ if k == "name":
name = v
datacenters[name] = {k: 0 for k in dc_keys}
- datacenters[name]['nodes'] = {}
+ datacenters[name]["nodes"] = {}
else:
datacenters[name][k] = float(v)
@@ -185,7 +185,7 @@ def load_datacenters():
@cached(cache=TTLCache(maxsize=1024, ttl=30))
def get_cluster_stats(
vo: str,
- ):
+):
"""
Returns the following stats of the nodes and the cluster (per resource type):
* the aggregated usage
@@ -201,45 +201,46 @@ def get_cluster_stats(
cluster_stats = get_cluster_stats_bg()
stats = copy.deepcopy(cluster_stats)
- namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo]
-
- for k, v in stats['datacenters'].copy().items():
+ namespace = papiconf.MAIN_CONF["nomad"]["namespaces"][vo]
+ for k, v in stats["datacenters"].copy().items():
# Filter out nodes that do not support the given VO
nodes = {}
- for n_id, n_stats in v['nodes'].items():
- if namespace in n_stats['namespaces']:
+ for n_id, n_stats in v["nodes"].items():
+ if namespace in n_stats["namespaces"]:
nodes[n_id] = n_stats
# Ignore datacenters with no nodes
if not nodes:
- del stats['datacenters'][k]
+ del stats["datacenters"][k]
else:
- stats['datacenters'][k]['nodes'] = nodes
+ stats["datacenters"][k]["nodes"] = nodes
# Compute cluster stats after node filtering is done
- for dc_stats in stats['datacenters'].values():
- for n_stats in dc_stats['nodes'].values():
+ for dc_stats in stats["datacenters"].values():
+ for n_stats in dc_stats["nodes"].values():
for k, v in n_stats.items():
-
# Ignore keys
- if k in ['name', 'namespaces', 'eligibility', 'status', 'tags']:
+ if k in ["name", "namespaces", "eligibility", "status", "tags"]:
continue
# Aggregate nested gpu_models dict
- elif k == 'gpu_models':
+ elif k == "gpu_models":
for k1, v1 in v.items():
- model_stats = stats['cluster']['gpu_models'].get(
+ model_stats = stats["cluster"]["gpu_models"].get(
k1,
- {'gpu_total': 0, 'gpu_used': 0,} # init value
+ {
+ "gpu_total": 0,
+ "gpu_used": 0,
+ }, # init value
)
for k2, v2 in v1.items():
model_stats[k2] += v2
- stats['cluster']['gpu_models'][k1] = model_stats
+ stats["cluster"]["gpu_models"][k1] = model_stats
# Aggregate other resources
else:
- stats['cluster'][k] += v
+ stats["cluster"][k] += v
return stats
@@ -253,132 +254,142 @@ def get_cluster_stats_bg():
"""
resources = [
- 'jobs_num',
- 'cpu_total',
- 'cpu_used',
- 'gpu_total',
- 'gpu_used',
- 'ram_total',
- 'ram_used',
- 'disk_total',
- 'disk_used',
+ "jobs_num",
+ "cpu_total",
+ "cpu_used",
+ "gpu_total",
+ "gpu_used",
+ "ram_total",
+ "ram_used",
+ "disk_total",
+ "disk_used",
]
- datacenters = load_datacenters() # available datacenters info
+ datacenters = load_datacenters() # available datacenters info
stats = {
- 'datacenters' : datacenters, # aggregated datacenter usage
- 'cluster': {k: 0 for k in resources}, # aggregated cluster usage
- }
- stats['cluster']['gpu_models'] = {}
+ "datacenters": datacenters, # aggregated datacenter usage
+ "cluster": {k: 0 for k in resources}, # aggregated cluster usage
+ }
+ stats["cluster"]["gpu_models"] = {}
# Load nodes
nodes = Nomad.nodes.get_nodes(resources=True)
- nodes_dc = {} # dict(node, datacenter)
+ nodes_dc = {} # dict(node, datacenter)
# Get total stats for each node
for n in nodes:
- node = Nomad.node.get_node(n['ID'])
+ node = Nomad.node.get_node(n["ID"])
n_stats = {k: 0 for k in resources}
- n_stats['name'] = node['Name']
- n_stats['eligibility'] = node['SchedulingEligibility']
- n_stats['cpu_total'] = int(node['Attributes']['cpu.numcores'])
- n_stats['ram_total'] = int(node['Attributes']['memory.totalbytes']) / 2**20
- n_stats['disk_total'] = int(node['Attributes']['unique.storage.bytestotal']) / 2**20
- n_stats['gpu_models'] = {}
- n_stats['namespaces'] = node['Meta'].get('namespace', '')
- n_stats['status'] = node['Meta'].get('status', '')
- n_stats['tags'] = node['Meta'].get('tags', '')
-
- if n['NodeResources']['Devices']:
- for devices in n['NodeResources']['Devices']:
- if devices['Type'] == 'gpu':
- n_stats['gpu_total'] += len(devices['Instances'])
+ n_stats["name"] = node["Name"]
+ n_stats["eligibility"] = node["SchedulingEligibility"]
+ n_stats["cpu_total"] = int(node["Attributes"]["cpu.numcores"])
+ n_stats["ram_total"] = int(node["Attributes"]["memory.totalbytes"]) / 2**20
+ n_stats["disk_total"] = (
+ int(node["Attributes"]["unique.storage.bytestotal"]) / 2**20
+ )
+ n_stats["gpu_models"] = {}
+ n_stats["namespaces"] = node["Meta"].get("namespace", "")
+ n_stats["status"] = node["Meta"].get("status", "")
+ n_stats["tags"] = node["Meta"].get("tags", "")
+
+ if n["NodeResources"]["Devices"]:
+ for devices in n["NodeResources"]["Devices"]:
+ if devices["Type"] == "gpu":
+ n_stats["gpu_total"] += len(devices["Instances"])
# Track stats per GPU model type
- if devices['Name'] not in n_stats['gpu_models'].keys():
- n_stats['gpu_models'][devices['Name']] = {'gpu_total': 0, 'gpu_used': 0}
+ if devices["Name"] not in n_stats["gpu_models"].keys():
+ n_stats["gpu_models"][devices["Name"]] = {
+ "gpu_total": 0,
+ "gpu_used": 0,
+ }
- n_stats['gpu_models'][devices['Name']]['gpu_total'] += len(devices['Instances'])
+ n_stats["gpu_models"][devices["Name"]]["gpu_total"] += len(
+ devices["Instances"]
+ )
# If datacenter is not in csv, load default info
- if n['Datacenter'] not in stats['datacenters']:
- stats['datacenters'][n['Datacenter']] = {'lat':0, 'lon':0, 'PUE':0, 'energy_quality':0, 'nodes':{}}
-
- stats['datacenters'][n['Datacenter']]['nodes'][n['ID']] = n_stats
- nodes_dc[n['ID']] = n['Datacenter']
+ if n["Datacenter"] not in stats["datacenters"]:
+ stats["datacenters"][n["Datacenter"]] = {
+ "lat": 0,
+ "lon": 0,
+ "PUE": 0,
+ "energy_quality": 0,
+ "nodes": {},
+ }
+
+ stats["datacenters"][n["Datacenter"]]["nodes"][n["ID"]] = n_stats
+ nodes_dc[n["ID"]] = n["Datacenter"]
# Get aggregated usage stats for each node
- namespaces = ['default'] + list(papiconf.MAIN_CONF['nomad']['namespaces'].values())
+ namespaces = ["default"] + list(papiconf.MAIN_CONF["nomad"]["namespaces"].values())
for namespace in namespaces:
jobs = Nomad.jobs.get_jobs(namespace=namespace, filter_='Status == "running"')
for j in jobs:
-
# Retrieve full job for meta
job = Nomad.job.get_job(
- id_=j['ID'],
+ id_=j["ID"],
namespace=namespace,
- )
+ )
allocs = Nomad.job.get_allocations(
- id_=job['ID'],
+ id_=job["ID"],
namespace=namespace,
- )
+ )
# Keep the proper allocation
- a = Nomad.allocation.get_allocation(
- get_proper_allocation(allocs)
- )
+ a = Nomad.allocation.get_allocation(get_proper_allocation(allocs))
# Add resources
- datacenter = nodes_dc[a['NodeID']]
- n_stats = stats['datacenters'][datacenter]['nodes'][a['NodeID']]
+ datacenter = nodes_dc[a["NodeID"]]
+ n_stats = stats["datacenters"][datacenter]["nodes"][a["NodeID"]]
- #TODO: we are ignoring resources consumed by other jobs
- if job['Name'].startswith('module') or job['Name'].startswith('tool'):
- n_stats['jobs_num'] += 1
+ # TODO: we are ignoring resources consumed by other jobs
+ if job["Name"].startswith("module") or job["Name"].startswith("tool"):
+ n_stats["jobs_num"] += 1
- #TODO: we are ignoring resources consumed by other tasks
- if 'main' in a['AllocatedResources']['Tasks']:
- res = a['AllocatedResources']['Tasks']['main']
+ # TODO: we are ignoring resources consumed by other tasks
+ if "main" in a["AllocatedResources"]["Tasks"]:
+ res = a["AllocatedResources"]["Tasks"]["main"]
# cpu
- if res['Cpu']['ReservedCores']:
- n_stats['cpu_used'] += len(res['Cpu']['ReservedCores'])
+ if res["Cpu"]["ReservedCores"]:
+ n_stats["cpu_used"] += len(res["Cpu"]["ReservedCores"])
# ram
- n_stats['ram_used'] += res['Memory']['MemoryMB']
+ n_stats["ram_used"] += res["Memory"]["MemoryMB"]
# disk
# Note: In theory we can get the total disk used in a node looking at the
# metadata (ie. "unique.storage.bytesfree"). But that gave us the disk that
# is actually used. But we are instead interested in the disk that is reserved
# by users (regardless of whether they are actually using it).
- n_stats['disk_used'] += a['AllocatedResources']['Shared']['DiskMB']
+ n_stats["disk_used"] += a["AllocatedResources"]["Shared"]["DiskMB"]
# gpu
- if res['Devices']:
- gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0]
- gpu_num = len(gpu['DeviceIDs']) if gpu else 0
+ if res["Devices"]:
+ gpu = [d for d in res["Devices"] if d["Type"] == "gpu"][0]
+ gpu_num = len(gpu["DeviceIDs"]) if gpu else 0
# Sometimes the node fails and GPUs are not detected [1].
# In that case, avoid counting that GPU in the stats.
# [1]: https://docs.ai4os.eu/en/latest/user/others/faq.html#my-gpu-just-disappeared-from-my-deployment
- if n_stats['gpu_models']:
- n_stats['gpu_used'] += gpu_num
- n_stats['gpu_models'][gpu['Name']]['gpu_used'] += gpu_num
+ if n_stats["gpu_models"]:
+ n_stats["gpu_used"] += gpu_num
+ n_stats["gpu_models"][gpu["Name"]]["gpu_used"] += gpu_num
else:
continue
# Keep ineligible nodes, but set (used=total) for all resources
# We don't remove the node altogether because jobs might still be running there
# and we want to show them in the stats
- for datacenter in stats['datacenters'].values():
- for n_stats in datacenter['nodes'].values():
- if n_stats['eligibility'] == 'ineligible':
- for r in ['cpu', 'gpu', 'ram', 'disk']:
- n_stats[f'{r}_total'] = n_stats[f'{r}_used']
- for g_stats in n_stats['gpu_models'].values():
- g_stats['gpu_total'] = n_stats['gpu_used']
+ for datacenter in stats["datacenters"].values():
+ for n_stats in datacenter["nodes"].values():
+ if n_stats["eligibility"] == "ineligible":
+ for r in ["cpu", "gpu", "ram", "disk"]:
+ n_stats[f"{r}_total"] = n_stats[f"{r}_used"]
+ for g_stats in n_stats["gpu_models"].values():
+ g_stats["gpu_total"] = n_stats["gpu_used"]
# Set the new shared variable
global cluster_stats
diff --git a/ai4papi/routers/v1/storage.py b/ai4papi/routers/v1/storage.py
index 23a2b12..a067ac2 100644
--- a/ai4papi/routers/v1/storage.py
+++ b/ai4papi/routers/v1/storage.py
@@ -25,9 +25,9 @@
def storage_ls(
vo: str,
storage_name: str,
- subpath: str = '',
+ subpath: str = "",
authorization=Depends(security),
- ):
+):
"""
Returns a list of files/folders inside a given subpath of the specified storage.
It uses RCLONE under the hood.
@@ -41,19 +41,19 @@ def storage_ls(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Retrieve storage credentials
if storage_name:
# Retrieve the rclone credentials
secrets = ai4secrets.get_secrets(
vo=vo,
- subpath='/services/storage/',
+ subpath="/services/storage/",
authorization=types.SimpleNamespace(
credentials=authorization.credentials,
),
)
- storage = secrets[f'/services/storage/{storage_name}']
+ storage = secrets[f"/services/storage/{storage_name}"]
if not storage:
raise HTTPException(
status_code=401,
@@ -61,21 +61,22 @@ def storage_ls(
)
# Use rclone to list content of subpath
- result = subprocess.run([
- f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && "
- f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && "
- "export RCLONE_CONFIG_RSHARE_TYPE=webdav && "
- f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && "
- f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && "
- "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && "
- f"rclone lsjson rshare:/{subpath} ;"
- "status=$? ;" # we want to return the status code of the rclone purge command
- "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;"
- "exit $status"
+ result = subprocess.run(
+ [
+ f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && "
+ f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && "
+ "export RCLONE_CONFIG_RSHARE_TYPE=webdav && "
+ f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && "
+ f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && "
+ "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && "
+ f"rclone lsjson rshare:/{subpath} ;"
+ "status=$? ;" # we want to return the status code of the rclone purge command
+ "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;"
+ "exit $status"
],
shell=True,
capture_output=True,
- text=True
+ text=True,
)
# Check for possible errors
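For context on the shell one-liner above: rclone can define a remote entirely from RCLONE_CONFIG_<REMOTE>_* environment variables, so the code builds a throw-away "rshare" remote, obscures the password with `rclone obscure`, runs `rclone lsjson`, and unsets the variables again. A rough, equivalent sketch that passes an environment dict to subprocess instead of exporting variables (credential values are placeholders):

import json
import os
import subprocess

storage = {
    "vendor": "nextcloud",
    "server": "https://share.example.org",
    "loginName": "someuser",
    "appPassword": "some-app-password",
}

# rclone expects the password in obscured form, not plain text
obscured = subprocess.run(
    ["rclone", "obscure", storage["appPassword"]],
    capture_output=True, text=True, check=True,
).stdout.strip()

rclone_env = {
    "RCLONE_CONFIG_RSHARE_TYPE": "webdav",
    "RCLONE_CONFIG_RSHARE_VENDOR": storage["vendor"],
    "RCLONE_CONFIG_RSHARE_URL": f"{storage['server']}/remote.php/dav/files/{storage['loginName']}",
    "RCLONE_CONFIG_RSHARE_USER": storage["loginName"],
    "RCLONE_CONFIG_RSHARE_PASS": obscured,
}

result = subprocess.run(
    ["rclone", "lsjson", "rshare:/ai4os-storage"],
    env={**os.environ, **rclone_env},  # merged copy: credentials never touch os.environ itself
    capture_output=True, text=True,
)
listing = json.loads(result.stdout) if result.returncode == 0 else []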
@@ -102,7 +103,7 @@ def storage_rm(
storage_name: str,
subpath: str,
authorization=Depends(security),
- ):
+):
"""
Deletes the files/folders inside a given subpath of the specified storage.
It uses RCLONE under the hood.
@@ -116,26 +117,26 @@ def storage_rm(
"""
# Retrieve authenticated user info
auth_info = auth.get_user_info(token=authorization.credentials)
- auth.check_vo_membership(vo, auth_info['vos'])
+ auth.check_vo_membership(vo, auth_info["vos"])
# Do not allow deleting the root folder, to prevent accidents
- if not subpath.strip('/'):
+ if not subpath.strip("/"):
raise HTTPException(
status_code=400,
detail="You cannot delete the root folder for security reasons.",
- )
+ )
# Retrieve storage credentials
if storage_name:
# Retrieve the rclone credentials
secrets = ai4secrets.get_secrets(
vo=vo,
- subpath='/services/storage/',
+ subpath="/services/storage/",
authorization=types.SimpleNamespace(
credentials=authorization.credentials,
),
)
- storage = secrets[f'/services/storage/{storage_name}']
+ storage = secrets[f"/services/storage/{storage_name}"]
if not storage:
raise HTTPException(
status_code=401,
@@ -143,21 +144,22 @@ def storage_rm(
)
# Use rclone to delete the subpath
- result = subprocess.run([
- f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && "
- f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && "
- "export RCLONE_CONFIG_RSHARE_TYPE=webdav && "
- f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && "
- f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && "
- "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && "
- f"rclone purge rshare:/{subpath} ;"
- "status=$? ;" # we want to return the status code of the rclone purge command
- "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;"
- "exit $status"
+ result = subprocess.run(
+ [
+ f"export RCLONE_CONFIG_RSHARE_VENDOR={storage['vendor']} && "
+ f"export RCLONE_CONFIG_RSHARE_URL={storage['server']}/remote.php/dav/files/{storage['loginName']} && "
+ "export RCLONE_CONFIG_RSHARE_TYPE=webdav && "
+ f"export RCLONE_CONFIG_RSHARE_USER={storage['loginName']} && "
+ f"export RCLONE_CONFIG_RSHARE_PASS={storage['appPassword']} && "
+ "export RCLONE_CONFIG_RSHARE_PASS=$(rclone obscure $RCLONE_CONFIG_RSHARE_PASS) && "
+ f"rclone purge rshare:/{subpath} ;"
+ "status=$? ;" # we want to return the status code of the rclone purge command
+ "for var in $(env | grep '^RCLONE_CONFIG_RSHARE_' | awk -F= '{print $1}'); do unset $var; done;"
+ "exit $status"
],
shell=True,
capture_output=True,
- text=True
+ text=True,
)
# Check for possible errors
@@ -167,4 +169,4 @@ def storage_rm(
detail=f"Error deleting the selected subpath from storage. \n\n {result.stderr}",
)
- return {'status': 'success'}
+ return {"status": "success"}
diff --git a/ai4papi/routers/v1/try_me/__init__.py b/ai4papi/routers/v1/try_me/__init__.py
index 18169cb..7c6da08 100644
--- a/ai4papi/routers/v1/try_me/__init__.py
+++ b/ai4papi/routers/v1/try_me/__init__.py
@@ -6,5 +6,5 @@
router = fastapi.APIRouter()
router.include_router(
router=nomad.router,
- prefix='/try_me',
- )
+ prefix="/try_me",
+)
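For readers less familiar with FastAPI: `include_router` nests routers under a prefix, so a handler registered on the empty path in the try_me sub-router is ultimately served under `/try_me`. A minimal, self-contained sketch (the handler name and payload are made up):

import fastapi

nomad_router = fastapi.APIRouter()

@nomad_router.get("")
def list_try_me_deployments():
    return []

router = fastapi.APIRouter()
router.include_router(router=nomad_router, prefix="/try_me")

app = fastapi.FastAPI()
app.include_router(router, prefix="/v1")  # handler above ends up at GET /v1/try_me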
diff --git a/ai4papi/routers/v1/try_me/nomad.py b/ai4papi/routers/v1/try_me/nomad.py
index 2d267fa..c79f41c 100644
--- a/ai4papi/routers/v1/try_me/nomad.py
+++ b/ai4papi/routers/v1/try_me/nomad.py
@@ -26,14 +26,14 @@
# (!) try-me jobs are always deployed in AI4EOSC
VO = "vo.ai4eosc.eu"
-NAMESPACE = papiconf.MAIN_CONF['nomad']['namespaces'][VO]
+NAMESPACE = papiconf.MAIN_CONF["nomad"]["namespaces"][VO]
@router.get("")
def get_deployments(
full_info: bool = Query(default=False),
authorization=Depends(security),
- ):
+):
"""
Returns a list of all deployments belonging to a user.
@@ -50,14 +50,14 @@ def get_deployments(
# Retrieve all jobs in namespace
jobs = nomad.get_deployments(
namespace=NAMESPACE,
- owner=auth_info['id'],
- prefix='try',
+ owner=auth_info["id"],
+ prefix="try",
)
user_jobs = []
for j in jobs:
try:
job_info = get_deployment(
- deployment_uuid=j['ID'],
+ deployment_uuid=j["ID"],
full_info=full_info,
authorization=types.SimpleNamespace(
credentials=authorization.credentials # token
@@ -66,12 +66,12 @@ def get_deployments(
except HTTPException: # not a try-me
continue
except Exception as e: # unexpected error
- raise(e)
+ raise e
user_jobs.append(job_info)
# Sort deployments by creation date
- seq = [j['submit_time'] for j in user_jobs]
+ seq = [j["submit_time"] for j in user_jobs]
args = sorted(range(len(seq)), key=seq.__getitem__)[::-1]
sorted_jobs = [user_jobs[i] for i in args]
@@ -83,7 +83,7 @@ def get_deployment(
deployment_uuid: str,
full_info: bool = Query(default=True),
authorization=Depends(security),
- ):
+):
"""
This function is used mainly to be able to retrieve the endpoint of the try_me job.
We cannot return the endpoint when creating the job, because the final endpoint will
@@ -100,12 +100,12 @@ def get_deployment(
job = nomad.get_deployment(
deployment_uuid=deployment_uuid,
namespace=NAMESPACE,
- owner=auth_info['id'],
+ owner=auth_info["id"],
full_info=full_info,
)
# Rewrite main endpoint, otherwise it automatically selects DEEPaaS API
- job['main_endpoint'] = 'ui'
+ job["main_endpoint"] = "ui"
return job
@@ -115,7 +115,7 @@ def create_deployment(
module_name: str,
title: str = Query(default=""),
authorization=Depends(security),
- ):
+):
"""
Submit a try-me deployment to Nomad.
The deployment will automatically kill itself after a short amount of time.
@@ -127,11 +127,11 @@ def create_deployment(
# Retrieve docker_image from module_name
meta = Modules.get_metadata(module_name)
- registry = meta['links']['docker_image']
- docker_image = '/'.join(registry.split('/')[-2:])
+ registry = meta["links"]["docker_image"]
+ docker_image = "/".join(registry.split("/")[-2:])
# Load module configuration
- nomad_conf = deepcopy(papiconf.TRY_ME['nomad'])
+ nomad_conf = deepcopy(papiconf.TRY_ME["nomad"])
# Generate UUID from (MAC address+timestamp) so it's unique
job_uuid = uuid.uuid1()
@@ -139,15 +139,15 @@ def create_deployment(
# Replace the Nomad job template
nomad_conf = nomad_conf.safe_substitute(
{
- 'JOB_UUID': job_uuid,
- 'NAMESPACE': NAMESPACE,
- 'TITLE': title[:45],
- 'OWNER': auth_info['id'],
- 'OWNER_NAME': auth_info['name'],
- 'OWNER_EMAIL': auth_info['email'],
- 'BASE_DOMAIN': papiconf.MAIN_CONF['lb']['domain'][VO],
- 'HOSTNAME': job_uuid,
- 'DOCKER_IMAGE': docker_image,
+ "JOB_UUID": job_uuid,
+ "NAMESPACE": NAMESPACE,
+ "TITLE": title[:45],
+ "OWNER": auth_info["id"],
+ "OWNER_NAME": auth_info["name"],
+ "OWNER_EMAIL": auth_info["email"],
+ "BASE_DOMAIN": papiconf.MAIN_CONF["lb"]["domain"][VO],
+ "HOSTNAME": job_uuid,
+ "DOCKER_IMAGE": docker_image,
}
)
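The `safe_substitute` call above suggests the job template behaves like a `string.Template`: `${...}` placeholders get filled in, and unknown placeholders are left untouched instead of raising a `KeyError`. A small sketch with a made-up HCL fragment:

from string import Template
import uuid

template = Template('job "try-${JOB_UUID}" { namespace = "${NAMESPACE}" meta { owner = "${OWNER}" } }')
rendered = template.safe_substitute(
    {
        "JOB_UUID": uuid.uuid1(),  # MAC address + timestamp, hence unique
        "NAMESPACE": "ai4eosc",
        # OWNER intentionally omitted: safe_substitute leaves ${OWNER} as-is
    }
)
print(rendered)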
@@ -158,39 +158,42 @@ def create_deployment(
# these jobs cannot be left queueing
# We check for every resource metric (cpu, disk, ram)
stats = get_cluster_stats(vo=VO)
- resources = ['cpu', 'ram', 'disk']
+ resources = ["cpu", "ram", "disk"]
keys = [f"{i}_used" for i in resources] + [f"{i}_total" for i in resources]
status = {k: 0 for k in keys}
- for _, datacenter in stats['datacenters'].items():
- for _, node in datacenter['nodes'].items():
- if 'tryme' in node['tags'] and node['status'] == 'ready':
+ for _, datacenter in stats["datacenters"].items():
+ for _, node in datacenter["nodes"].items():
+ if "tryme" in node["tags"] and node["status"] == "ready":
for k in keys:
status[k] += node[k]
for r in resources:
- if status[f"{r}_total"] == 0 or status[f"{r}_used"] / status[f"{r}_total"] > 0.85:
+ if (
+ status[f"{r}_total"] == 0
+ or status[f"{r}_used"] / status[f"{r}_total"] > 0.85
+ ):
# We cut off somewhat earlier than 100% because we are only accounting for the
# cores consumed by the "main" task, while the UI task also consumes resources.
raise HTTPException(
status_code=503,
- detail="Sorry, but there seem to be no resources available right " \
- "now to test the module. Please try later.",
- )
+ detail="Sorry, but there seem to be no resources available right "
+ "now to test the module. Please try later.",
+ )
# Check that the user doesn't have too many "try-me" jobs currently running
jobs = nomad.get_deployments(
namespace=NAMESPACE,
- owner=auth_info['id'],
+ owner=auth_info["id"],
prefix="try",
)
if len(jobs) >= 3:
raise HTTPException(
status_code=503,
- detail="Sorry, but you seem to be currently running 3 `try-me` environments already. " \
- "Before launching a new one, you will need to wait till one of your " \
- "existing environments gets automatically deleted (ca. 10 min) or delete it manually " \
- "in the Dashboard."
- )
+ detail="Sorry, but you seem to be currently running 3 `try-me` environments already. "
+ "Before launching a new one, you will need to wait till one of your "
+ "existing environments gets automatically deleted (ca. 10 min) or delete it manually "
+ "in the Dashboard.",
+ )
# Submit job
r = nomad.create_deployment(nomad_conf)
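To make the gate above easier to follow: usage is summed over all ready nodes tagged for try-me, and the request is rejected when any resource is missing or more than 85% consumed. A condensed sketch with a hard-coded node list standing in for the cluster stats (numbers are illustrative):

nodes = [
    {"tags": ["tryme"], "status": "ready",
     "cpu_used": 40, "cpu_total": 100,
     "ram_used": 30, "ram_total": 100,
     "disk_used": 20, "disk_total": 100},
]

resources = ["cpu", "ram", "disk"]
status = {f"{r}_{s}": 0 for r in resources for s in ("used", "total")}
for node in nodes:
    if "tryme" in node["tags"] and node["status"] == "ready":
        for k in status:
            status[k] += node[k]

for r in resources:
    used, total = status[f"{r}_used"], status[f"{r}_total"]
    # Reject below 100% on purpose: only the "main" task is accounted for,
    # while the UI task consumes resources too.
    if total == 0 or used / total > 0.85:
        raise RuntimeError(f"No '{r}' capacity left for try-me jobs")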
@@ -202,7 +205,7 @@ def create_deployment(
def delete_deployment(
deployment_uuid: str,
authorization=Depends(security),
- ):
+):
"""
Delete a deployment. Users can only delete their own deployments.
@@ -219,7 +222,7 @@ def delete_deployment(
r = nomad.delete_deployment(
deployment_uuid=deployment_uuid,
namespace=NAMESPACE,
- owner=auth_info['id'],
+ owner=auth_info["id"],
)
return r
diff --git a/ai4papi/utils.py b/ai4papi/utils.py
index 563ab20..2f2a46d 100644
--- a/ai4papi/utils.py
+++ b/ai4papi/utils.py
@@ -1,6 +1,7 @@
"""
Miscellaneous utils
"""
+
from datetime import datetime
import json
from pathlib import Path
@@ -18,7 +19,7 @@
session = requests.Session()
# Retrieve tokens for better rate limit
-github_token = os.environ.get('PAPI_GITHUB_TOKEN', None)
+github_token = os.environ.get("PAPI_GITHUB_TOKEN", None)
def update_values_conf(submitted, reference):
@@ -27,13 +28,11 @@ def update_values_conf(submitted, reference):
We also check that the submitted conf has the appropriate keys.
"""
for k in submitted.keys():
-
# Check level 1 keys
if k not in reference.keys():
raise HTTPException(
- status_code=400,
- detail=f"The key `{k}` in not a valid parameter."
- )
+ status_code=400, detail=f"The key `{k}` in not a valid parameter."
+ )
# Check level 2 keys
s1 = set(submitted[k].keys())
@@ -41,9 +40,8 @@ def update_values_conf(submitted, reference):
subs = s1.difference(s2)
if subs:
raise HTTPException(
- status_code=400,
- detail=f"The keys `{subs}` are not a valid parameters."
- )
+ status_code=400, detail=f"The keys `{subs}` are not a valid parameters."
+ )
# Update with user values
reference[k].update(submitted[k])
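A brief usage sketch of `update_values_conf` above: every submitted key must already exist in the reference configuration at both nesting levels, and the reference is then updated in place (keys and values below are invented):

reference = {"general": {"title": "", "docker_image": "ai4oshub/ai4os-demo-app"}}
submitted = {"general": {"title": "My deployment"}}

for k in submitted:
    assert k in reference, f"The key `{k}` is not a valid parameter."
    unknown = set(submitted[k]) - set(reference[k])
    assert not unknown, f"The keys `{unknown}` are not valid parameters."
    reference[k].update(submitted[k])

assert reference["general"]["title"] == "My deployment"
assert reference["general"]["docker_image"] == "ai4oshub/ai4os-demo-app"  # untouched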
@@ -57,42 +55,44 @@ def validate_conf(conf):
"""
# Check that the Dockerhub image belongs either to "deephdc" or "ai4oshub"
# or that it points to our Harbor instance (eg. CVAT)
- image = conf.get('general', {}).get('docker_image')
+ image = conf.get("general", {}).get("docker_image")
if image:
- if image.split('/')[0] not in ["deephdc", "ai4oshub", "registry.services.ai4os.eu"]:
+ if image.split("/")[0] not in [
+ "deephdc",
+ "ai4oshub",
+ "registry.services.ai4os.eu",
+ ]:
raise HTTPException(
status_code=400,
detail="The docker image should belong to either 'deephdc' or 'ai4oshub' \
- DockerHub organizations or be hosted in the project's Harbor."
- )
+ DockerHub organizations or be hosted in the project's Harbor.",
+ )
# Check datasets_info list
- datasets = conf.get('storage', {}).get('datasets')
+ datasets = conf.get("storage", {}).get("datasets")
if datasets:
for d in datasets:
-
# Validate DOI and URL
# ref: https://stackoverflow.com/a/48524047/18471590
doiPattern = r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$"
urlPattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
- if not (re.match(doiPattern, d['doi'], re.IGNORECASE) or re.match(urlPattern, d['doi'], re.IGNORECASE)):
- raise HTTPException(
- status_code=400,
- detail="Invalid DOI or URL."
- )
+ if not (
+ re.match(doiPattern, d["doi"], re.IGNORECASE)
+ or re.match(urlPattern, d["doi"], re.IGNORECASE)
+ ):
+ raise HTTPException(status_code=400, detail="Invalid DOI or URL.")
# Check force pull parameter
- if not isinstance(d['force_pull'], bool):
+ if not isinstance(d["force_pull"], bool):
raise HTTPException(
- status_code=400,
- detail="Force pull should be bool."
- )
+ status_code=400, detail="Force pull should be bool."
+ )
return conf
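A quick, standalone illustration of the dataset DOI/URL check above, reusing the same regex patterns (the sample values are made up):

import re

doiPattern = r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$"
urlPattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"

for candidate in ["10.5281/zenodo.1234567", "https://example.org/dataset.zip", "not-a-doi"]:
    valid = bool(
        re.match(doiPattern, candidate, re.IGNORECASE)
        or re.match(urlPattern, candidate, re.IGNORECASE)
    )
    print(f"{candidate!r}: {'valid' if valid else 'invalid'}")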
-#TODO: temporarily parse every 24hrs (instead of 6hrs) to reduce a bit the latency
-@cached(cache=TTLCache(maxsize=1024, ttl=24*60*60))
+# TODO: temporarily parse every 24hrs (instead of 6hrs) to reduce the latency a bit
+@cached(cache=TTLCache(maxsize=1024, ttl=24 * 60 * 60))
def get_github_info(owner, repo):
"""
Retrieve information from a Github repo
@@ -100,31 +100,39 @@ def get_github_info(owner, repo):
# Avoid running this function if we are doing local development, because
# repeatedly calling the Github API will otherwise get you blocked
if papiconf.IS_DEV:
- print('[info] Skipping Github API info fetching (development).')
+ print("[info] Skipping Github API info fetching (development).")
return {}
# Retrieve information from Github API
url = f"https://api.github.com/repos/{owner}/{repo}"
- headers = {'Authorization': f'token {github_token}'} if github_token else {}
+ headers = {"Authorization": f"token {github_token}"} if github_token else {}
r = session.get(url, headers=headers)
# Parse the information
out = {}
if r.ok:
repo_data = r.json()
- out['created'] = datetime.strptime(
- repo_data['created_at'],
- "%Y-%m-%dT%H:%M:%SZ",
- ).date().strftime("%Y-%m-%d") # keep only the date
- out['updated'] = datetime.strptime(
- repo_data['updated_at'],
- "%Y-%m-%dT%H:%M:%SZ",
- ).date().strftime("%Y-%m-%d")
- out['license'] = (repo_data['license'] or {}).get('spdx_id', '')
+ out["created"] = (
+ datetime.strptime(
+ repo_data["created_at"],
+ "%Y-%m-%dT%H:%M:%SZ",
+ )
+ .date()
+ .strftime("%Y-%m-%d")
+ ) # keep only the date
+ out["updated"] = (
+ datetime.strptime(
+ repo_data["updated_at"],
+ "%Y-%m-%dT%H:%M:%SZ",
+ )
+ .date()
+ .strftime("%Y-%m-%d")
+ )
+ out["license"] = (repo_data["license"] or {}).get("spdx_id", "")
# out['stars'] = repo_data['stargazers_count']
else:
msg = "API rate limit exceeded" if r.status_code == 403 else ""
- print(f' [Error] Failed to parse Github repo info: {msg}')
+ print(f" [Error] Failed to parse Github repo info: {msg}")
return out
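For reference, the caching pattern used above: `cachetools` memoizes the result per `(owner, repo)` pair and expires entries after the TTL, so GitHub is queried at most once per day per repository. A self-contained sketch with a fake fetch function:

from cachetools import cached, TTLCache

@cached(cache=TTLCache(maxsize=1024, ttl=24 * 60 * 60))
def get_repo_dates(owner: str, repo: str) -> dict:
    print(f"fetching {owner}/{repo}")  # only printed on cache misses
    return {"created": "2024-01-01", "updated": "2024-06-01"}

get_repo_dates("ai4os", "ai4-papi")  # cache miss: "fetches"
get_repo_dates("ai4os", "ai4-papi")  # cache hit: served from the TTL cache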
@@ -132,7 +140,7 @@ def get_github_info(owner, repo):
@cached(cache=LRUCache(maxsize=20))
def retrieve_from_snapshots(
deployment_uuid: str,
- ):
+):
"""
Retrieve the deployment info from Nomad periodic snapshots.
@@ -143,31 +151,31 @@ def retrieve_from_snapshots(
Anyway, not a big concern because this function is not meant to be called very
frequently and latency from reading JSONs is very small.
"""
- main_dir = os.environ.get('ACCOUNTING_PTH', None)
+ main_dir = os.environ.get("ACCOUNTING_PTH", None)
if not main_dir:
raise HTTPException(
status_code=500,
detail="Accounting repo with snapshots not available.",
- )
- snapshot_dir = Path(main_dir) / 'snapshots'
+ )
+ snapshot_dir = Path(main_dir) / "snapshots"
# Iterate over snapshots, from recent to old
- for snapshot_pth in sorted(snapshot_dir.glob('**/*.json'))[::-1]:
-
+ for snapshot_pth in sorted(snapshot_dir.glob("**/*.json"))[::-1]:
# Load the snapshot
- with open(snapshot_pth, 'r') as f:
+ with open(snapshot_pth, "r") as f:
snapshot = json.load(f)
# Iterate over deployments until we find the correct one
for namespace, jobs in snapshot.items():
for job in jobs:
- if (job['job_ID'] == deployment_uuid) and (job['status'] == 'running'):
- job['namespace'] = namespace
- job['alloc_end'] = f'{snapshot_pth.stem}0000Z' # the end date is approximate (true value lies between this snapshot date and next one)
+ if (job["job_ID"] == deployment_uuid) and (job["status"] == "running"):
+ job["namespace"] = namespace
+ job["alloc_end"] = (
+ f"{snapshot_pth.stem}0000Z" # the end date is approximate (true value lies between this snapshot date and next one)
+ )
return job
# If no deployment found, show error
raise HTTPException(
- status_code=404,
- detail="Could not find the deployment in the database."
- )
+ status_code=404, detail="Could not find the deployment in the database."
+ )
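For orientation, a trimmed-down sketch of the snapshot lookup above: snapshot files are JSON dumps whose filenames sort chronologically, so sorting the paths and reversing walks them newest-first, and the first running job with a matching ID wins (the directory path and JSON layout below are assumptions based on the hunk):

import json
from pathlib import Path

def find_job(snapshot_dir: Path, deployment_uuid: str):
    for snapshot_pth in sorted(snapshot_dir.glob("**/*.json"))[::-1]:
        with open(snapshot_pth, "r") as f:
            snapshot = json.load(f)
        for namespace, jobs in snapshot.items():
            for job in jobs:
                if job["job_ID"] == deployment_uuid and job["status"] == "running":
                    job["namespace"] = namespace
                    return job
    return None  # caller decides how to report "not found"

# find_job(Path("/srv/accounting/snapshots"), "some-deployment-uuid")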
diff --git a/requirements.txt b/requirements.txt
index 18bf410..f15d0ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ pydantic >= 2.5.2, <= 2.9.2
natsort >= 8.1.0, < 9.0
ai4_metadata >= 2.0.2, < 3.0
harborapi == 0.25.3
+pre-commit >= 4.0.1, <= 5.0
diff --git a/ruff.toml b/ruff.toml
new file mode 100644
index 0000000..efffbb7
--- /dev/null
+++ b/ruff.toml
@@ -0,0 +1,8 @@
+line-length = 88
+
+[format]
+quote-style = "double"
+
+[lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+"tests/main.py" = ["E402", "F401"]
diff --git a/setup.py b/setup.py
index f3e0709..10256ae 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,4 @@
import setuptools
-setuptools.setup(
- setup_requires=['pbr>=5.3.0'],
- pbr=True)
+setuptools.setup(setup_requires=["pbr>=5.3.0"], pbr=True)
diff --git a/tests/catalog/modules.py b/tests/catalog/modules.py
index 83f15e0..702bb20 100644
--- a/tests/catalog/modules.py
+++ b/tests/catalog/modules.py
@@ -8,18 +8,18 @@
modules_list = list(Modules.get_items().keys())
assert isinstance(modules_list, list)
-assert 'dogs-breed-detector' in modules_list
-assert 'ai4os-federated-server' not in modules_list
+assert "dogs-breed-detector" in modules_list
+assert "ai4os-federated-server" not in modules_list
# List filtered modules
modules_list2 = Modules.get_filtered_list(
- tags=('development',),
+ tags=("development",),
tags_any=None,
not_tags=None,
not_tags_any=None,
)
assert isinstance(modules_list2, list)
-assert 'ai4os-dev-env' in modules_list2
+assert "ai4os-dev-env" in modules_list2
# Get modules summaries
modules_sum = Modules.get_summary(
@@ -41,28 +41,28 @@
# Get module config
module_conf = Modules.get_config(
item_name=module_name,
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
)
assert isinstance(module_conf, dict)
-assert 'general' in module_conf.keys()
+assert "general" in module_conf.keys()
# Get module metadata
module_meta = Modules.get_metadata(
item_name=module_name,
)
assert isinstance(module_meta, dict)
-assert 'title' in module_meta.keys()
+assert "title" in module_meta.keys()
# Refresh metadata cache
-common.JENKINS_TOKEN = '1234'
+common.JENKINS_TOKEN = "1234"
module_meta = Modules.refresh_metadata_cache_entry(
item_name=module_name,
authorization=SimpleNamespace(
- credentials='1234',
+ credentials="1234",
),
)
assert isinstance(module_meta, dict)
-#TODO: we should not be able to get config or metadata for a tool_name
+# TODO: we should not be able to get config or metadata for a tool_name
-print('Catalog (modules) tests passed!')
+print("Catalog (modules) tests passed!")
diff --git a/tests/catalog/tools.py b/tests/catalog/tools.py
index 0666c05..d50ff6f 100644
--- a/tests/catalog/tools.py
+++ b/tests/catalog/tools.py
@@ -7,32 +7,32 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
# List tools
tools_list = list(Tools.get_items().keys())
assert isinstance(tools_list, list)
-assert 'ai4os-federated-server' in tools_list
-assert 'dogs-breed-detector' not in tools_list
+assert "ai4os-federated-server" in tools_list
+assert "dogs-breed-detector" not in tools_list
# List filtered tools
tools_list2 = Tools.get_filtered_list(
- tags=('docker',),
+ tags=("docker",),
tags_any=None,
not_tags=None,
not_tags_any=None,
)
assert isinstance(tools_list2, list)
-assert 'ai4os-federated-server' in tools_list
+assert "ai4os-federated-server" in tools_list
# Get tools summaries
tools_sum = Tools.get_summary(
@@ -52,34 +52,33 @@
# Contrary to modules, we do this for all tools because tool configurations are
# particular to each tool
for tool_name in tools_list:
-
- print(f' - Testing {tool_name}')
+ print(f" - Testing {tool_name}")
# Get tool config
tool_conf = Tools.get_config(
item_name=tool_name,
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
)
assert isinstance(tool_conf, dict)
- assert 'general' in tool_conf.keys()
+ assert "general" in tool_conf.keys()
# Get tool metadata
tool_meta = Tools.get_metadata(
item_name=tool_name,
)
assert isinstance(tool_meta, dict)
- assert 'title' in tool_meta.keys()
+ assert "title" in tool_meta.keys()
# Refresh metadata cache
-common.JENKINS_TOKEN = '1234'
+common.JENKINS_TOKEN = "1234"
module_meta = Tools.refresh_metadata_cache_entry(
item_name=tool_name,
authorization=SimpleNamespace(
- credentials='1234',
+ credentials="1234",
),
)
assert isinstance(module_meta, dict)
-#TODO: we should not be able to get config or metadata for a module_name
+# TODO: we should not be able to get config or metadata for a module_name
-print('Catalog (tools) tests passed!')
+print("Catalog (tools) tests passed!")
diff --git a/tests/deployments/modules.py b/tests/deployments/modules.py
index 1f192f3..af21813 100644
--- a/tests/deployments/modules.py
+++ b/tests/deployments/modules.py
@@ -8,52 +8,46 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
# Create module
rcreate = modules.create_deployment(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
conf={},
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rcreate, dict)
-assert 'job_ID' in rcreate.keys()
+assert "job_ID" in rcreate.keys()
time.sleep(0.2) # Nomad takes some time to allocate deployment
# Retrieve that module
rdep = modules.get_deployment(
- vo='vo.ai4eosc.eu',
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdep, dict)
-assert 'job_ID' in rdep.keys()
-assert rdep['job_ID']==rcreate['job_ID']
-assert rdep['status']!='error'
+assert "job_ID" in rdep.keys()
+assert rdep["job_ID"] == rcreate["job_ID"]
+assert rdep["status"] != "error"
# Retrieve all modules
rdeps = modules.get_deployments(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdeps, list)
-assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps])
-assert all([d['job_ID']!='error' for d in rdeps])
+assert any([d["job_ID"] == rcreate["job_ID"] for d in rdeps])
+assert all([d["job_ID"] != "error" for d in rdeps])
# Check that we cannot retrieve that module from tools
# This should break!
@@ -67,44 +61,36 @@
# Check that we cannot retrieve that module from tools list
rdeps2 = tools.get_deployments(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdeps2, list)
-assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps2])
+assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps2])
# Delete module
rdel = modules.delete_deployment(
- vo='vo.ai4eosc.eu',
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdel, dict)
-assert 'status' in rdel.keys()
+assert "status" in rdel.keys()
time.sleep(3) # Nomad takes some time to delete
# Check module no longer exists
rdeps3 = modules.get_deployments(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
-assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3])
+assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps3])
# Check that we are able to retrieve info from Nomad snapshots (provenance)
-modules.provenance_token = '1234'
+modules.provenance_token = "1234"
r_prov = modules.get_deployment(
- vo='',
- deployment_uuid='de0599d6-a1b9-11ef-b98d-0242ac120005',
- authorization=SimpleNamespace(
- credentials='1234'
- ),
+ vo="",
+ deployment_uuid="de0599d6-a1b9-11ef-b98d-0242ac120005",
+ authorization=SimpleNamespace(credentials="1234"),
)
-print('Deployments (modules) tests passed!')
+print("Deployments (modules) tests passed!")
diff --git a/tests/deployments/tools.py b/tests/deployments/tools.py
index ade3da3..30bf052 100644
--- a/tests/deployments/tools.py
+++ b/tests/deployments/tools.py
@@ -8,55 +8,49 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
-print(' Testing FL server')
+print(" Testing FL server")
# Create tool
rcreate = tools.create_deployment(
- vo='vo.ai4eosc.eu',
- tool_name='ai4os-federated-server',
+ vo="vo.ai4eosc.eu",
+ tool_name="ai4os-federated-server",
conf={},
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rcreate, dict)
-assert 'job_ID' in rcreate.keys()
+assert "job_ID" in rcreate.keys()
time.sleep(0.2) # Nomad takes some time to allocate deployment
# Retrieve that tool
rdep = tools.get_deployment(
- vo='vo.ai4eosc.eu',
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdep, dict)
-assert 'job_ID' in rdep.keys()
-assert rdep['job_ID']==rcreate['job_ID']
-assert rdep['status']!='error'
+assert "job_ID" in rdep.keys()
+assert rdep["job_ID"] == rcreate["job_ID"]
+assert rdep["status"] != "error"
# Retrieve all tools
rdeps = tools.get_deployments(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdeps, list)
-assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps])
-assert all([d['job_ID']!='error' for d in rdeps])
+assert any([d["job_ID"] == rcreate["job_ID"] for d in rdeps])
+assert all([d["job_ID"] != "error" for d in rdeps])
# Check that we cannot retrieve that tool from modules
# This should break!
@@ -70,79 +64,69 @@
# Check that we cannot retrieve that tool from modules list
rdeps2 = modules.get_deployments(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdeps2, list)
-assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps2])
+assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps2])
# Delete tool
rdel = tools.delete_deployment(
- vo='vo.ai4eosc.eu',
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdel, dict)
-assert 'status' in rdel.keys()
+assert "status" in rdel.keys()
time.sleep(3) # Nomad takes some time to delete
# Check tool no longer exists
rdeps3 = tools.get_deployments(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
-assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3])
+assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps3])
############################################################
# Additionally test simply the creation of the other tools #
############################################################
-print(' Testing CVAT')
+print(" Testing CVAT")
# Create tool
rcreate = tools.create_deployment(
- vo='vo.ai4eosc.eu',
- tool_name='ai4os-cvat',
+ vo="vo.ai4eosc.eu",
+ tool_name="ai4os-cvat",
conf={
- 'general':{
- 'title': 'CVAT test',
- 'cvat_username': 'mock_user',
- 'cvat_password': 'mock_password',
+ "general": {
+ "title": "CVAT test",
+ "cvat_username": "mock_user",
+ "cvat_password": "mock_password",
+ },
+ "storage": {
+ "rclone_conf": "/srv/.rclone/rclone.conf",
+ "rclone_url": "https://share.services.ai4os.eu/remote.php/webdav",
+ "rclone_vendor": "nextcloud",
+ "rclone_user": "mock_user",
+ "rclone_password": "mock_password",
},
- 'storage': {
- 'rclone_conf': '/srv/.rclone/rclone.conf',
- 'rclone_url': 'https://share.services.ai4os.eu/remote.php/webdav',
- 'rclone_vendor': 'nextcloud',
- 'rclone_user': 'mock_user',
- 'rclone_password': 'mock_password',
- }
},
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rcreate, dict)
-assert 'job_ID' in rcreate.keys()
-assert rdep['status']!='error'
+assert "job_ID" in rcreate.keys()
+assert rdep["status"] != "error"
time.sleep(0.2) # Nomad takes some time to allocate deployment
# Delete tool
rdel = tools.delete_deployment(
- vo='vo.ai4eosc.eu',
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdel, dict)
-assert 'status' in rdel.keys()
+assert "status" in rdel.keys()
-print('Deployments (tools) tests passed!')
+print("Deployments (tools) tests passed!")
diff --git a/tests/inference/oscar.py b/tests/inference/oscar.py
index 17373b7..4f02b3c 100644
--- a/tests/inference/oscar.py
+++ b/tests/inference/oscar.py
@@ -5,69 +5,59 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin-demo)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
# Test service
service = oscar.Service(
- image='deephdc/deep-oc-image-classification-tf',
+ image="deephdc/deep-oc-image-classification-tf",
cpu=2,
)
# Create service
sname = oscar.create_service(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
svc_conf=service,
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
# Check service exists
slist = oscar.get_services_list(
- vo='vo.ai4eosc.eu',
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ authorization=SimpleNamespace(credentials=token),
)
-names = [s['name'] for s in slist]
+names = [s["name"] for s in slist]
assert sname in names, "Service does not exist"
# Update service
service.cpu = 1
oscar.update_service(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
service_name=sname,
svc_conf=service,
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
# Delete the service
oscar.delete_service(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
service_name=sname,
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
# Check service no longer exists
slist = oscar.get_services_list(
- vo='vo.ai4eosc.eu',
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ authorization=SimpleNamespace(credentials=token),
)
-names = [s['name'] for s in slist]
+names = [s["name"] for s in slist]
assert sname not in names, "Service exists"
-print('Inference (OSCAR) tests passed!')
+print("Inference (OSCAR) tests passed!")
diff --git a/tests/main.py b/tests/main.py
index 3605e32..538d685 100644
--- a/tests/main.py
+++ b/tests/main.py
@@ -7,9 +7,9 @@
Nomad (ie. after launching)
"""
-#TODO: move to proper testing package
-#TODO: rename test script: modules --> test_modules
-#TODO: add spinners
+# TODO: move to proper testing package
+# TODO: rename test script: modules --> test_modules
+# TODO: add spinners
import ai4papi.conf as papiconf
diff --git a/tests/routes.py b/tests/routes.py
index 4635316..f0cccf7 100644
--- a/tests/routes.py
+++ b/tests/routes.py
@@ -8,41 +8,43 @@
# Check routes
routes = [(r.path, r.methods) for r in app.routes]
-for collection in ['modules', 'tools']:
-
- assert (f'/v1/catalog/{collection}', {'GET'}) in routes
- assert (f'/v1/catalog/{collection}/detail', {'GET'}) in routes
- assert (f'/v1/catalog/{collection}/tags', {'GET'}) in routes
- assert (f'/v1/catalog/{collection}/' + '{item_name}/config', {'GET'}) in routes
- assert (f'/v1/catalog/{collection}/' + '{item_name}/metadata', {'GET'}) in routes
-
- assert (f'/v1/deployments/{collection}', {'GET'}) in routes
- assert (f'/v1/deployments/{collection}', {'POST'}) in routes
- assert (f'/v1/deployments/{collection}/' + '{deployment_uuid}', {'GET'}) in routes
- assert (f'/v1/deployments/{collection}/' + '{deployment_uuid}', {'DELETE'}) in routes
-
-
-assert ('/v1/datasets/zenodo', {'POST'}) in routes
-
-assert ('/v1/inference/oscar/cluster', {'GET'}) in routes
-assert ('/v1/inference/oscar/services', {'GET'}) in routes
-assert ('/v1/inference/oscar/services', {'POST'}) in routes
-assert ('/v1/inference/oscar/services/{service_name}', {'GET'}) in routes
-assert ('/v1/inference/oscar/services/{service_name}', {'PUT'}) in routes
-assert ('/v1/inference/oscar/services/{service_name}', {'DELETE'}) in routes
-
-assert ('/v1/secrets', {'GET'}) in routes
-assert ('/v1/secrets', {'POST'}) in routes
-assert ('/v1/secrets', {'DELETE'}) in routes
-
-assert ('/v1/deployments/stats/user', {'GET'}) in routes
-assert ('/v1/deployments/stats/cluster', {'GET'}) in routes
-
-assert ('/v1/try_me/nomad', {'POST'}) in routes
-assert ('/v1/try_me/nomad', {'GET'}) in routes
-assert ('/v1/try_me/nomad/{deployment_uuid}', {'GET'}) in routes
-assert ('/v1/try_me/nomad/{deployment_uuid}', {'DELETE'}) in routes
-
-assert ('/v1/storage/{storage_name}/ls', {'GET'}) in routes
-
-print('Checks for API routes passed!')
+for collection in ["modules", "tools"]:
+ assert (f"/v1/catalog/{collection}", {"GET"}) in routes
+ assert (f"/v1/catalog/{collection}/detail", {"GET"}) in routes
+ assert (f"/v1/catalog/{collection}/tags", {"GET"}) in routes
+ assert (f"/v1/catalog/{collection}/" + "{item_name}/config", {"GET"}) in routes
+ assert (f"/v1/catalog/{collection}/" + "{item_name}/metadata", {"GET"}) in routes
+
+ assert (f"/v1/deployments/{collection}", {"GET"}) in routes
+ assert (f"/v1/deployments/{collection}", {"POST"}) in routes
+ assert (f"/v1/deployments/{collection}/" + "{deployment_uuid}", {"GET"}) in routes
+ assert (
+ f"/v1/deployments/{collection}/" + "{deployment_uuid}",
+ {"DELETE"},
+ ) in routes
+
+
+assert ("/v1/datasets/zenodo", {"POST"}) in routes
+
+assert ("/v1/inference/oscar/cluster", {"GET"}) in routes
+assert ("/v1/inference/oscar/services", {"GET"}) in routes
+assert ("/v1/inference/oscar/services", {"POST"}) in routes
+assert ("/v1/inference/oscar/services/{service_name}", {"GET"}) in routes
+assert ("/v1/inference/oscar/services/{service_name}", {"PUT"}) in routes
+assert ("/v1/inference/oscar/services/{service_name}", {"DELETE"}) in routes
+
+assert ("/v1/secrets", {"GET"}) in routes
+assert ("/v1/secrets", {"POST"}) in routes
+assert ("/v1/secrets", {"DELETE"}) in routes
+
+assert ("/v1/deployments/stats/user", {"GET"}) in routes
+assert ("/v1/deployments/stats/cluster", {"GET"}) in routes
+
+assert ("/v1/try_me/nomad", {"POST"}) in routes
+assert ("/v1/try_me/nomad", {"GET"}) in routes
+assert ("/v1/try_me/nomad/{deployment_uuid}", {"GET"}) in routes
+assert ("/v1/try_me/nomad/{deployment_uuid}", {"DELETE"}) in routes
+
+assert ("/v1/storage/{storage_name}/ls", {"GET"}) in routes
+
+print("Checks for API routes passed!")
diff --git a/tests/test_launch.py b/tests/test_launch.py
index 6331730..ad17b0e 100644
--- a/tests/test_launch.py
+++ b/tests/test_launch.py
@@ -11,10 +11,10 @@
server_process = subprocess.Popen(
- ['uvicorn', 'ai4papi.main:app', '--host', '0.0.0.0', '--port', '8080'],
+ ["uvicorn", "ai4papi.main:app", "--host", "0.0.0.0", "--port", "8080"],
stdout=subprocess.DEVNULL,
- stderr = subprocess.DEVNULL,
- )
+ stderr=subprocess.DEVNULL,
+)
time.sleep(15) # wait for PAPI to start
try:
diff --git a/tests/test_secrets.py b/tests/test_secrets.py
index f3ea026..48e12ff 100644
--- a/tests/test_secrets.py
+++ b/tests/test_secrets.py
@@ -6,55 +6,47 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
-SECRET_PATH = '/demo-papi-tests/demo-secret'
-SECRET_DATA = {'pwd': 12345}
+SECRET_PATH = "/demo-papi-tests/demo-secret"
+SECRET_DATA = {"pwd": 12345}
# Create secret
r = secrets.create_secret(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
secret_path=SECRET_PATH,
secret_data=SECRET_DATA,
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
# Check that secret is in list
r = secrets.get_secrets(
- vo='vo.ai4eosc.eu',
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ authorization=SimpleNamespace(credentials=token),
)
assert SECRET_PATH in r.keys()
assert r[SECRET_PATH] == SECRET_DATA
# Delete
r = secrets.delete_secret(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
secret_path=SECRET_PATH,
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
# Check that secret is no longer in list
r = secrets.get_secrets(
- vo='vo.ai4eosc.eu',
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ authorization=SimpleNamespace(credentials=token),
)
assert SECRET_PATH not in r.keys()
-print('Secrets tests passed!')
+print("Secrets tests passed!")
diff --git a/tests/test_snapshots.py b/tests/test_snapshots.py
index 0d71c1b..84fa5c0 100644
--- a/tests/test_snapshots.py
+++ b/tests/test_snapshots.py
@@ -8,85 +8,73 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
# Create Nomad deployment
njob = modules.create_deployment(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
conf={},
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(njob, dict)
-assert 'job_ID' in njob.keys()
+assert "job_ID" in njob.keys()
time.sleep(60)
# Make snapshot of that module
created = snapshots.create_snapshot(
- vo='vo.ai4eosc.eu',
- deployment_uuid=njob['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=njob["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(created, dict)
-assert 'snapshot_ID' in created.keys()
+assert "snapshot_ID" in created.keys()
time.sleep(10)
# Retrieve all snapshots
retrieved = snapshots.get_snapshots(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(retrieved, list)
-assert any([d['snapshot_ID']==created['snapshot_ID'] for d in retrieved])
-#TODO: waiting 10s the snapshot is still probably queued in Nomad, we should wait more if we want to test also Harbor
+assert any([d["snapshot_ID"] == created["snapshot_ID"] for d in retrieved])
+# TODO: after waiting 10s the snapshot is probably still queued in Nomad; we should wait longer if we also want to test Harbor
# Delete snapshot
deleted = snapshots.delete_snapshot(
- vo='vo.ai4eosc.eu',
- snapshot_uuid=created['snapshot_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ snapshot_uuid=created["snapshot_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
time.sleep(10) # it takes some time to delete
assert isinstance(deleted, dict)
-assert 'status' in deleted.keys()
+assert "status" in deleted.keys()
# Check snapshot no longer exists
retrieved2 = snapshots.get_snapshots(
- vos=['vo.ai4eosc.eu'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vos=["vo.ai4eosc.eu"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(retrieved2, list)
-assert not any([d['snapshot_ID']==created['snapshot_ID'] for d in retrieved2])
+assert not any([d["snapshot_ID"] == created["snapshot_ID"] for d in retrieved2])
# Delete deployment
ndel = modules.delete_deployment(
- vo='vo.ai4eosc.eu',
- deployment_uuid=njob['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ deployment_uuid=njob["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(ndel, dict)
-assert 'status' in ndel.keys()
+assert "status" in ndel.keys()
-print('Snapshot tests passed!')
+print("Snapshot tests passed!")
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 43ad934..e8d20e8 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -6,33 +6,31 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
-SECRET_PATH = '/demo-papi-tests/demo-secret'
-SECRET_DATA = {'pwd': 12345}
+SECRET_PATH = "/demo-papi-tests/demo-secret"
+SECRET_DATA = {"pwd": 12345}
# Retrieve user stats
r = stats.deployments.get_user_stats(
- vo='vo.ai4eosc.eu',
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ authorization=SimpleNamespace(credentials=token),
)
-assert r, 'User stats dict is empty'
+assert r, "User stats dict is empty"
# Retrieve cluster stats
_ = stats.deployments.get_cluster_stats_bg()
r = stats.deployments.get_cluster_stats(
- vo='vo.ai4eosc.eu',
+ vo="vo.ai4eosc.eu",
)
-assert r, 'Cluster stats dict is empty'
+assert r, "Cluster stats dict is empty"
-print('Stats tests passed!')
+print("Stats tests passed!")
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 11c8e28..48523ac 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -6,23 +6,21 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
r = storage.storage_ls(
- vo='vo.ai4eosc.eu',
- storage_name='share.services.ai4os.eu',
- subpath='ai4os-storage',
- authorization=SimpleNamespace(
- credentials=token
- ),
+ vo="vo.ai4eosc.eu",
+ storage_name="share.services.ai4os.eu",
+ subpath="ai4os-storage",
+ authorization=SimpleNamespace(credentials=token),
)
-print('Storage tests passed!')
+print("Storage tests passed!")
diff --git a/tests/try_me/test_nomad.py b/tests/try_me/test_nomad.py
index 65d3a07..b492e55 100644
--- a/tests/try_me/test_nomad.py
+++ b/tests/try_me/test_nomad.py
@@ -7,64 +7,54 @@
# Retrieve EGI token (not generated on the fly in case there are rate limiting issues
# if too many queries)
-token = os.getenv('TMP_EGI_TOKEN')
+token = os.getenv("TMP_EGI_TOKEN")
if not token:
raise Exception(
-'Please remember to set a token as ENV variable before executing \
+ 'Please remember to set a token as ENV variable before executing \
the tests! \n\n \
export TMP_EGI_TOKEN="$(oidc-token egi-checkin)" \n\n \
If running from VScode make sure to launch `code` from that terminal so it can access \
that ENV variable.'
- )
+ )
# Create deployment
rcreate = nomad.create_deployment(
module_name="ai4os-demo-app",
title="PAPI tests",
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rcreate, dict)
-assert 'job_ID' in rcreate.keys()
+assert "job_ID" in rcreate.keys()
# Retrieve that deployment
rdep = nomad.get_deployment(
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdep, dict)
-assert 'job_ID' in rdep.keys()
-assert rdep['job_ID']==rcreate['job_ID']
+assert "job_ID" in rdep.keys()
+assert rdep["job_ID"] == rcreate["job_ID"]
# Retrieve all deployments
rdeps = nomad.get_deployments(
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
assert isinstance(rdeps, list)
-assert any([d['job_ID']==rcreate['job_ID'] for d in rdeps])
+assert any([d["job_ID"] == rcreate["job_ID"] for d in rdeps])
# Delete deployment
rdel = nomad.delete_deployment(
- deployment_uuid=rcreate['job_ID'],
- authorization=SimpleNamespace(
- credentials=token
- ),
+ deployment_uuid=rcreate["job_ID"],
+ authorization=SimpleNamespace(credentials=token),
)
time.sleep(3) # Nomad takes some time to delete
assert isinstance(rdel, dict)
-assert 'status' in rdel.keys()
+assert "status" in rdel.keys()
# Check module no longer exists
rdeps3 = nomad.get_deployments(
- authorization=SimpleNamespace(
- credentials=token
- ),
+ authorization=SimpleNamespace(credentials=token),
)
-assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3])
+assert not any([d["job_ID"] == rcreate["job_ID"] for d in rdeps3])
-print('Try-me (nomad) tests passed!')
+print("Try-me (nomad) tests passed!")