Merge branch 'master' into try-nomad
IgnacioHeredia committed Aug 23, 2024
2 parents 4716bd9 + d8a3161 commit f4d5bdb
Showing 14 changed files with 122 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -265,7 +265,7 @@ These are the configuration files the API uses:
 * `etc/main_conf.yaml`: main configuration file of the API
 * `etc/modules`: configuration files for standard modules
 * `etc/tools`: configuration files for tools
-  - `deep-oc-federated-server`: federated server
+  - `ai4os-federated-server`: federated server
 
 The pattern for the subfolders follows:
 - `user.yaml`: user customizable configuration to make a deployment in Nomad.
16 changes: 16 additions & 0 deletions ai4papi/conf.py
@@ -4,6 +4,7 @@
 
 from pathlib import Path
 from string import Template
+import subprocess
 
 import yaml
 
@@ -85,3 +86,18 @@ def load_yaml_conf(fpath):
 TRY_ME = {
     'nomad': nmd,
 }
+
+# Retrieve git info from PAPI, to show the current version in the docs
+papi_commit = subprocess.run(
+    ['git', 'log', '-1', '--format=%H'],
+    stdout=subprocess.PIPE,
+    text=True,
+    cwd=main_path,
+).stdout.strip()
+papi_branch = subprocess.run(
+    ['git', 'rev-parse', '--abbrev-ref', '--symbolic-full-name', '@{u}'],
+    stdout=subprocess.PIPE,
+    text=True,
+    cwd=main_path,
+).stdout.strip()
+papi_branch = papi_branch.split('/')[-1]  # remove the "origin/" part
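For context: `git log -1 --format=%H` prints the full hash of the latest commit, and `git rev-parse --abbrev-ref --symbolic-full-name @{u}` prints the upstream ref (e.g. `origin/master`), so both calls assume PAPI runs from a git checkout with an upstream branch configured. A rough sketch of the same queries wrapped in a hypothetical helper (not part of this commit) that yields empty strings when a git command exits non-zero:

import subprocess

def git_info(repo_path):
    """Return (commit, branch) for the git checkout at repo_path."""
    def run(*args):
        # No check=True: a failing git command simply yields an empty string
        return subprocess.run(
            ['git', *args],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            cwd=repo_path,
        ).stdout.strip()

    commit = run('log', '-1', '--format=%H')
    branch = run('rev-parse', '--abbrev-ref', '--symbolic-full-name', '@{u}')
    return commit, branch.split('/')[-1]  # drop the "origin/" prefix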
6 changes: 5 additions & 1 deletion ai4papi/main.py
@@ -6,7 +6,7 @@
 import fastapi
 import uvicorn
 
-from ai4papi.conf import MAIN_CONF, paths
+from ai4papi.conf import MAIN_CONF, paths, papi_branch, papi_commit
 from fastapi.responses import FileResponse
 from ai4papi.routers import v1
 from ai4papi.routers.v1.stats.deployments import get_cluster_stats_bg
@@ -39,7 +39,11 @@
     "This work is co-funded by [AI4EOSC](https://ai4eosc.eu/) project that has "
     "received funding from the European Union's Horizon Europe 2022 research and "
     "innovation programme under agreement No 101058593"
+    "<br><br>"
+    "PAPI version:"
+    f"[`ai4-papi/{papi_branch}@{papi_commit[:5]}`]"
+    f"(https://github.com/ai4os/ai4-papi/tree/{papi_commit})"
 )
 
 @asynccontextmanager
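For illustration, with made-up values for `papi_branch` and `papi_commit` (the real ones come from `ai4papi.conf`), the appended footer renders as a Markdown link to the exact source tree:

papi_branch = 'master'                                    # illustrative value
papi_commit = '0123456789abcdef0123456789abcdef01234567'  # illustrative value
print(
    "PAPI version:"
    f"[`ai4-papi/{papi_branch}@{papi_commit[:5]}`]"
    f"(https://github.com/ai4os/ai4-papi/tree/{papi_commit})"
)
# PAPI version:[`ai4-papi/master@01234`](https://github.com/ai4os/ai4-papi/tree/01234...)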
35 changes: 17 additions & 18 deletions ai4papi/nomad/common.py
@@ -336,30 +336,29 @@ def delete_deployment(
     Returns a dict with status
     """
-    # Check the deployment exists
-    try:
-        j = Nomad.job.get_job(
-            id_=deployment_uuid,
-            namespace=namespace,
-        )
-    except exceptions.URLNotFoundNomadException:
-        raise HTTPException(
-            status_code=400,
-            detail="No deployment exists with this uuid.",
-        )
+    # Retrieve the deployment information. Under the hood, it checks that:
+    # - the job indeed exists
+    # - the owner does indeed own the job
+    info = get_deployment(
+        deployment_uuid=deployment_uuid,
+        namespace=namespace,
+        owner=owner,
+        full_info=False,
+    )
 
-    # Check job does belong to owner
-    if j['Meta'] and owner != j['Meta'].get('owner', ''):
-        raise HTTPException(
-            status_code=400,
-            detail="You are not the owner of that deployment.",
-        )
+    # If the job is in "queued" status, delete it with purge. Most of the time a
+    # job gets stuck in "queued" due to a platform error, and it cannot be
+    # deleted without purging.
+    if info['status'] == 'queued':
+        purge = True
+    else:
+        purge = False
 
     # Delete deployment
     Nomad.job.deregister_job(
         id_=deployment_uuid,
         namespace=namespace,
-        purge=False,
+        purge=purge,
     )
 
     return {'status': 'success'}
15 changes: 15 additions & 0 deletions ai4papi/routers/v1/catalog/common.py
@@ -31,6 +31,8 @@
 from fastapi import HTTPException, Query
 import requests
 
+from ai4papi import utils
+
 
 class Catalog:
 
@@ -257,6 +259,19 @@ def get_metadata(
         # Format "description" field nicely for the Dashboards Markdown parser
         metadata["description"] = "\n".join(metadata["description"])
 
+        # Replace some fields with the info gathered from Github
+        pattern = r'github\.com/([^/]+)/([^/]+?)(?:\.git|/)?$'
+        match = re.search(pattern, items[item_name]['url'])
+        if match:
+            owner, repo = match.group(1), match.group(2)
+            gh_info = utils.get_github_info(owner, repo)
+
+            metadata['date_creation'] = gh_info.get('created', '')
+            # metadata['updated'] = gh_info.get('updated', '')
+            metadata['license'] = gh_info.get('license', '')
+        else:
+            print(f"Failed to parse owner/repo in {items[item_name]['url']}")
+
         return metadata
 
     def get_config(
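A quick check (not part of the commit) of which URL shapes the pattern accepts; the sample URLs below are illustrative:

import re

pattern = r'github\.com/([^/]+)/([^/]+?)(?:\.git|/)?$'
samples = [
    'https://github.com/ai4os/ai4os-federated-server',      # plain URL
    'https://github.com/ai4os/ai4os-federated-server.git',  # clone URL
    'https://github.com/ai4os/ai4os-federated-server/',     # trailing slash
]
for url in samples:
    m = re.search(pattern, url)
    print(m.group(1), m.group(2))  # -> ai4os ai4os-federated-server, each time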
5 changes: 1 addition & 4 deletions ai4papi/routers/v1/catalog/modules.py
@@ -64,10 +64,7 @@ def get_config(
     conf["general"]["docker_tag"]["value"] = tags[0]
 
     # Custom conf for development environment
-    if item_name == 'ai4os-dev-env' or item_name == 'deep-oc-generic-dev':
-        # TODO: remove second condition when 'deep-oc-generic-dev' is removed from the
-        # modules catalog
-
+    if item_name == 'ai4os-dev-env':
         # For dev-env, order the tags in "Z-A" order instead of "newest"
         # This is done because builds are done in parallel, so "newest" is meaningless
         # (Z-A + natsort) allows to show more recent semver first
6 changes: 4 additions & 2 deletions ai4papi/routers/v1/catalog/tools.py
@@ -12,14 +12,16 @@
     @cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
     def get_items(self):
         # Set default branch manually (because we are not yet reading this from submodules)
+        # TODO: start reading from submodules (only accept the submodules that have been
+        # integrated in papiconf.TOOLS)
         tools_branches = {
-            'deep-oc-federated-server': 'main',
+            'ai4os-federated-server': 'main',
         }
 
         tools = {}
         for k in papiconf.TOOLS.keys():
             tools[k] = {
-                'url': f'https://github.com/deephdc/{k}',  # TODO: this will need to be updated
+                'url': f'https://github.com/ai4os/{k}',
                 'branch': tools_branches[k],
             }
 
21 changes: 15 additions & 6 deletions ai4papi/routers/v1/stats/deployments.py
@@ -193,6 +193,12 @@ def get_cluster_stats(
     """
 
     global cluster_stats
+    if not cluster_stats:
+        # If PAPI is used as a package, cluster_stats will be None, as the
+        # background computation of `get_cluster_stats_bg()` is only started
+        # when PAPI is launched with uvicorn.
+        # So if it is None, we need to initialize it here.
+        cluster_stats = get_cluster_stats_bg()
     stats = copy.deepcopy(cluster_stats)
 
     namespace = papiconf.MAIN_CONF['nomad']['namespaces'][vo]
@@ -277,10 +283,6 @@ def get_cluster_stats_bg():
         n_stats['cpu_total'] = int(node['Attributes']['cpu.numcores'])
         n_stats['ram_total'] = int(node['Attributes']['memory.totalbytes']) / 2**20
         n_stats['disk_total'] = int(node['Attributes']['unique.storage.bytestotal']) / 2**20
-        n_stats['disk_used'] = \
-            (int(node['Attributes']['unique.storage.bytestotal']) \
-            - int(node['Attributes']['unique.storage.bytesfree'])) \
-            / 2**20
         n_stats['gpu_models'] = {}
         n_stats['namespaces'] = node['Meta'].get('namespace', '')
         n_stats['status'] = node['Meta'].get('status', '')
@@ -335,8 +337,8 @@ def get_cluster_stats_bg():
             n_stats['jobs_num'] += 1
 
             # TODO: we are ignoring resources consumed by other tasks
-            if 'usertask' in a['AllocatedResources']['Tasks']:
-                res = a['AllocatedResources']['Tasks']['usertask']
+            if 'main' in a['AllocatedResources']['Tasks']:
+                res = a['AllocatedResources']['Tasks']['main']
 
                 # cpu
                 if res['Cpu']['ReservedCores']:
@@ -345,6 +347,13 @@
                 # ram
                 n_stats['ram_used'] += res['Memory']['MemoryMB']
 
+                # disk
+                # Note: in theory we could get the total disk used in a node from the
+                # node metadata (i.e. "unique.storage.bytesfree"), but that gives the
+                # disk actually in use. We are instead interested in the disk reserved
+                # by users (regardless of whether they are actually using it).
+                n_stats['disk_used'] += a['AllocatedResources']['Shared']['DiskMB']
+
                 # gpu
                 if res['Devices']:
                     gpu = [d for d in res['Devices'] if d['Type'] == 'gpu'][0]
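The reserved-disk accounting above just sums the `Shared.DiskMB` field over a node's allocations; a toy illustration with fabricated allocation payloads, trimmed to the fields used above:

# Fabricated Nomad allocation payloads (illustration only)
allocations = [
    {'AllocatedResources': {'Shared': {'DiskMB': 1024}, 'Tasks': {'main': {}}}},
    {'AllocatedResources': {'Shared': {'DiskMB': 4096}, 'Tasks': {'main': {}}}},
]

disk_used = sum(a['AllocatedResources']['Shared']['DiskMB'] for a in allocations)
print(disk_used)  # 5120 MB reserved by users, regardless of actual usage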
36 changes: 36 additions & 0 deletions ai4papi/utils.py
@@ -1,15 +1,21 @@
 """
 Miscellaneous utils
 """
+from datetime import datetime
+import os
 import re
 
 from cachetools import cached, TTLCache
 from fastapi import HTTPException
 import requests
 
 
+# Persistent requests session for faster requests
+session = requests.Session()
+
+# Retrieve tokens for better rate limit
+github_token = os.environ.get('PAPI_GITHUB_TOKEN', None)
+
 
 def safe_hostname(
     hostname: str,
@@ -164,3 +170,33 @@ def validate_conf(conf):
         )
 
     return conf
+
+
+@cached(cache=TTLCache(maxsize=1024, ttl=6*60*60))
+def get_github_info(owner, repo):
+    """
+    Retrieve information from a Github repo
+    """
+    # Retrieve information from the Github API
+    url = f"https://api.github.com/repos/{owner}/{repo}"
+    headers = {'Authorization': f'token {github_token}'} if github_token else {}
+    r = session.get(url, headers=headers)
+
+    # Parse the information
+    out = {}
+    if r.ok:
+        repo_data = r.json()
+        out['created'] = datetime.strptime(
+            repo_data['created_at'],
+            "%Y-%m-%dT%H:%M:%SZ",
+        ).date().strftime("%Y-%m-%d")  # keep only the date
+        out['updated'] = datetime.strptime(
+            repo_data['updated_at'],
+            "%Y-%m-%dT%H:%M:%SZ",
+        ).date().strftime("%Y-%m-%d")
+        out['license'] = (repo_data['license'] or {}).get('spdx_id', '')
+        # out['stars'] = repo_data['stargazers_count']
+    else:
+        print(f'Failed to fetch Github repo info: {owner}/{repo}')
+
+    return out
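Example usage (requires network access; results are cached for 6 hours by the TTLCache decorator, and the printed values below are illustrative only):

info = get_github_info('ai4os', 'ai4-papi')
print(info)
# e.g. {'created': '2023-01-30', 'updated': '2024-08-23', 'license': 'MIT'}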
6 changes: 3 additions & 3 deletions etc/modules/nomad.hcl
@@ -59,10 +59,10 @@ job "module-${JOB_UUID}" {
   # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid
   # overloading GPU clients with CPU-only jobs.
   affinity {
-    attribute = "${node.unique.name}"
+    attribute = "${meta.tags}"
     operator  = "regexp"
-    value     = "gpu"
-    weight    = -50  # anti-affinity for GPU clients
+    value     = "cpu"
+    weight    = 50
   }
 
   # Avoid rescheduling the job on **other** nodes during a network cut
@@ -59,10 +59,10 @@ job "tool-fl-${JOB_UUID}" {
   # CPU-only jobs should deploy *preferably* on CPU clients (affinity) to avoid
   # overloading GPU clients with CPU-only jobs.
   affinity {
-    attribute = "${node.unique.name}"
+    attribute = "${meta.tags}"
     operator  = "regexp"
-    value     = "gpu"
-    weight    = -50  # anti-affinity for GPU clients
+    value     = "cpu"
+    weight    = 50
   }
 
   # Avoid rescheduling the job on **other** nodes during a network cut
@@ -27,7 +27,7 @@ general:
 
   docker_image:
     name: Docker image
-    value: 'deephdc/deep-oc-federated-server'
+    value: 'ai4oshub/ai4os-federated-server'
     description: Docker image to be used. For example `deephdc/deep-oc-image-classification-tf`.
 
   docker_tag:
@@ -90,7 +90,7 @@ configuration:
   strategy:
     name: Federated aggregation strategy
     value: 'Federated Averaging (FedAvg)'
-    description: Aggregation function or strategy that will be applied for aggregating the models received from the clients. <a href="https://github.com/deephdc/federated-server" target="_blank">Check the different options with their references.</a>
+    description: Aggregation function or strategy that will be applied for aggregating the models received from the clients. <a href="https://github.com/ai4os/ai4os-federated-server" target="_blank">Check the different options with their references.</a>
     options: [
       'Federated Averaging (FedAvg)', # fedavg
       'FedProx strategy (FedProx)', # fedprox
4 changes: 2 additions & 2 deletions tests/catalog/modules.py
@@ -6,7 +6,7 @@
 
 assert isinstance(modules_list, list)
 assert 'dogs-breed-detector' in modules_list
-assert 'deep-oc-federated-server' not in modules_list
+assert 'ai4os-federated-server' not in modules_list
 
 # List filtered modules
 modules_list2 = Modules.get_filtered_list(
@@ -16,7 +16,7 @@
     not_tags_any=None,
 )
 assert isinstance(modules_list2, list)
-assert 'deep-oc-generic-dev' in modules_list2
+assert 'ai4os-dev-env' in modules_list2
 
 # Get modules summaries
 modules_sum = Modules.get_summary(
4 changes: 2 additions & 2 deletions tests/catalog/tools.py
@@ -5,7 +5,7 @@
 tools_list = list(Tools.get_items().keys())
 
 assert isinstance(tools_list, list)
-assert 'deep-oc-federated-server' in tools_list
+assert 'ai4os-federated-server' in tools_list
 assert 'dogs-breed-detector' not in tools_list
 
 # List filtered tools
@@ -16,7 +16,7 @@
     not_tags_any=None,
 )
 assert isinstance(tools_list2, list)
-assert 'deep-oc-federated-server' in tools_list
+assert 'ai4os-federated-server' in tools_list2
 
 # Get tools summaries
 tools_sum = Tools.get_summary(
