feat: support making snapshots from deployments (#62)
* feat: snapshots

* fix: add harborapi as a requirement

* style: fix style

* fix: add job.hcl

* feat: Ignacio's review

Main changes:
* changed route path
* split snapshot retrieval between Nomad and Harbor
* added checks (membership, quota limits, ownership, existence, etc.)
* moved to a synchronous Harbor client
* general code refactoring

* feat: add tests for snapshots

* feat: make snapshots deployable

* fix: snapshot date field

* fix: query info of non-allocated snapshots

* fix: make sure to not break if allocations are not yet created

* fix: catch tasks is None

* fix: rename completed state to complete

* fix: better handling of the no-allocations case

---------

Co-authored-by: Ignacio Heredia <[email protected]>
Co-authored-by: Marta Obregón <[email protected]>
3 people authored Nov 25, 2024
1 parent 695e7a3 commit b49d07d
Showing 8 changed files with 721 additions and 12 deletions.
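The commit message above mentions splitting snapshot retrieval between Nomad and Harbor and moving to a synchronous Harbor client. As a rough illustration of the Harbor side only, the sketch below lists the artifacts of a user's snapshot repository through the Harbor v2 REST API with plain requests; the "user-snapshots" project name, the reuse of the registry host for the API URL, and the response fields read in the usage comment are assumptions inferred from the diff, not guaranteed by it.

# Hypothetical sketch of the Harbor-side snapshot listing (not the PAPI implementation).
import requests

HARBOR_URL = "https://registry.services.ai4os.eu"   # registry host seen in the diff
HARBOR_USER = "robot$user-snapshots+snapshot-api"    # robot account from conf.py
HARBOR_PASS = "..."                                  # HARBOR_ROBOT_PASSWORD in production


def list_user_snapshots(owner_id: str) -> list[dict]:
    """List artifacts in the owner's snapshot repository ('@' mangled to '_at_')."""
    repo = owner_id.replace('@', '_at_')
    resp = requests.get(
        f"{HARBOR_URL}/api/v2.0/projects/user-snapshots/repositories/{repo}/artifacts",
        auth=(HARBOR_USER, HARBOR_PASS),
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()


# Usage with a made-up owner id; each artifact carries its tags and push time.
# for artifact in list_user_snapshots("[email protected]"):
#     print(artifact["tags"][0]["name"], artifact["push_time"])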
34 changes: 23 additions & 11 deletions ai4papi/conf.py
@@ -16,12 +16,22 @@
# when running from the Docker container
IS_DEV = False if os.getenv('FORWARDED_ALLOW_IPS') else True

# Harbor token is kind of mandatory in production, otherwise snapshots won't work.
HARBOR_USER = "robot$user-snapshots+snapshot-api"
HARBOR_PASS = os.environ.get('HARBOR_ROBOT_PASSWORD')
if not HARBOR_PASS:
    if IS_DEV:
        # Do not enforce this for developers
        print("You should define the variable \"HARBOR_ROBOT_PASSWORD\" to use the \"/snapshots\" endpoint.")
    else:
        raise Exception("You need to define the variable \"HARBOR_ROBOT_PASSWORD\".")

# Paths
main_path = Path(__file__).parent.absolute()
paths = {
    "conf": main_path.parent / "etc",
    "media": main_path / "media",
}
}

# Load main API configuration
with open(paths['conf'] / 'main.yaml', 'r') as f:
@@ -50,13 +60,9 @@ def load_yaml_conf(fpath):
        conf_values[group_name] = {}
        for k, v in params.items():
            if 'name' not in v.keys():
                raise Exception(
                    f"Parameter {k} needs to have a name."
                )
                raise Exception(f"Parameter {k} needs to have a name.")
            if 'value' not in v.keys():
                raise Exception(
                    f"Parameter {k} needs to have a value."
                )
                raise Exception(f"Parameter {k} needs to have a value.")
            conf_values[group_name][k] = v['value']

    return conf_full, conf_values
@@ -70,7 +76,7 @@ def load_yaml_conf(fpath):
    'user': {
        'full': yml[0],
        'values': yml[1],
    }
    },
}

# Tools
@@ -85,7 +91,7 @@ def load_yaml_conf(fpath):
    'user': {
        'full': yml[0],
        'values': yml[1],
    }
    },
}

# For tools, map the Nomad job name prefixes to tool IDs
@@ -107,17 +113,23 @@ def load_yaml_conf(fpath):
    'nomad': nmd,
}

# Snapshot endpoints
nmd = load_nomad_job(paths['conf'] / 'snapshots' / 'nomad.hcl')
SNAPSHOTS = {
    'nomad': nmd,
}

# Retrieve git info from PAPI, to show current version in the docs
papi_commit = subprocess.run(
    ['git', 'log', '-1', '--format=%H'],
    stdout=subprocess.PIPE,
    text=True,
    cwd=main_path,
).stdout.strip()
).stdout.strip()
papi_branch = subprocess.run(
    ['git', 'rev-parse', '--abbrev-ref', '--symbolic-full-name', '@{u}'],
    stdout=subprocess.PIPE,
    text=True,
    cwd=main_path,
).stdout.strip()
).stdout.strip()
papi_branch = papi_branch.split('/')[-1] # remove the "origin/" part
3 changes: 2 additions & 1 deletion ai4papi/routers/v1/__init__.py
@@ -1,13 +1,14 @@
import fastapi

from . import catalog, deployments, inference, secrets, stats, storage, try_me
from . import catalog, deployments, inference, secrets, stats, storage, try_me, snapshots


router = fastapi.APIRouter()
router.include_router(catalog.router)
router.include_router(deployments.router)
router.include_router(inference.router)
router.include_router(secrets.router)
router.include_router(snapshots.router)
router.include_router(stats.router)
router.include_router(storage.router)
router.include_router(try_me.router)
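For context, the include_router calls above only attach the sub-routers (including the new snapshots one) to the v1 router; below is a minimal sketch of how that composed router would typically be mounted on a FastAPI application. The application module and the /v1 prefix are assumptions, not taken from this diff.

# Minimal sketch, assuming the composed v1 router is mounted under a /v1 prefix.
import fastapi

from ai4papi.routers import v1

app = fastapi.FastAPI()
app.include_router(v1.router, prefix="/v1")  # makes /v1/snapshots/... reachable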
30 changes: 30 additions & 0 deletions ai4papi/routers/v1/deployments/modules.py
@@ -12,6 +12,7 @@
from ai4papi import auth, module_patches, quotas, utils
import ai4papi.conf as papiconf
import ai4papi.nomad.common as nomad
from ai4papi.routers import v1


router = APIRouter(
@@ -272,6 +273,35 @@ def create_deployment(
    if not user_conf['hardware']['gpu_type']:
        usertask['Resources']['Devices'][0]['Constraints'] = None

    # If the image belongs to Harbor, then it's a user snapshot
    docker_image = user_conf['general']['docker_image']
    if docker_image.split('/')[0] == "registry.services.ai4os.eu":

        # Check the user is the owner of the image
        if docker_image.split('/')[-1] != auth_info['id'].replace('@', '_at_'):
            raise HTTPException(
                status_code=401,
                detail="You are not the owner of the Harbor image.",
            )

        # Check the snapshot indeed exists
        user_snapshots = v1.snapshots.get_harbor_snapshots(
            owner=auth_info['id'],
            vo=vo,
        )
        snapshot_ids = [s['snapshot_ID'] for s in user_snapshots]
        if user_conf['general']['docker_tag'] not in snapshot_ids:
            raise HTTPException(
                status_code=400,
                detail="The snapshot does not exist.",
            )

        # Add Harbor authentication credentials to Nomad job
        usertask['Config']['auth'] = [{
            'username': papiconf.HARBOR_USER,
            'password': papiconf.HARBOR_PASS,
        }]

    # If storage credentials not provided, remove all storage-related tasks
    rclone = {k: v for k, v in user_conf['storage'].items() if k.startswith('rclone')}
    if not all(rclone.values()):
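Putting the deployment-side check together: a deployment uses a snapshot when its Docker image points at the Harbor registry, the last path segment matches the caller's mangled identity, and the tag matches one of the caller's existing snapshot IDs. The fragment below is a hypothetical configuration that would take that branch; the middle segment of the image path and the tag value are illustrative assumptions.

# Hypothetical deployment configuration fragment (only docker_image/docker_tag
# are taken from the diff; the project segment and tag value are made up).
user_conf = {
    "general": {
        "docker_image": "registry.services.ai4os.eu/user-snapshots/user_at_example.com",
        "docker_tag": "snapshot_20241125",  # must be one of the caller's snapshot_IDs
    },
}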
