feat: support Nomad info queries from the provenance pipeline (#75)

* feat: add support for provenance pipeline queries
* test: improve token in tests
ai4os · Nov 25, 2024 · a3cf0aa · a3cf0aa
1 parent 233b38f
commit a3cf0aa
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 1 deletion.
diff --git a/ai4papi/routers/v1/deployments/modules.py b/ai4papi/routers/v1/deployments/modules.py
@@ -22,6 +22,12 @@
 security = HTTPBearer()
 
 
+# Token read from PAPI_PROVENANCE_TOKEN; it is mandatory unless papiconf.IS_DEV is set
+provenance_token = os.environ.get('PAPI_PROVENANCE_TOKEN', None)
+if not papiconf.IS_DEV and not provenance_token:
+    raise Exception("You need to define the variable \"PAPI_PROVENANCE_TOKEN\".")
+
+
 @router.get("")
 def get_deployments(
     vos: Union[Tuple, None] = Query(default=None),
@@ -106,6 +112,10 @@ def get_deployment(
 
     Returns a dict with info
     """
+    # Check if the query comes from the provenance-workflow, if so search in snapshots
+    if authorization.credentials == provenance_token:
+        return utils.retrieve_from_snapshots(deployment_uuid)
+
     # Retrieve authenticated user info
     auth_info = auth.get_user_info(token=authorization.credentials)
     auth.check_vo_membership(vo, auth_info['vos'])

diff --git a/ai4papi/utils.py b/ai4papi/utils.py
@@ -2,10 +2,12 @@
 Miscellaneous utils
 """
 from datetime import datetime
+import json
+from pathlib import Path
 import os
 import re
 
-from cachetools import cached, TTLCache
+from cachetools import cached, TTLCache, LRUCache
 from fastapi import HTTPException
 import requests
 
@@ -125,3 +127,47 @@ def get_github_info(owner, repo):
         print(f'  [Error] Failed to parse Github repo info: {msg}')
 
     return out
+
+
+@cached(cache=LRUCache(maxsize=20))
+def retrieve_from_snapshots(
+    deployment_uuid: str,
+    ):
+    """
+    Retrieve the deployment info from Nomad periodic snapshots.
+
+    JSON snapshots under "$ACCOUNTING_PTH/snapshots" are scanned from recent to old
+    and the first job with a matching "job_ID" and "running" status is returned,
+    with "namespace" and an approximate "alloc_end" added. Scanning every JSON is
+    crude (a database should replace it after the "ai4-accounting" refactor) but
+    this is rarely called and results are cached, so do not mutate the returned dict.
+
+    Raises HTTPException: 500 if ACCOUNTING_PTH is unset, 404 if nothing matches.
+    """
+    main_dir = os.environ.get('ACCOUNTING_PTH', None)
+    if not main_dir:
+        raise HTTPException(
+            status_code=500,
+            detail="Accounting repo with snapshots not available.",
+            )
+    snapshot_dir = Path(main_dir) / 'snapshots'
+
+    for snapshot_pth in sorted(snapshot_dir.glob('**/*.json'), reverse=True):
+        try:
+            with open(snapshot_pth, 'r', encoding='utf-8') as f:
+                snapshot = json.load(f)
+        except (OSError, ValueError) as e:  # unreadable or half-written snapshot
+            print(f'  [Error] Failed to load snapshot {snapshot_pth}: {e}')
+            continue
+
+        # Iterate over deployments until we find the correct one
+        for namespace, jobs in snapshot.items():
+            for job in jobs:
+                if job.get('job_ID') == deployment_uuid and job.get('status') == 'running':
+                    # End date is approximate (true value lies between this snapshot and next)
+                    return {**job, 'namespace': namespace, 'alloc_end': f'{snapshot_pth.stem}0000Z'}
+
+    raise HTTPException(
+        status_code=404,
+        detail="Could not find the deployment in the database."
+        )
diff --git a/tests/deployments/modules.py b/tests/deployments/modules.py
@@ -97,4 +97,14 @@
 )
 assert not any([d['job_ID']==rcreate['job_ID'] for d in rdeps3])
 
+# Check that we are able to retrieve info from Nomad snapshots (provenance)
+modules.provenance_token = '1234'
+prov_uuid = 'de0599d6-a1b9-11ef-b98d-0242ac120005'
+r_prov = modules.get_deployment(
+    vo='',
+    deployment_uuid=prov_uuid,
+    authorization=SimpleNamespace(credentials='1234'),
+)
+assert r_prov['job_ID'] == prov_uuid  # the snapshot lookup must return the requested job
+
 print('Deployments (modules) tests passed!')