From fd00d1482e676b9f75d5e0c12ea10e4821e54be0 Mon Sep 17 00:00:00 2001 From: Ignacio Heredia Date: Wed, 19 Jun 2024 14:48:57 +0200 Subject: [PATCH] feat(stats): properly aggregate cluster resources --- ai4papi/routers/v1/stats/deployments.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/ai4papi/routers/v1/stats/deployments.py b/ai4papi/routers/v1/stats/deployments.py index b601cb7..c0cc764 100644 --- a/ai4papi/routers/v1/stats/deployments.py +++ b/ai4papi/routers/v1/stats/deployments.py @@ -210,6 +210,7 @@ def get_cluster_stats_bg(): """ resources = [ + 'jobs_num', 'cpu_total', 'cpu_used', 'gpu_total', @@ -224,7 +225,7 @@ def get_cluster_stats_bg(): 'datacenters' : datacenters, # aggregated datacenter usage 'cluster': {k: 0 for k in resources}, # aggregated cluster usage } - stats['cluster']['gpu_models'] = [] + stats['cluster']['gpu_models'] = {} # Load nodes nodes = Nomad.nodes.get_nodes(resources=True) @@ -343,10 +344,25 @@ def get_cluster_stats_bg(): for dc_stats in stats['datacenters'].values(): for n_stats in dc_stats['nodes'].values(): for k, v in n_stats.items(): - if k not in ['name', 'jobs_num']: - stats['cluster'][k] += v - stats['cluster']['gpu_models'] = gpu_stats + # Ignore keys + if k in ['name', 'eligibility']: + continue + + # Aggregate nested gpu_models dict + elif k == 'gpu_models': + for k1, v1 in v.items(): + model_stats = stats['cluster']['gpu_models'].get( + k1, + {'gpu_total': 0, 'gpu_used': 0,} # init value + ) + for k2, v2 in v1.items(): + model_stats[k2] += v2 + stats['cluster']['gpu_models'][k1] = model_stats + + # Aggregate other resources + else: + stats['cluster'][k] += v # Set the new shared variable global cluster_stats