Skip to content

Commit

Permalink
feat(stats): properly aggregate cluster resources
Browse files Browse the repository at this point in the history
  • Loading branch information
IgnacioHeredia committed Jun 19, 2024
1 parent 6560a10 commit fd00d14
Showing 1 changed file with 20 additions and 4 deletions.
24 changes: 20 additions & 4 deletions ai4papi/routers/v1/stats/deployments.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ def get_cluster_stats_bg():
"""

resources = [
'jobs_num',
'cpu_total',
'cpu_used',
'gpu_total',
Expand All @@ -224,7 +225,7 @@ def get_cluster_stats_bg():
'datacenters' : datacenters, # aggregated datacenter usage
'cluster': {k: 0 for k in resources}, # aggregated cluster usage
}
stats['cluster']['gpu_models'] = []
stats['cluster']['gpu_models'] = {}

# Load nodes
nodes = Nomad.nodes.get_nodes(resources=True)
Expand Down Expand Up @@ -343,10 +344,25 @@ def get_cluster_stats_bg():
for dc_stats in stats['datacenters'].values():
for n_stats in dc_stats['nodes'].values():
for k, v in n_stats.items():
if k not in ['name', 'jobs_num']:
stats['cluster'][k] += v

stats['cluster']['gpu_models'] = gpu_stats
# Ignore keys
if k in ['name', 'eligibility']:
continue

# Aggregate nested gpu_models dict
elif k == 'gpu_models':
for k1, v1 in v.items():
model_stats = stats['cluster']['gpu_models'].get(
k1,
{'gpu_total': 0, 'gpu_used': 0,} # init value
)
for k2, v2 in v1.items():
model_stats[k2] += v2
stats['cluster']['gpu_models'][k1] = model_stats

# Aggregate other resources
else:
stats['cluster'][k] += v

# Set the new shared variable
global cluster_stats
Expand Down

0 comments on commit fd00d14

Please sign in to comment.