Skip to content

Commit

Permalink
add metrics for IAM role quotas
Browse files Browse the repository at this point in the history
  • Loading branch information
nhudson committed Aug 3, 2024
1 parent b14ba90 commit bd49ad6
Show file tree
Hide file tree
Showing 11 changed files with 236 additions and 17 deletions.
24 changes: 24 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ actix-web = "4.8"
aws-config = "1.5"
aws-sdk-servicequotas = "1.36"
aws-sdk-cloudformation = "1.41"
aws-sdk-iam = "1.38"
tokio = { version = "1", features = ["rt", "sync", "time", "parking_lot"] }
serde_json = "1"
clap = { version = "4.5", features = ["derive"] }
Expand Down
18 changes: 18 additions & 0 deletions charts/tembo-metrics/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,24 @@ prometheusRules:
annotations:
summary: "CloudFormation stack usage critically close to quota"
description: "CloudFormation stack usage is at {{ $value }}% of the quota"
# Warning alert when usage is over 85% of quota
- alert: AWSIAMRoleQuotaWarning
expr: (tembo_iam_role_usage / tembo_iam_role_quota) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "AWS IAM Role usage nearing quota"
description: "AWS IAM role usage is at {{ $value }}% of the quota"
# Critical alert when usage is over 95% of quota
- alert: AWSIAMRoleQuotaCritical
expr: (tembo_iam_role_usage / tembo_iam_role_quota) * 100 > 95
for: 5m
labels:
severity: critical
annotations:
summary: "AWS IAM Role usage critically close to quota"
description: "AWS IAM Role usage is at {{ $value }}% of the quota"

# Kubernetes service configuration
service:
Expand Down
70 changes: 63 additions & 7 deletions sample-aws-iam-policy.json
Original file line number Diff line number Diff line change
@@ -1,23 +1,79 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "Stmt1722444480676",
"Action": [
"cloudformation:ListStacks"
"cloudformation:GetResource",
"cloudformation:GetResourceRequestStatus",
"cloudformation:ListResourceRequests",
"cloudformation:ListResources",
"cloudformation:DescribeStacks",
"cloudformation:ListChangeSets",
"cloudformation:ListExports",
"cloudformation:ListGeneratedTemplates",
"cloudformation:ListImports",
"cloudformation:ListResourceScanRelatedResources",
"cloudformation:ListResourceScanResources",
"cloudformation:ListResourceScans",
"cloudformation:ListStackInstanceResourceDrifts",
"cloudformation:ListStackInstances",
"cloudformation:ListStackResources",
"cloudformation:ListStacks",
"cloudformation:ListStackSetAutoDeploymentTargets",
"cloudformation:ListStackSetOperationResults",
"cloudformation:ListStackSetOperations",
"cloudformation:ListStackSets",
"cloudformation:ListTypeRegistrations",
"cloudformation:ListTypes",
"cloudformation:ListTypeVersions",
"cloudformation:DescribeAccountLimits",
"cloudformation:DescribeStackDriftDetectionStatus",
"cloudformation:DescribeStackEvents",
"cloudformation:DescribeStackInstance",
"cloudformation:DescribeStackResource",
"cloudformation:DescribeStackResourceDrifts",
"cloudformation:DescribeStackResources",
"cloudformation:DescribeStackSet",
"cloudformation:DescribeStackSetOperation",
"cloudformation:DetectStackDrift",
"cloudformation:DetectStackResourceDrift",
"cloudformation:DetectStackSetDrift",
"cloudformation:GetStackPolicy"
],
"Effect": "Allow",
"Resource": "*"
"Resource": "*",
"Sid": "TemboMetricsCFNAccess"
},
{
"Sid": "Stmt1722444523636",
"Action": [
"servicequotas:GetAssociationForServiceQuotaTemplate",
"servicequotas:GetAWSDefaultServiceQuota",
"servicequotas:GetRequestedServiceQuotaChange",
"servicequotas:GetServiceQuota",
"servicequotas:GetServiceQuotaIncreaseRequestFromTemplate",
"servicequotas:ListAWSDefaultServiceQuotas",
"servicequotas:ListRequestedServiceQuotaChangeHistory",
"servicequotas:ListRequestedServiceQuotaChangeHistoryByQuota",
"servicequotas:ListServiceQuotaIncreaseRequestsInTemplate",
"servicequotas:ListServiceQuotas",
"servicequotas:ListServices"
"servicequotas:ListServices",
"servicequotas:ListTagsForResource"
],
"Effect": "Allow",
"Resource": "*",
"Sid": "TemboMetricsServiceQuotasAccess"
},
{
"Sid": "TemboMetricsIAMAccess",
"Effect": "Allow",
"Action": [
"iam:ListRoles",
"iam:ListRoleTags",
"iam:ListRolePolicies",
"iam:GetRole",
"iam:GetRolePolicy"
],
"Resource": "*"
}
]
],
"Version": "2012-10-17"
}
26 changes: 26 additions & 0 deletions src/aws/iam.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
use aws_sdk_iam::Client;

/// Counts every IAM role in the account by walking all pages of `ListRoles`.
///
/// The number of roles on each page is added to a running total. The marker returned with a
/// page is passed to the next request, and the walk ends at the first page that carries no
/// marker.
///
/// # Errors
///
/// Propagates the error from any `ListRoles` request that fails.
pub async fn get_iam_role_count(client: &Client) -> Result<i64, Box<dyn std::error::Error>> {
    let mut total: i64 = 0;
    let mut next_marker: Option<String> = None;

    loop {
        // Only attach a marker when the previous page handed one back.
        let request = match next_marker.take() {
            Some(token) => client.list_roles().marker(token),
            None => client.list_roles(),
        };
        let page = request.send().await?;

        total += page.roles().len() as i64;

        next_marker = page.marker().map(|token| token.to_string());
        if next_marker.is_none() {
            break;
        }
    }

    Ok(total)
}
1 change: 1 addition & 0 deletions src/aws/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod cloudformation;
pub mod iam;
pub mod servicequotas;
18 changes: 18 additions & 0 deletions src/aws/servicequotas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,21 @@ pub async fn get_cf_stack_quota(client: &Client) -> Result<i64, Box<dyn std::err

Ok(quota as i64)
}

/// Fetches the IAM roles-per-account quota from the Service Quotas API as an `i64`.
///
/// `L-FE177D64` is the quota code for the IAM Role Per Account quota, looked up under the
/// `iam` service code.
///
/// # Errors
///
/// Returns an error if the `GetServiceQuota` request fails, or if the response contains no
/// quota or no quota value.
pub async fn get_iam_role_quota(client: &Client) -> Result<i64, Box<dyn std::error::Error>> {
    let resp = client
        .get_service_quota()
        .service_code("iam")
        .quota_code("L-FE177D64")
        .send()
        .await?;

    let quota = resp
        .quota
        .and_then(|q| q.value)
        .ok_or("Failed to get IAM Role quota")?;

    // Convert to the integer type the gauge stores, as get_cf_stack_quota does.
    Ok(quota as i64)
}
16 changes: 16 additions & 0 deletions src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ pub struct Metrics {
pub registry: prometheus::Registry,
pub cf_stack_quota: prometheus::IntGauge,
pub cf_stack_usage: prometheus::IntGauge,
pub iam_role_quota: prometheus::IntGauge,
pub iam_role_usage: prometheus::IntGauge,
}

impl Default for Metrics {
Expand All @@ -24,18 +26,32 @@ impl Metrics {
"AWS CloudFormation stacks in use",
)
.expect("Failed to create cf_stack_usage metric");
let iam_role_quota =
prometheus::IntGauge::new("tembo_iam_role_quota", "AWS IAM role quota (L-FE177D64)")
.expect("Failed to create iam_role_quota metric");
let iam_role_usage =
prometheus::IntGauge::new("tembo_iam_role_usage", "AWS IAM roles in use")
.expect("Failed to create iam_role_usage metric");

registry
.register(Box::new(cf_stack_quota.clone()))
.expect("Failed to register cf_stack_quota metric");
registry
.register(Box::new(cf_stack_usage.clone()))
.expect("Failed to register cf_stack_usage metric");
// Register the IAM gauges; each panic message names the metric whose registration failed.
registry
    .register(Box::new(iam_role_quota.clone()))
    .expect("Failed to register iam_role_quota metric");
registry
    .register(Box::new(iam_role_usage.clone()))
    .expect("Failed to register iam_role_usage metric");

Self {
registry,
cf_stack_quota,
cf_stack_usage,
iam_role_quota,
iam_role_usage,
}
}
}
Empty file removed src/monitoring.rs
Empty file.
59 changes: 49 additions & 10 deletions src/routes/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::aws::{cloudformation, servicequotas};
use crate::aws::{cloudformation, iam, servicequotas};
use crate::state::State;
use actix_web::{get, web, HttpResponse, Responder};
use prometheus::{Encoder, TextEncoder};
Expand Down Expand Up @@ -30,42 +30,81 @@ pub async fn metrics(data: web::Data<Arc<State>>) -> impl Responder {
}

/// Refreshes every exported gauge, one after another.
///
/// The order is: CloudFormation stack quota, CloudFormation stack usage, IAM role quota,
/// IAM role usage.
///
/// # Errors
///
/// Stops at the first refresh that fails and returns its error; later gauges are not
/// updated in that case.
pub async fn set_and_update_metrics(data: &State) -> Result<(), Box<dyn std::error::Error>> {
    update_cf_stack_quota(data).await?;
    update_cf_stack_usage(data).await?;
    update_iam_role_quota(data).await?;
    // The last step already yields the function's result type, so return it directly.
    update_iam_role_usage(data).await
}

// Get CloudFormation stack quota from Service Quotas API
async fn update_cf_stack_quota(data: &State) -> Result<(), Box<dyn std::error::Error>> {
match servicequotas::get_cf_stack_quota(&data.sq_client).await {
Ok(quota) => {
data.metrics.cf_stack_quota.set(quota);
info!("Updated CF stack quota metric: {}", quota);
// Verify the update
info!(
"Verified CF stack quota metric: {}",
data.metrics.cf_stack_quota.get()
);
Ok(())
}
Err(e) => {
error!("Failed to get CF stack quota: {}", e);
return Err(e);
Err(e)
}
}
}

// Get CloudFormation stack count
async fn update_cf_stack_usage(data: &State) -> Result<(), Box<dyn std::error::Error>> {
match cloudformation::get_stack_count(&data.cfn_client).await {
Ok(stack_count) => {
data.metrics.cf_stack_usage.set(stack_count);
info!("Updated CF stack usage metric: {}", stack_count);
// Verify the update
info!(
"Verified CF stack usage metric: {}",
data.metrics.cf_stack_usage.get()
);
Ok(())
}
Err(e) => {
error!("Failed to get CF stack count: {}", e);
return Err(e);
Err(e)
}
}
}

Ok(())
async fn update_iam_role_quota(data: &State) -> Result<(), Box<dyn std::error::Error>> {
match servicequotas::get_iam_role_quota(&data.sq_client).await {
Ok(quota) => {
data.metrics.iam_role_quota.set(quota);
info!("Updated IAM role quota metric: {}", quota);
info!(
"Verified IAM role quota metric: {}",
data.metrics.iam_role_quota.get()
);
Ok(())
}
Err(e) => {
error!("Failed to get IAM role quota: {}", e);
Err(e)
}
}
}

async fn update_iam_role_usage(data: &State) -> Result<(), Box<dyn std::error::Error>> {
match iam::get_iam_role_count(&data.iam_client).await {
Ok(role_count) => {
data.metrics.iam_role_usage.set(role_count);
info!("Updated IAM role usage metric: {}", role_count);
info!(
"Verified IAM role usage metric: {}",
data.metrics.iam_role_usage.get()
);
Ok(())
}
Err(e) => {
error!("Failed to get IAM role count: {}", e);
Err(e)
}
}
}
Loading

0 comments on commit bd49ad6

Please sign in to comment.