Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
matiasb committed Jan 10, 2025
1 parent 6e75374 commit daadebc
Show file tree
Hide file tree
Showing 9 changed files with 351 additions and 0 deletions.
61 changes: 61 additions & 0 deletions engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
send_alert_group_signal_for_delete,
unsilence_task,
)
from apps.grafana_plugin.helpers import GrafanaAPIClient
from apps.grafana_plugin.ui_url_builder import UIURLBuilder
from apps.metrics_exporter.tasks import update_metrics_for_alert_group
from apps.slack.slack_formatter import SlackFormatter
Expand Down Expand Up @@ -573,6 +574,66 @@ def declare_incident_link(self) -> str:
def happened_while_maintenance(self):
    """Tell whether this alert group has a root alert group carrying a maintenance UUID."""
    root = self.root_alert_group
    if root is None:
        return False
    return root.maintenance_uuid is not None

def get_dependent_services(self, affected_only: bool = False) -> typing.List[str]:
    """Return the names of the services that depend on this alert group's service.

    The service of this alert group is read from its ``service_name`` label; the
    dependent services are then fetched through the Grafana API client.

    Args:
        affected_only: when True, keep only the dependent services that have at
            least one unresolved, unsilenced alert group in the same organization
            which started at or after this alert group and within the last 7 days.

    Returns:
        A list of service names, in the order returned by the API. The list is
        empty when the alert group has no ``service_name`` label or when the API
        call does not return a 200 status code.
    """
    SERVICE_LABEL = "service_name"
    organization = self.channel.organization
    service_label = self.labels.filter(key_name=SERVICE_LABEL).first()

    if not service_label:
        return []

    service_name = service_label.value_name
    # query dependent services via aggregated API server
    grafana_api_client = GrafanaAPIClient(api_url=organization.grafana_url, api_token=organization.api_token)
    response_data, call_status = grafana_api_client.get_services_depending_on(
        service_name, stack_id=organization.stack_id
    )
    if call_status["status_code"] != 200:
        # check additional logs from Grafana API client
        logger.info(f"get_dependent_services for alert_group {self.pk} failed")
        return []

    # the payload may be missing, or carry `"items": null` instead of an empty list;
    # `.get("items", [])` would hand back None in that case
    items = (response_data.get("items") if isinstance(response_data, dict) else None) or []
    deps = []
    for item in items:
        try:
            deps.append(item["spec"]["from"]["ref"]["name"])
        except (KeyError, TypeError):
            # a single malformed relation should not break the whole lookup
            logger.info(f"get_dependent_services for alert_group {self.pk} skipped a malformed relation")

    if not affected_only:
        return deps

    # loop-invariant values: compute them once instead of once per dependency
    recent_threshold = timezone.now() - timezone.timedelta(days=7)
    # TODO: maybe we don't need this?
    force_index = settings.ALERT_GROUPS_DISABLE_PREFER_ORDERING_INDEX
    if force_index:
        # workaround related to MySQL "ORDER BY LIMIT Query Optimizer Bug"
        # read more: https://hackmysql.com/infamous-order-by-limit-query-optimizer-bug/
        from django_mysql.models import add_QuerySetMixin

    # check for dependent services with active alert groups
    affected_deps = []
    for dep_service_name in deps:
        queryset = AlertGroup.objects.filter(
            channel__organization=organization,
            labels__organization=organization,
            labels__key_name=SERVICE_LABEL,
            labels__value_name=dep_service_name,
            # check for firing and acknowledged alert groups
            resolved=False,
            silenced=False,
            # TODO: is root?
            # TODO: review this period (started after this one? last week? both?) and add tests
            started_at__gte=self.started_at,
            started_at__gt=recent_threshold,
        )
        if force_index:
            queryset = add_QuerySetMixin(queryset)
            queryset = queryset.force_index("alert_group_list_index")

        if queryset.exists():
            affected_deps.append(dep_service_name)

    return affected_deps

def get_paged_users(self) -> typing.List[PagedUser]:
from apps.alerts.models import AlertGroupLogRecord

Expand Down
141 changes: 141 additions & 0 deletions engine/apps/alerts/tests/test_alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,3 +867,144 @@ def test_slack_channel_id_no_slack_message_no_channel_filter(

# Assert that slack_channel_id is None
assert alert_group.slack_channel_id is None


@pytest.mark.django_db
def test_alert_group_dependent_services_failed_api_call(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_alert_group_label_association,
):
    """A non-200 answer from the relations API results in an empty service list."""
    org = make_organization()
    channel = make_alert_receive_channel(org)
    alert_group = make_alert_group(channel)
    # the alert group needs a service label so that the API is actually queried
    make_alert_group_label_association(org, alert_group, key_name="service_name", value_name="service-a")

    failed_call = (None, {"status_code": 500})
    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=failed_call,
    ):
        services = alert_group.get_dependent_services()

    assert services == []


@pytest.mark.django_db
def test_alert_group_dependent_services_no_service_set(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
):
    """Without a service name label there are no dependent services to report."""
    channel = make_alert_receive_channel(make_organization())
    # deliberately no service_name label on this alert group
    alert_group = make_alert_group(channel)

    assert alert_group.get_dependent_services() == []


@pytest.mark.django_db
def test_alert_group_dependent_services_all(
    make_organization,
    make_alert_receive_channel,
    make_alert_group,
    make_alert_group_label_association,
):
    """Every dependent service returned by the API is listed when no filtering is requested."""
    org = make_organization()
    channel = make_alert_receive_channel(org)
    alert_group = make_alert_group(channel)
    # set service name label
    make_alert_group_label_association(org, alert_group, key_name="service_name", value_name="service-a")

    # one "depends-on" relation towards service-a per suffix
    relations = []
    for suffix in ("b", "c"):
        relations.append(
            {
                "apiVersion": "gamma.ext.grafana.com/v1alpha1",
                "kind": "Relation",
                "metadata": {},
                "spec": {
                    "from": {
                        "ref": {"kind": "Component", "name": f"service-{suffix}"},
                        "type": "depends-on",
                    },
                    "relationType": "dependency",
                    "to": {
                        "ref": {"kind": "Component", "name": "service-a"},
                        "type": "dependency-of",
                    },
                },
            }
        )
    relation_list = {
        "apiVersion": "gamma.ext.grafana.com/v1alpha1",
        "items": relations,
        "kind": "RelationList",
        "metadata": {"continue": "", "resourceVersion": "15552"},
    }

    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=(relation_list, {"status_code": 200}),
    ):
        services = alert_group.get_dependent_services()

    assert services == ["service-b", "service-c"]


# Checks that get_dependent_services(affected_only=True) keeps only the dependent
# services whose alert group is in the FIRING or ACKNOWLEDGED state.
@pytest.mark.django_db
def test_alert_group_dependent_services_filter_affected(
make_organization,
make_alert_receive_channel,
make_alert_group,
make_alert_group_label_association,
):
organization = make_organization()
alert_receive_channel = make_alert_receive_channel(organization)
alert_group = make_alert_group(alert_receive_channel)
# set service name label
make_alert_group_label_association(organization, alert_group, key_name="service_name", value_name="service-a")

# states that count as "affected"; services in any other state must be filtered out
affected_states = {AlertGroupState.FIRING, AlertGroupState.ACKNOWLEDGED}
expected_services = []
# create one alert group per AlertGroupState member, each labelled "service-<index>"
for i, state in enumerate(AlertGroupState):
ag = make_alert_group(alert_receive_channel)
# non-firing states are applied by setting the attribute named after the lowercased state
# NOTE(review): presumably each state value matches a boolean model field name - confirm
if state != AlertGroupState.FIRING:
setattr(ag, state.lower(), True)
ag.save()
# set service name label
service_name = f"service-{i}"
make_alert_group_label_association(organization, ag, key_name="service_name", value_name=service_name)
if state in affected_states:
expected_services.append(service_name)

# mocked relations payload: every "service-<index>" created above depends on service-a
mock_related_services_response_data = {
"apiVersion": "gamma.ext.grafana.com/v1alpha1",
"items": [
{
"apiVersion": "gamma.ext.grafana.com/v1alpha1",
"kind": "Relation",
"metadata": {},
"spec": {
"from": {
"ref": {"kind": "Component", "name": f"service-{i}"},
"type": "depends-on",
},
"relationType": "dependency",
"to": {
"ref": {"kind": "Component", "name": "service-a"},
"type": "dependency-of",
},
},
}
for i in range(len(AlertGroupState))
],
"kind": "RelationList",
"metadata": {"continue": "", "resourceVersion": "15552"},
}
with patch(
"apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on"
) as mock_get_services_depending_on:
mock_get_services_depending_on.return_value = (mock_related_services_response_data, {"status_code": 200})
services = alert_group.get_dependent_services(affected_only=True)
assert len(services) == len(affected_states)
# the comparison is order-sensitive: results are expected in the order the API listed them
assert services == expected_services
64 changes: 64 additions & 0 deletions engine/apps/api/tests/test_alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2413,3 +2413,67 @@ def test_filter_default_started_at(
)
assert response.status_code == status.HTTP_200_OK
assert response.json()["pk"] == old_alert_group.public_primary_key


@pytest.mark.django_db
def test_alert_group_affected_services(
    alert_group_internal_api_setup,
    make_user_for_organization,
    make_user_auth_headers,
    make_alert_group_label_association,
):
    """The related-affected-services endpoint lists only dependent services with active alert groups."""
    _, token, alert_groups = alert_group_internal_api_setup
    resolved_ag, ack_ag, new_ag, silenced_ag = alert_groups
    organization = new_ag.channel.organization
    user = make_user_for_organization(organization)

    # make sure the firing alert group started before the others
    new_ag.started_at = timezone.now() - timezone.timedelta(days=1)
    new_ag.save(update_fields=["started_at"])

    # the firing alert group belongs to service-a, every other alert group gets its own service
    service_by_alert_group = (
        (new_ag, "service-a"),
        (ack_ag, "service-2"),
        (resolved_ag, "service-3"),
        (silenced_ag, "service-4"),
    )
    for labelled_ag, labelled_service in service_by_alert_group:
        make_alert_group_label_association(
            organization, labelled_ag, key_name="service_name", value_name=labelled_service
        )

    client = APIClient()
    url = reverse("api-internal:alertgroup-related-affected-services", kwargs={"pk": new_ag.public_primary_key})

    # service-0 .. service-4 all declare a dependency on service-a
    relations = []
    for index in range(5):
        relations.append(
            {
                "apiVersion": "gamma.ext.grafana.com/v1alpha1",
                "kind": "Relation",
                "metadata": {},
                "spec": {
                    "from": {
                        "ref": {"kind": "Component", "name": f"service-{index}"},
                        "type": "depends-on",
                    },
                    "relationType": "dependency",
                    "to": {
                        "ref": {"kind": "Component", "name": "service-a"},
                        "type": "dependency-of",
                    },
                },
            }
        )
    relation_list = {
        "apiVersion": "gamma.ext.grafana.com/v1alpha1",
        "items": relations,
        "kind": "RelationList",
        "metadata": {"continue": "", "resourceVersion": "15552"},
    }

    with patch(
        "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on",
        return_value=(relation_list, {"status_code": 200}),
    ):
        response = client.get(url, format="json", **make_user_auth_headers(user, token))

    assert response.status_code == status.HTTP_200_OK
    # only the acknowledged alert group's service is reported
    assert response.json() == [
        {
            "name": "service-2",
            "service_url": "a/grafana-slo-app/service/service-2",
            "alert_groups_url": "a/grafana-oncall-app/alert-groups?status=0&status=1&started_at=now-30d_now&label=service_name:service-2",
        }
    ]
30 changes: 30 additions & 0 deletions engine/apps/api/views/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from apps.api.serializers.team import TeamSerializer
from apps.auth_token.auth import PluginAuthentication
from apps.base.models.user_notification_policy_log_record import UserNotificationPolicyLogRecord
from apps.grafana_plugin.ui_url_builder import UIURLBuilder
from apps.labels.utils import is_labels_feature_enabled
from apps.mobile_app.auth import MobileAppAuthTokenAuthentication
from apps.user_management.models import Team, User
Expand Down Expand Up @@ -283,6 +284,7 @@ class AlertGroupView(
"bulk_action": [RBACPermission.Permissions.ALERT_GROUPS_WRITE],
"preview_template": [RBACPermission.Permissions.INTEGRATIONS_TEST],
"escalation_snapshot": [RBACPermission.Permissions.ALERT_GROUPS_READ],
"related_affected_services": [RBACPermission.Permissions.ALERT_GROUPS_READ],
}

queryset = AlertGroup.objects.none() # needed for drf-spectacular introspection
Expand Down Expand Up @@ -881,3 +883,31 @@ def escalation_snapshot(self, request, pk=None):
escalation_snapshot = alert_group.escalation_snapshot
result = AlertGroupEscalationSnapshotAPISerializer(escalation_snapshot).data if escalation_snapshot else {}
return Response(result)

@extend_schema(
    responses=inline_serializer(
        name="RelatedAffectedServices",
        fields={
            "name": serializers.CharField(),
            "service_url": serializers.CharField(),
            "alert_groups_url": serializers.CharField(),
        },
        many=True,
    )
)
@action(methods=["get"], detail=True)
def related_affected_services(self, request, pk=None):
    """List the dependent services of this alert group that have active alert groups.

    Each entry of the response carries the service name, a link to the service
    page and a link to the alert groups list filtered by that service.
    """
    from urllib.parse import quote

    alert_group = self.get_object()
    affected_deps = alert_group.get_dependent_services(affected_only=True)
    url_builder = UIURLBuilder(alert_group.channel.organization)

    result = []
    for service_name in affected_deps:
        # percent-encode the name so that characters such as "&", "#" or spaces
        # cannot break the query string; plain names are left untouched
        encoded_service_name = quote(service_name, safe="")
        result.append(
            {
                "name": service_name,
                "service_url": url_builder.service_page(service_name),
                "alert_groups_url": url_builder.alert_groups(
                    f"?status=0&status=1&started_at=now-30d_now&label=service_name:{encoded_service_name}"
                ),
            }
        )
    return Response(result)
11 changes: 11 additions & 0 deletions engine/apps/grafana_plugin/helpers/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,17 @@ def validate_grafana_token_format(grafana_token: str) -> bool:
return False
return True

def get_services_depending_on(self, service_name, stack_id=None):
    """Fetch the relations whose dependency target is the given service.

    The namespace is "stacks-<stack_id>" when running under the cloud license
    with a stack id, and "default" otherwise. Returns whatever ``api_get`` returns.
    """
    is_cloud_stack = settings.LICENSE == settings.CLOUD_LICENSE_NAME and stack_id
    namespace = f"stacks-{stack_id}" if is_cloud_stack else "default"
    # NOTE(review): service_name is placed in the field selector as-is; a name containing
    # "," or "=" would alter the selector - confirm names are validated upstream
    field_selector = f"spec.to.type=dependency-of,spec.to.ref.name={service_name}"
    relations_path = f"/apis/gamma.ext.grafana.com/v1alpha1/namespaces/{namespace}/relations"
    return self.api_get(relations_path, params={"fieldSelector": field_selector})


class GcomAPIClient(APIClient):
ACTIVE_INSTANCE_QUERY = "instances?status=active"
Expand Down
34 changes: 34 additions & 0 deletions engine/apps/grafana_plugin/tests/test_grafana_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,37 @@ def test_it_returns_based_on_status_code_of_head_call(

api_client = GrafanaAPIClient(API_URL, API_TOKEN)
assert api_client.is_rbac_enabled_for_organization() == expected


class TestGetServicesDependingOn:
    """Checks the path and query parameters used to fetch dependent services."""

    @patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.api_get")
    def test_api_call_cloud(self, mock_grafana_api_client_api_get, settings):
        # under the cloud license the namespace is derived from the stack id
        settings.LICENSE = settings.CLOUD_LICENSE_NAME
        service_name, stack_id = "service-foo", 42

        client = GrafanaAPIClient(API_URL, API_TOKEN)
        client.get_services_depending_on(service_name, stack_id=stack_id)

        mock_grafana_api_client_api_get.assert_called_with(
            f"/apis/gamma.ext.grafana.com/v1alpha1/namespaces/stacks-{stack_id}/relations",
            params={"fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}"},
        )

    @patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.api_get")
    def test_api_call_oss(self, mock_grafana_api_client_api_get, settings):
        # under the open source license the stack id is ignored and "default" is used
        settings.LICENSE = settings.OPEN_SOURCE_LICENSE_NAME
        service_name, stack_id = "service-foo", 42

        client = GrafanaAPIClient(API_URL, API_TOKEN)
        client.get_services_depending_on(service_name, stack_id=stack_id)

        mock_grafana_api_client_api_get.assert_called_with(
            "/apis/gamma.ext.grafana.com/v1alpha1/namespaces/default/relations",
            params={"fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}"},
        )
Loading

0 comments on commit daadebc

Please sign in to comment.