diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 6bcb8d735f..efc732f6e4 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -26,6 +26,7 @@ send_alert_group_signal_for_delete, unsilence_task, ) +from apps.grafana_plugin.helpers import GrafanaAPIClient from apps.grafana_plugin.ui_url_builder import UIURLBuilder from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter @@ -573,6 +574,66 @@ def declare_incident_link(self) -> str: def happened_while_maintenance(self): return self.root_alert_group is not None and self.root_alert_group.maintenance_uuid is not None + def get_dependent_services(self, affected_only: bool = False) -> typing.List[str]: + """Return a service name list of current alert group service dependent services. + + Service name is extracted from current alert group labels. + If affected_only is True, return only dependent services with active alert groups. + """ + SERVICE_LABEL = "service_name" + affected_deps = [] + organization = self.channel.organization + service_label = self.labels.filter(key_name=SERVICE_LABEL).first() + + if not service_label: + return affected_deps + + service_name = service_label.value_name + # query dependent services via aggregated API server + grafana_api_client = GrafanaAPIClient(api_url=organization.grafana_url, api_token=organization.api_token) + response_data, call_status = grafana_api_client.get_services_depending_on( + service_name, stack_id=organization.stack_id + ) + if call_status["status_code"] != 200: + # check additional logs from Grafana API client + logger.info(f"get_dependent_services for alert_group {self.pk} failed") + return affected_deps + deps = [s["spec"]["from"]["ref"]["name"] for s in response_data.get("items", [])] + + if not affected_only: + return deps + + # check for dependent services with active alert groups + for dep_service_name in deps: + queryset = AlertGroup.objects.filter( + channel__organization=organization, + labels__organization=organization, + labels__key_name=SERVICE_LABEL, + labels__value_name=dep_service_name, + # check for firing and acknowledged alert groups + resolved=False, + silenced=False, + # TODO: is root? + # TODO: review this period? started around this one? + # eg. started after? last week? both? tests! + # started_at__gte=timezone.now() - timezone.timedelta(days=30), + started_at__gte=self.started_at, + started_at__gt=timezone.now() - timezone.timedelta(days=7), + ) + # TODO: maybe we don't need this? + if settings.ALERT_GROUPS_DISABLE_PREFER_ORDERING_INDEX: + # workaround related to MySQL "ORDER BY LIMIT Query Optimizer Bug" + # read more: https://hackmysql.com/infamous-order-by-limit-query-optimizer-bug/ + from django_mysql.models import add_QuerySetMixin + + queryset = add_QuerySetMixin(queryset) + queryset = queryset.force_index("alert_group_list_index") + + if queryset.exists(): + affected_deps.append(dep_service_name) + + return affected_deps + def get_paged_users(self) -> typing.List[PagedUser]: from apps.alerts.models import AlertGroupLogRecord diff --git a/engine/apps/alerts/tests/test_alert_group.py b/engine/apps/alerts/tests/test_alert_group.py index a29ae9caa2..2e9e5ed823 100644 --- a/engine/apps/alerts/tests/test_alert_group.py +++ b/engine/apps/alerts/tests/test_alert_group.py @@ -867,3 +867,144 @@ def test_slack_channel_id_no_slack_message_no_channel_filter( # Assert that slack_channel_id is None assert alert_group.slack_channel_id is None + + +@pytest.mark.django_db +def test_alert_group_dependent_services_failed_api_call( + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert_group_label_association, +): + organization = make_organization() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + # set service name label + make_alert_group_label_association(organization, alert_group, key_name="service_name", value_name="service-a") + + with patch( + "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on" + ) as mock_get_services_depending_on: + mock_get_services_depending_on.return_value = (None, {"status_code": 500}) + services = alert_group.get_dependent_services() + assert services == [] + + +@pytest.mark.django_db +def test_alert_group_dependent_services_no_service_set( + make_organization, + make_alert_receive_channel, + make_alert_group, +): + organization = make_organization() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + # no service name label set + + services = alert_group.get_dependent_services() + assert services == [] + + +@pytest.mark.django_db +def test_alert_group_dependent_services_all( + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert_group_label_association, +): + organization = make_organization() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + # set service name label + make_alert_group_label_association(organization, alert_group, key_name="service_name", value_name="service-a") + + mock_related_services_response_data = { + "apiVersion": "gamma.ext.grafana.com/v1alpha1", + "items": [ + { + "apiVersion": "gamma.ext.grafana.com/v1alpha1", + "kind": "Relation", + "metadata": {}, + "spec": { + "from": { + "ref": {"kind": "Component", "name": f"service-{i}"}, + "type": "depends-on", + }, + "relationType": "dependency", + "to": { + "ref": {"kind": "Component", "name": "service-a"}, + "type": "dependency-of", + }, + }, + } + for i in ("b", "c") + ], + "kind": "RelationList", + "metadata": {"continue": "", "resourceVersion": "15552"}, + } + + with patch( + "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on" + ) as mock_get_services_depending_on: + mock_get_services_depending_on.return_value = (mock_related_services_response_data, {"status_code": 200}) + services = alert_group.get_dependent_services() + assert services == ["service-b", "service-c"] + + +@pytest.mark.django_db +def test_alert_group_dependent_services_filter_affected( + make_organization, + make_alert_receive_channel, + make_alert_group, + make_alert_group_label_association, +): + organization = make_organization() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + # set service name label + make_alert_group_label_association(organization, alert_group, key_name="service_name", value_name="service-a") + + affected_states = {AlertGroupState.FIRING, AlertGroupState.ACKNOWLEDGED} + expected_services = [] + for i, state in enumerate(AlertGroupState): + ag = make_alert_group(alert_receive_channel) + if state != AlertGroupState.FIRING: + setattr(ag, state.lower(), True) + ag.save() + # set service name label + service_name = f"service-{i}" + make_alert_group_label_association(organization, ag, key_name="service_name", value_name=service_name) + if state in affected_states: + expected_services.append(service_name) + + mock_related_services_response_data = { + "apiVersion": "gamma.ext.grafana.com/v1alpha1", + "items": [ + { + "apiVersion": "gamma.ext.grafana.com/v1alpha1", + "kind": "Relation", + "metadata": {}, + "spec": { + "from": { + "ref": {"kind": "Component", "name": f"service-{i}"}, + "type": "depends-on", + }, + "relationType": "dependency", + "to": { + "ref": {"kind": "Component", "name": "service-a"}, + "type": "dependency-of", + }, + }, + } + for i in range(len(AlertGroupState)) + ], + "kind": "RelationList", + "metadata": {"continue": "", "resourceVersion": "15552"}, + } + with patch( + "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on" + ) as mock_get_services_depending_on: + mock_get_services_depending_on.return_value = (mock_related_services_response_data, {"status_code": 200}) + services = alert_group.get_dependent_services(affected_only=True) + assert len(services) == len(affected_states) + assert services == expected_services diff --git a/engine/apps/api/tests/test_alert_group.py b/engine/apps/api/tests/test_alert_group.py index 8ee438b6bc..cc91bd24ae 100644 --- a/engine/apps/api/tests/test_alert_group.py +++ b/engine/apps/api/tests/test_alert_group.py @@ -2413,3 +2413,67 @@ def test_filter_default_started_at( ) assert response.status_code == status.HTTP_200_OK assert response.json()["pk"] == old_alert_group.public_primary_key + + +@pytest.mark.django_db +def test_alert_group_affected_services( + alert_group_internal_api_setup, + make_user_for_organization, + make_user_auth_headers, + make_alert_group_label_association, +): + _, token, alert_groups = alert_group_internal_api_setup + resolved_ag, ack_ag, new_ag, silenced_ag = alert_groups + organization = new_ag.channel.organization + user = make_user_for_organization(organization) + + # make sure the firing alert group started before the others + new_ag.started_at = timezone.now() - timezone.timedelta(days=1) + new_ag.save(update_fields=["started_at"]) + # set firing alert group service label + make_alert_group_label_association(organization, new_ag, key_name="service_name", value_name="service-a") + # set other service name labels for other alert groups + make_alert_group_label_association(organization, ack_ag, key_name="service_name", value_name="service-2") + make_alert_group_label_association(organization, resolved_ag, key_name="service_name", value_name="service-3") + make_alert_group_label_association(organization, silenced_ag, key_name="service_name", value_name="service-4") + + client = APIClient() + url = reverse("api-internal:alertgroup-related-affected-services", kwargs={"pk": new_ag.public_primary_key}) + + mock_related_services_response_data = { + "apiVersion": "gamma.ext.grafana.com/v1alpha1", + "items": [ + { + "apiVersion": "gamma.ext.grafana.com/v1alpha1", + "kind": "Relation", + "metadata": {}, + "spec": { + "from": { + "ref": {"kind": "Component", "name": f"service-{i}"}, + "type": "depends-on", + }, + "relationType": "dependency", + "to": { + "ref": {"kind": "Component", "name": "service-a"}, + "type": "dependency-of", + }, + }, + } + for i in range(5) + ], + "kind": "RelationList", + "metadata": {"continue": "", "resourceVersion": "15552"}, + } + with patch( + "apps.grafana_plugin.helpers.GrafanaAPIClient.get_services_depending_on" + ) as mock_get_services_depending_on: + mock_get_services_depending_on.return_value = (mock_related_services_response_data, {"status_code": 200}) + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + + assert response.status_code == status.HTTP_200_OK + expected = { + "name": "service-2", + "service_url": "a/grafana-slo-app/service/service-2", + "alert_groups_url": "a/grafana-oncall-app/alert-groups?status=0&status=1&started_at=now-30d_now&label=service_name:service-2", + } + assert response.json() == [expected] diff --git a/engine/apps/api/views/alert_group.py b/engine/apps/api/views/alert_group.py index 117fb9ce9d..2656012e08 100644 --- a/engine/apps/api/views/alert_group.py +++ b/engine/apps/api/views/alert_group.py @@ -26,6 +26,7 @@ from apps.api.serializers.team import TeamSerializer from apps.auth_token.auth import PluginAuthentication from apps.base.models.user_notification_policy_log_record import UserNotificationPolicyLogRecord +from apps.grafana_plugin.ui_url_builder import UIURLBuilder from apps.labels.utils import is_labels_feature_enabled from apps.mobile_app.auth import MobileAppAuthTokenAuthentication from apps.user_management.models import Team, User @@ -283,6 +284,7 @@ class AlertGroupView( "bulk_action": [RBACPermission.Permissions.ALERT_GROUPS_WRITE], "preview_template": [RBACPermission.Permissions.INTEGRATIONS_TEST], "escalation_snapshot": [RBACPermission.Permissions.ALERT_GROUPS_READ], + "related_affected_services": [RBACPermission.Permissions.ALERT_GROUPS_READ], } queryset = AlertGroup.objects.none() # needed for drf-spectacular introspection @@ -881,3 +883,31 @@ def escalation_snapshot(self, request, pk=None): escalation_snapshot = alert_group.escalation_snapshot result = AlertGroupEscalationSnapshotAPISerializer(escalation_snapshot).data if escalation_snapshot else {} return Response(result) + + @extend_schema( + responses=inline_serializer( + name="RelatedAffectedServices", + fields={ + "name": serializers.CharField(), + "service_url": serializers.CharField(), + "alert_groups_url": serializers.CharField(), + }, + many=True, + ) + ) + @action(methods=["get"], detail=True) + def related_affected_services(self, request, pk=None): + alert_group = self.get_object() + affected_deps = alert_group.get_dependent_services(affected_only=True) + url_builder = UIURLBuilder(alert_group.channel.organization) + result = [ + { + "name": service_name, + "service_url": url_builder.service_page(service_name), + "alert_groups_url": url_builder.alert_groups( + f"?status=0&status=1&started_at=now-30d_now&label=service_name:{service_name}" + ), + } + for service_name in affected_deps + ] + return Response(result) diff --git a/engine/apps/grafana_plugin/helpers/client.py b/engine/apps/grafana_plugin/helpers/client.py index 0037cb7410..f6539c0908 100644 --- a/engine/apps/grafana_plugin/helpers/client.py +++ b/engine/apps/grafana_plugin/helpers/client.py @@ -351,6 +351,17 @@ def validate_grafana_token_format(grafana_token: str) -> bool: return False return True + def get_services_depending_on(self, service_name, stack_id=None): + namespace = "default" + if settings.LICENSE == settings.CLOUD_LICENSE_NAME and stack_id: + namespace = f"stacks-{stack_id}" + query_params = { + "fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}", + } + return self.api_get( + f"/apis/gamma.ext.grafana.com/v1alpha1/namespaces/{namespace}/relations", params=query_params + ) + class GcomAPIClient(APIClient): ACTIVE_INSTANCE_QUERY = "instances?status=active" diff --git a/engine/apps/grafana_plugin/tests/test_grafana_api_client.py b/engine/apps/grafana_plugin/tests/test_grafana_api_client.py index 2718624603..71b7825d5f 100644 --- a/engine/apps/grafana_plugin/tests/test_grafana_api_client.py +++ b/engine/apps/grafana_plugin/tests/test_grafana_api_client.py @@ -135,3 +135,37 @@ def test_it_returns_based_on_status_code_of_head_call( api_client = GrafanaAPIClient(API_URL, API_TOKEN) assert api_client.is_rbac_enabled_for_organization() == expected + + +class TestGetServicesDependingOn: + @patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.api_get") + def test_api_call_cloud(self, mock_grafana_api_client_api_get, settings): + settings.LICENSE = settings.CLOUD_LICENSE_NAME + api_client = GrafanaAPIClient(API_URL, API_TOKEN) + + service_name = "service-foo" + stack_id = 42 + api_client.get_services_depending_on(service_name, stack_id=stack_id) + expected_params = { + "fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}", + } + mock_grafana_api_client_api_get.assert_called_with( + f"/apis/gamma.ext.grafana.com/v1alpha1/namespaces/stacks-{stack_id}/relations", + params=expected_params, + ) + + @patch("apps.grafana_plugin.helpers.client.GrafanaAPIClient.api_get") + def test_api_call_oss(self, mock_grafana_api_client_api_get, settings): + settings.LICENSE = settings.OPEN_SOURCE_LICENSE_NAME + api_client = GrafanaAPIClient(API_URL, API_TOKEN) + + service_name = "service-foo" + stack_id = 42 + api_client.get_services_depending_on(service_name, stack_id=stack_id) + expected_params = { + "fieldSelector": f"spec.to.type=dependency-of,spec.to.ref.name={service_name}", + } + mock_grafana_api_client_api_get.assert_called_with( + "/apis/gamma.ext.grafana.com/v1alpha1/namespaces/default/relations", + params=expected_params, + ) diff --git a/engine/apps/grafana_plugin/tests/test_ui_url_builder.py b/engine/apps/grafana_plugin/tests/test_ui_url_builder.py index dad9687721..b55b1c4caa 100644 --- a/engine/apps/grafana_plugin/tests/test_ui_url_builder.py +++ b/engine/apps/grafana_plugin/tests/test_ui_url_builder.py @@ -103,3 +103,9 @@ def test_build_url_overriden_base_url(org_setup): @pytest.mark.django_db def test_build_url_works_for_irm_and_oncall_plugins(org_setup, is_grafana_irm_enabled, expected_url): assert UIURLBuilder(org_setup(is_grafana_irm_enabled)).alert_group_detail(ALERT_GROUP_ID) == expected_url + + +@pytest.mark.django_db +def test_build_url_service_detail_page(org_setup): + builder = UIURLBuilder(org_setup()) + assert builder.service_page("service-a") == f"{GRAFANA_URL}/a/{PluginID.SLO}/service/service-a" diff --git a/engine/apps/grafana_plugin/ui_url_builder.py b/engine/apps/grafana_plugin/ui_url_builder.py index e37f8e7542..5b4d6e6417 100644 --- a/engine/apps/grafana_plugin/ui_url_builder.py +++ b/engine/apps/grafana_plugin/ui_url_builder.py @@ -56,3 +56,6 @@ def settings(self, path_extra: str = "") -> str: def declare_incident(self, path_extra: str = "") -> str: return self._build_url("incidents/declare", path_extra, plugin_id=PluginID.INCIDENT) + + def service_page(self, service_name: str, path_extra: str = "") -> str: + return self._build_url(f"service/{service_name}", path_extra, plugin_id=PluginID.SLO) diff --git a/engine/common/constants/plugin_ids.py b/engine/common/constants/plugin_ids.py index 666c30e21c..db6ff6a2f1 100644 --- a/engine/common/constants/plugin_ids.py +++ b/engine/common/constants/plugin_ids.py @@ -5,3 +5,4 @@ class PluginID: INCIDENT = "grafana-incident-app" LABELS = "grafana-labels-app" ML = "grafana-ml-app" + SLO = "grafana-slo-app"