diff --git a/docs/sources/oncall-api-reference/alertgroups.md b/docs/sources/oncall-api-reference/alertgroups.md index 9b9d007fd5..33c2d46b6b 100644 --- a/docs/sources/oncall-api-reference/alertgroups.md +++ b/docs/sources/oncall-api-reference/alertgroups.md @@ -46,6 +46,31 @@ The above command returns JSON structured in the following way: "telegram": "https://t.me/c/5354/1234?thread=1234" }, "silenced_at": "2020-05-19T13:37:01.429805Z", + "last_alert": { + "id": "AA74DN7T4JQB6", + "alert_group_id": "I68T24C13IFW1", + "created_at": "2020-05-11T20:08:43Z", + "payload": { + "state": "alerting", + "title": "[Alerting] Test notification", + "ruleId": 0, + "message": "Someone is testing the alert notification within Grafana.", + "ruleUrl": "{{API_URL}}/", + "ruleName": "Test notification", + "evalMatches": [ + { + "tags": null, + "value": 100, + "metric": "High value" + }, + { + "tags": null, + "value": 200, + "metric": "Higher Value" + } + ] + } + }, } ], "current_page_number": 1, diff --git a/engine/apps/alerts/escalation_snapshot/serializers/escalation_policy_snapshot.py b/engine/apps/alerts/escalation_snapshot/serializers/escalation_policy_snapshot.py index 62dfe18f14..b5281b9a49 100644 --- a/engine/apps/alerts/escalation_snapshot/serializers/escalation_policy_snapshot.py +++ b/engine/apps/alerts/escalation_snapshot/serializers/escalation_policy_snapshot.py @@ -77,6 +77,7 @@ class Meta: "to_time", "num_alerts_in_window", "num_minutes_in_window", + "severity", "custom_webhook", "notify_schedule", "notify_to_group", diff --git a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py index 541b5ffbf4..b6a495934a 100644 --- a/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py +++ b/engine/apps/alerts/escalation_snapshot/snapshot_classes/escalation_policy_snapshot.py @@ -12,11 +12,13 @@ from 
apps.alerts.models.escalation_policy import EscalationPolicy from apps.alerts.tasks import ( custom_webhook_result, + declare_incident, notify_all_task, notify_group_task, notify_user_task, resolve_by_last_step_task, ) +from apps.alerts.utils import is_declare_incident_step_enabled from apps.schedules.ical_utils import list_users_to_notify_from_ical from apps.user_management.models import User @@ -40,6 +42,7 @@ class EscalationPolicySnapshot: "notify_schedule", "notify_to_group", "notify_to_team_members", + "severity", "escalation_counter", "passed_last_time", "pause_escalation", @@ -71,6 +74,7 @@ def __init__( passed_last_time, pause_escalation, notify_to_team_members=None, + severity=None, ): self.id = id self.order = order @@ -86,6 +90,7 @@ def __init__( self.notify_schedule = notify_schedule self.notify_to_group = notify_to_group self.notify_to_team_members = notify_to_team_members + self.severity = severity self.escalation_counter = escalation_counter # used for STEP_REPEAT_ESCALATION_N_TIMES self.passed_last_time = passed_last_time # used for building escalation plan self.pause_escalation = pause_escalation # used for STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW @@ -133,6 +138,7 @@ def execute(self, alert_group: "AlertGroup", reason) -> StepExecutionResultData: EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: self._escalation_step_notify_if_num_alerts_in_time_window, EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS: self._escalation_step_notify_multiple_users, EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS_IMPORTANT: self._escalation_step_notify_multiple_users, + EscalationPolicy.STEP_DECLARE_INCIDENT: self._escalation_step_declare_incident, None: self._escalation_step_not_configured, } result = action_map[self.step](alert_group, reason) @@ -407,6 +413,32 @@ def _escalation_step_notify_team_members(self, alert_group: "AlertGroup", reason self._execute_tasks(tasks) + def _escalation_step_declare_incident(self, alert_group: "AlertGroup", _reason: str) -> None: + 
grafana_declare_incident_enabled = is_declare_incident_step_enabled( + organization=alert_group.channel.organization + ) + if not grafana_declare_incident_enabled: + AlertGroupLogRecord( + type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED, + alert_group=alert_group, + reason="Declare Incident step is not enabled", + escalation_policy=self.escalation_policy, + escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED, + escalation_policy_step=self.step, + ).save() + return + tasks = [] + declare_incident_task = declare_incident.signature( + args=(alert_group.pk,), + kwargs={ + "escalation_policy_pk": self.id, + "severity": self.severity, + }, + immutable=True, + ) + tasks.append(declare_incident_task) + self._execute_tasks(tasks) + def _escalation_step_notify_if_time(self, alert_group: "AlertGroup", _reason: str) -> StepExecutionResultData: eta = None diff --git a/engine/apps/alerts/migrations/0059_escalationpolicy_severity_and_more.py b/engine/apps/alerts/migrations/0059_escalationpolicy_severity_and_more.py new file mode 100644 index 0000000000..5e32c574b5 --- /dev/null +++ b/engine/apps/alerts/migrations/0059_escalationpolicy_severity_and_more.py @@ -0,0 +1,41 @@ +# Generated by Django 4.2.15 on 2024-09-25 20:57 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('user_management', '0022_alter_team_unique_together'), + ('alerts', '0058_alter_alertgroup_reason_to_skip_escalation'), + ] + + operations = [ + migrations.AddField( + model_name='escalationpolicy', + name='severity', + field=models.CharField(default=None, max_length=512, null=True), + ), + migrations.AlterField( + model_name='escalationpolicy', + name='step', + field=models.IntegerField(choices=[(0, 'Wait'), (1, 'Notify User'), (2, 'Notify Whole Channel'), (3, 'Repeat Escalation (5 times max)'), (4, 'Resolve'), (5, 'Notify Group'), (6, 'Notify Schedule'), (7, 'Notify User 
(Important)'), (8, 'Notify Group (Important)'), (9, 'Notify Schedule (Important)'), (10, 'Trigger Outgoing Webhook'), (11, 'Notify User (next each time)'), (12, 'Continue escalation only if time is from'), (13, 'Notify multiple Users'), (14, 'Notify multiple Users (Important)'), (15, 'Continue escalation if >X alerts per Y minutes'), (16, 'Trigger Webhook'), (17, 'Notify all users in a Team'), (18, 'Notify all users in a Team (Important)'), (19, 'Declare Incident')], default=None, null=True), + ), + migrations.CreateModel( + name='DeclaredIncident', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('incident_id', models.CharField(db_index=True, max_length=50)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('is_active', models.BooleanField(default=True)), + ('channel_filter', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='declared_incidents', to='alerts.channelfilter')), + ('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='declared_incidents', to='user_management.organization')), + ], + ), + migrations.AddField( + model_name='alertgroup', + name='declared_incident', + field=models.ForeignKey(default=None, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='attached_alert_groups', to='alerts.declaredincident'), + ), + ] diff --git a/engine/apps/alerts/models/__init__.py b/engine/apps/alerts/models/__init__.py index 51b4415844..82ea11c5d5 100644 --- a/engine/apps/alerts/models/__init__.py +++ b/engine/apps/alerts/models/__init__.py @@ -8,6 +8,7 @@ from .alert_receive_channel_connection import AlertReceiveChannelConnection # noqa: F401 from .channel_filter import ChannelFilter # noqa: F401 from .custom_button import CustomButton # noqa: F401 +from .declared_incident import DeclaredIncident # noqa: F401 from .escalation_chain import EscalationChain # noqa: F401 from .escalation_policy 
import EscalationPolicy # noqa: F401 from .grafana_alerting_contact_point import GrafanaAlertingContactPoint # noqa: F401 diff --git a/engine/apps/alerts/models/alert_group.py b/engine/apps/alerts/models/alert_group.py index 489607a434..3b9b1f476d 100644 --- a/engine/apps/alerts/models/alert_group.py +++ b/engine/apps/alerts/models/alert_group.py @@ -30,6 +30,7 @@ from apps.metrics_exporter.tasks import update_metrics_for_alert_group from apps.slack.slack_formatter import SlackFormatter from apps.user_management.models import User +from common.constants.plugin_ids import PluginID from common.public_primary_keys import generate_public_primary_key, increase_public_primary_key_length from common.utils import clean_markup, str_or_backup @@ -43,6 +44,7 @@ AlertGroupLogRecord, AlertReceiveChannel, BundledNotification, + DeclaredIncident, ResolutionNote, ResolutionNoteSlackMessage, ) @@ -205,6 +207,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models. slack_messages: "RelatedManager['SlackMessage']" users: "RelatedManager['User']" labels: "RelatedManager['AlertGroupAssociatedLabel']" + declared_incident: typing.Optional["DeclaredIncident"] objects: models.Manager["AlertGroup"] = AlertGroupQuerySet.as_manager() @@ -420,8 +423,17 @@ def status(self) -> int: # https://code.djangoproject.com/ticket/28545 is_open_for_grouping = models.BooleanField(default=None, null=True, blank=True) + # todo: rework using this field to use DeclaredIncident model field instead grafana_incident_id = models.CharField(max_length=100, null=True, default=None) + declared_incident = models.ForeignKey( + "alerts.DeclaredIncident", + on_delete=models.SET_NULL, + null=True, + default=None, + related_name="attached_alert_groups", + ) + @staticmethod def get_silenced_state_filter(): """ @@ -545,7 +557,7 @@ def web_link(self) -> str: @property def declare_incident_link(self) -> str: """Generate a link for AlertGroup to declare Grafana Incident by click""" - incident_link = 
urljoin(self.channel.organization.grafana_url, "a/grafana-incident-app/incidents/declare/") + incident_link = urljoin(self.channel.organization.grafana_url, f"a/{PluginID.INCIDENT}/incidents/declare/") caption = urllib.parse.quote_plus("OnCall Alert Group") title = urllib.parse.quote_plus(self.web_title_cache) if self.web_title_cache else DEFAULT_BACKUP_TITLE title = title[:2000] # set max title length to avoid exceptions with too long declare incident link diff --git a/engine/apps/alerts/models/alert_group_log_record.py b/engine/apps/alerts/models/alert_group_log_record.py index 3c4113a295..cd1c312d1c 100644 --- a/engine/apps/alerts/models/alert_group_log_record.py +++ b/engine/apps/alerts/models/alert_group_log_record.py @@ -11,18 +11,24 @@ from apps.alerts import tasks from apps.alerts.constants import ActionSource +from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.utils import render_relative_timeline from apps.slack.slack_formatter import SlackFormatter from common.utils import clean_markup if typing.TYPE_CHECKING: from apps.alerts.models import AlertGroup, CustomButton, EscalationPolicy, Invitation - from apps.user_management.models import User + from apps.user_management.models import Organization, User logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) +class RelatedIncidentData(typing.TypedDict): + incident_link: typing.Optional[str] + incident_title: str + + class AlertGroupLogRecord(models.Model): alert_group: "AlertGroup" author: typing.Optional["User"] @@ -161,7 +167,9 @@ class AlertGroupLogRecord(models.Model): ERROR_ESCALATION_TRIGGER_CUSTOM_WEBHOOK_ERROR, ERROR_ESCALATION_NOTIFY_TEAM_MEMBERS_STEP_IS_NOT_CONFIGURED, ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED, - ) = range(20) + ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED, + ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED, + ) = range(22) type = models.IntegerField(choices=TYPE_CHOICES) @@ -225,16 +233,60 @@ class 
AlertGroupLogRecord(models.Model): escalation_policy_step = models.IntegerField(null=True, default=None) step_specific_info = JSONField(null=True, default=None) - STEP_SPECIFIC_INFO_KEYS = ["schedule_name", "custom_button_name", "usergroup_handle", "source_integration_name"] + STEP_SPECIFIC_INFO_KEYS = [ + "schedule_name", + "custom_button_name", + "usergroup_handle", + "source_integration_name", + "incident_link", + "incident_title", + ] + + def _make_log_line_link(self, url, title, html=False, for_slack=False, substitute_with_tag=False): + if html and url: + return f"{title}" + elif for_slack and url: + return f"<{url}|{title}>" + elif substitute_with_tag: + return f"{{{{{substitute_with_tag}}}}}" + else: + return title def render_log_line_json(self): time = humanize.naturaldelta(self.alert_group.started_at - self.created_at) created_at = DateTimeField().to_representation(self.created_at) organization = self.alert_group.channel.organization author = self.author.short(organization) if self.author is not None else None + escalation_chain = self.alert_group.channel_filter.escalation_chain if self.alert_group.channel_filter else None + step_info = self.get_step_specific_info() + related_incident = self.render_incident_data_from_step_info(organization, step_info) + escalation_chain_data = ( + { + "pk": escalation_chain.public_primary_key, + "title": escalation_chain.name, + } + if escalation_chain + else None + ) + schedule = ( + { + "pk": self.escalation_policy.notify_schedule.public_primary_key, + "title": self.escalation_policy.notify_schedule.name, + } + if self.escalation_policy and self.escalation_policy.notify_schedule + else None + ) + webhook = ( + { + "pk": step_info["webhook_id"], + "title": step_info.get("webhook_name", "webhook"), + } + if step_info and "webhook_id" in step_info + else None + ) sf = SlackFormatter(organization) - action = sf.format(self.rendered_log_line_action(substitute_author_with_tag=True)) + action = 
sf.format(self.rendered_log_line_action(substitute_with_tag=True)) action = clean_markup(action) result = { @@ -244,6 +296,10 @@ def render_log_line_json(self): "type": self.type, "created_at": created_at, "author": author, + "incident": related_incident, + "escalation_chain": escalation_chain_data, + "schedule": schedule, + "webhook": webhook, } return result @@ -258,7 +314,7 @@ def rendered_incident_log_line(self, for_slack=False, html=False): result += self.rendered_log_line_action(for_slack=for_slack, html=html) return result - def rendered_log_line_action(self, for_slack=False, html=False, substitute_author_with_tag=False): + def rendered_log_line_action(self, for_slack=False, html=False, substitute_with_tag=False): from apps.alerts.models import EscalationPolicy result = "" @@ -276,7 +332,7 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho elif self.action_source == ActionSource.BACKSYNC: author_name = "source integration " + step_specific_info.get("source_integration_name", "") elif self.author: - if substitute_author_with_tag: + if substitute_with_tag: author_name = "{{author}}" elif for_slack: author_name = self.author.get_username_with_slack_verbal() @@ -303,7 +359,9 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho result += f'alert group assigned to route "{channel_filter.str_for_clients}"' if escalation_chain is not None: - result += f' with escalation chain "{escalation_chain.name}"' + tag = "escalation_chain" if substitute_with_tag else False + escalation_chain_text = self._make_log_line_link(None, escalation_chain.name, html, for_slack, tag) + result += f' with escalation chain "{escalation_chain_text}"' else: result += " with no escalation chain, skipping escalation" else: @@ -379,9 +437,19 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho important_text = "" if escalation_policy_step == EscalationPolicy.STEP_NOTIFY_SCHEDULE_IMPORTANT: 
important_text = " (Important)" - result += f'triggered step "Notify on-call from Schedule {schedule_name}{important_text}"' + tag = "schedule" if substitute_with_tag else False + schedule_text = self._make_log_line_link(None, schedule_name, html, for_slack, tag) + result += f'triggered step "Notify on-call from Schedule {schedule_text}{important_text}"' elif escalation_policy_step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES: result += "escalation started from the beginning" + elif escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT: + organization = self.alert_group.channel.organization + incident_data = self.render_incident_data_from_step_info(organization, step_specific_info) + incident_link = incident_data["incident_link"] + incident_title = incident_data["incident_title"] + tag = "related_incident" if substitute_with_tag else False + incident_text = self._make_log_line_link(incident_link, incident_title, html, for_slack, tag) + result += self.reason + f": {incident_text}" else: result += f'triggered step "{EscalationPolicy.get_step_display_name(escalation_policy_step)}"' elif self.type == AlertGroupLogRecord.TYPE_SILENCE: @@ -485,7 +553,10 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho trigger = f"{author_name}" else: trigger = trigger or "escalation chain" - result += f"outgoing webhook `{webhook_name}` triggered by {trigger}" + tag = "webhook" if substitute_with_tag else False + webhook_text = self._make_log_line_link(None, webhook_name, html, for_slack, tag) + result += f"outgoing webhook `{webhook_text}` triggered by {trigger}" + elif self.type == AlertGroupLogRecord.TYPE_FAILED_ATTACHMENT: if self.alert_group.slack_message is not None: result += ( @@ -594,8 +665,32 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho result += f"failed to notify User Group{usergroup_handle_text} in Slack" elif self.escalation_error_code == 
AlertGroupLogRecord.ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED: result += 'skipped escalation step "Trigger Outgoing Webhook" because it is disabled' + elif ( + self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED + ): + result += 'skipped escalation step "Declare Incident": step is not enabled' + elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED: + result += "failed to declare an Incident" + if self.reason: + result += f": {self.reason}" return result + def render_incident_data_from_step_info( + self, organization: "Organization", step_specific_info: dict + ) -> RelatedIncidentData | None: + from apps.alerts.models.declared_incident import get_incident_url + + if not step_specific_info or not all(key in step_specific_info for key in ["incident_title", "incident_id"]): + return None + + incident_link = ( + get_incident_url(organization, step_specific_info["incident_id"]) + if step_specific_info["incident_id"] + else None + ) + incident_title = step_specific_info["incident_title"] or DEFAULT_BACKUP_TITLE + return {"incident_link": incident_link, "incident_title": incident_title} + def get_step_specific_info(self): step_specific_info = None # in some cases step_specific_info was saved with using json.dumps diff --git a/engine/apps/alerts/models/declared_incident.py b/engine/apps/alerts/models/declared_incident.py new file mode 100644 index 0000000000..3fe914fa6f --- /dev/null +++ b/engine/apps/alerts/models/declared_incident.py @@ -0,0 +1,38 @@ +import typing +from urllib.parse import urljoin + +from django.db import models + +if typing.TYPE_CHECKING: + from django.db.models.manager import RelatedManager + + from apps.alerts.models import AlertGroup, ChannelFilter + from apps.user_management.models import Organization + + +def get_incident_url(organization, incident_id) -> str: + return urljoin(organization.grafana_url, 
f"a/grafana-incident-app/incidents/{incident_id}") + + +class DeclaredIncident(models.Model): + attached_alert_groups: "RelatedManager['AlertGroup']" + channel_filter: typing.Optional["ChannelFilter"] + organization: "Organization" + + incident_id = models.CharField(db_index=True, max_length=50) + organization = models.ForeignKey( + "user_management.Organization", + on_delete=models.CASCADE, + related_name="declared_incidents", + ) + channel_filter = models.ForeignKey( + "alerts.ChannelFilter", + on_delete=models.SET_NULL, + null=True, + related_name="declared_incidents", + ) + created_at = models.DateTimeField(auto_now_add=True) + is_active = models.BooleanField(default=True) + + def get_incident_link(self) -> str: + return get_incident_url(self.organization, self.incident_id) diff --git a/engine/apps/alerts/models/escalation_policy.py b/engine/apps/alerts/models/escalation_policy.py index 0c10e31f1c..28ea7022ac 100644 --- a/engine/apps/alerts/models/escalation_policy.py +++ b/engine/apps/alerts/models/escalation_policy.py @@ -47,7 +47,8 @@ class EscalationPolicy(OrderedModel): STEP_TRIGGER_CUSTOM_WEBHOOK, STEP_NOTIFY_TEAM_MEMBERS, STEP_NOTIFY_TEAM_MEMBERS_IMPORTANT, - ) = range(19) + STEP_DECLARE_INCIDENT, + ) = range(20) # Must be the same order as previous STEP_CHOICES = ( @@ -70,6 +71,7 @@ class EscalationPolicy(OrderedModel): (STEP_TRIGGER_CUSTOM_WEBHOOK, "Trigger Webhook"), (STEP_NOTIFY_TEAM_MEMBERS, "Notify all users in a Team"), (STEP_NOTIFY_TEAM_MEMBERS_IMPORTANT, "Notify all users in a Team (Important)"), + (STEP_DECLARE_INCIDENT, "Declare Incident"), ) # Ordered step choices available for internal api. 
@@ -90,6 +92,7 @@ class EscalationPolicy(OrderedModel): STEP_NOTIFY_IF_TIME, STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW, STEP_REPEAT_ESCALATION_N_TIMES, + STEP_DECLARE_INCIDENT, ] # Steps can be stored in db while interacting with internal api # Includes important versions of default steps @@ -110,6 +113,7 @@ class EscalationPolicy(OrderedModel): STEP_NOTIFY_MULTIPLE_USERS_IMPORTANT, STEP_TRIGGER_CUSTOM_WEBHOOK, STEP_REPEAT_ESCALATION_N_TIMES, + STEP_DECLARE_INCIDENT, ] # Maps internal api's steps choices to their verbal. First string in tuple is display name for existent step. @@ -151,6 +155,10 @@ class EscalationPolicy(OrderedModel): "Repeat escalation from the beginning (5 times max)", "Repeat escalations from the beginning (5 times max)", ), + STEP_DECLARE_INCIDENT: ( + "Declare Incident with severity {{severity}} (non-default routes only)", + "Declare Incident (non-default routes only)", + ), } STEPS_WITH_NO_IMPORTANT_VERSION_SET = { @@ -161,6 +169,7 @@ class EscalationPolicy(OrderedModel): STEP_NOTIFY_USERS_QUEUE, STEP_NOTIFY_IF_TIME, STEP_REPEAT_ESCALATION_N_TIMES, + STEP_DECLARE_INCIDENT, } DEFAULT_TO_IMPORTANT_STEP_MAPPING = { @@ -210,6 +219,7 @@ class EscalationPolicy(OrderedModel): STEP_NOTIFY_IF_TIME, STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW, STEP_REPEAT_ESCALATION_N_TIMES, + STEP_DECLARE_INCIDENT, ] PUBLIC_STEP_CHOICES_MAP = { @@ -231,6 +241,7 @@ class EscalationPolicy(OrderedModel): STEP_NOTIFY_IF_TIME: "notify_if_time_from_to", STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: "notify_if_num_alerts_in_window", STEP_REPEAT_ESCALATION_N_TIMES: "repeat_escalation", + STEP_DECLARE_INCIDENT: "declare_incident", } public_primary_key = models.CharField( @@ -291,6 +302,10 @@ class EscalationPolicy(OrderedModel): null=True, ) + # Incident severity for declare incident step + SEVERITY_SET_FROM_LABEL, SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE = ("set-from-label", "from 'severity' label") + severity = models.CharField(max_length=512, null=True, default=None) + ONE_MINUTE = 
datetime.timedelta(minutes=1) FIVE_MINUTES = datetime.timedelta(minutes=5) FIFTEEN_MINUTES = datetime.timedelta(minutes=15) diff --git a/engine/apps/alerts/paging.py b/engine/apps/alerts/paging.py index b03f52d7a0..5121d01798 100644 --- a/engine/apps/alerts/paging.py +++ b/engine/apps/alerts/paging.py @@ -16,6 +16,7 @@ from apps.schedules.ical_utils import get_cached_oncall_users_for_multiple_schedules from apps.schedules.models import OnCallSchedule from apps.user_management.models import Organization, Team, User +from common.utils import escape_html UserNotifications = list[tuple[User, bool]] @@ -145,6 +146,10 @@ def direct_paging( if alert_group and alert_group.resolved: raise DirectPagingAlertGroupResolvedError + # https://github.com/grafana/oncall-private/issues/2760 + title = escape_html(title) + message = escape_html(message) + if title is None: title = _construct_title(from_user, team, users) diff --git a/engine/apps/alerts/tasks/__init__.py b/engine/apps/alerts/tasks/__init__.py index 056140a3eb..e89f96cb70 100644 --- a/engine/apps/alerts/tasks/__init__.py +++ b/engine/apps/alerts/tasks/__init__.py @@ -5,6 +5,7 @@ ) from .check_escalation_finished import check_escalation_finished_task # noqa: F401 from .custom_webhook_result import custom_webhook_result # noqa: F401 +from .declare_incident import declare_incident # noqa: F401 from .delete_alert_group import delete_alert_group # noqa: F401 from .delete_alert_group import finish_delete_alert_group # noqa: F401 from .delete_alert_group import send_alert_group_signal_for_delete # noqa: F401 diff --git a/engine/apps/alerts/tasks/declare_incident.py b/engine/apps/alerts/tasks/declare_incident.py new file mode 100644 index 0000000000..a12076c032 --- /dev/null +++ b/engine/apps/alerts/tasks/declare_incident.py @@ -0,0 +1,149 @@ +import logging + +from django.conf import settings + +from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE +from common.custom_celery_tasks import 
shared_dedicated_queue_retry_task +from common.incident_api.client import ( + DEFAULT_INCIDENT_SEVERITY, + DEFAULT_INCIDENT_STATUS, + IncidentAPIClient, + IncidentAPIException, +) + +logger = logging.getLogger(__name__) + +ATTACHMENT_CAPTION = "OnCall Alert Group" +ERROR_SEVERITY_NOT_FOUND = "Severity.FindOne: not found" +MAX_RETRIES = 1 if settings.DEBUG else 10 +MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT = 5 + + +def _attach_alert_group_to_incident(alert_group, incident_id, incident_title, escalation_policy, attached=False): + from apps.alerts.models import AlertGroupLogRecord, DeclaredIncident, EscalationPolicy + + declared_incident, _ = DeclaredIncident.objects.get_or_create( + incident_id=incident_id, + defaults={ + "organization": alert_group.channel.organization, + "channel_filter": alert_group.channel_filter, + }, + ) + alert_group.declared_incident = declared_incident + alert_group.save(update_fields=["declared_incident"]) + reason = "attached to existing incident" if attached else "incident declared" + AlertGroupLogRecord.objects.create( + type=AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, + reason=reason, + alert_group=alert_group, + step_specific_info={"incident_id": incident_id, "incident_title": incident_title}, + escalation_policy=escalation_policy, + escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT, + ) + + +def _create_error_log_record(alert_group, escalation_policy, reason=""): + from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy + + AlertGroupLogRecord.objects.create( + type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED, + escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED, + reason=reason, + alert_group=alert_group, + escalation_policy=escalation_policy, + escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT, + ) + + +@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=MAX_RETRIES) +def declare_incident(alert_group_pk, 
escalation_policy_pk, severity=None): + from apps.alerts.models import AlertGroup, DeclaredIncident, EscalationPolicy + + alert_group = AlertGroup.objects.get(pk=alert_group_pk) + organization = alert_group.channel.organization + escalation_policy = None + if escalation_policy_pk: + escalation_policy = EscalationPolicy.objects.filter(pk=escalation_policy_pk).first() + + if alert_group.channel_filter.is_default: + _create_error_log_record( + alert_group, escalation_policy, reason="Declare incident step is not enabled for default routes" + ) + return + + if declare_incident.request.retries == MAX_RETRIES: + _create_error_log_record(alert_group, escalation_policy) + return + + incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token) + + # check for currently active related incident in the same route (channel_filter) + existing_incident = ( + DeclaredIncident.objects.filter( + organization=organization, channel_filter=alert_group.channel_filter, is_active=True + ) + .order_by("-created_at") + .first() + ) + + if existing_incident: + incident_id = existing_incident.incident_id + try: + # get existing incident details + incident_data, _ = incident_client.get_incident(incident_id) + except IncidentAPIException as e: + logger.error(f"Error getting incident details: {e.msg}") + if e.status == 404: + # incident not found, mark as not opened + existing_incident.is_active = False + existing_incident.save(update_fields=["is_active"]) + else: + # raise (and retry) + raise + else: + # incident exists, check if it is still active + if incident_data["status"] == DEFAULT_INCIDENT_STATUS: + # attach to incident context + incident_title = incident_data["title"] + num_attached = AlertGroup.objects.filter(declared_incident=existing_incident).count() + if num_attached < MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT: + try: + incident_data, _ = incident_client.add_activity(incident_id, alert_group.web_link) + except IncidentAPIException as e: + logger.error(f"Error 
attaching to existing incident: {e.msg}") + # setup association between alert group and incident (even if not attached) + _attach_alert_group_to_incident( + alert_group, incident_id, incident_title, escalation_policy, attached=True + ) + else: + existing_incident.is_active = False + existing_incident.save(update_fields=["is_active"]) + + if existing_incident is None or not existing_incident.is_active: + # create new incident + if severity == EscalationPolicy.SEVERITY_SET_FROM_LABEL: + severity_label = alert_group.labels.filter(key_name="severity").first() + severity = severity_label.value_name if severity_label else None + severity = severity or DEFAULT_INCIDENT_SEVERITY + try: + incident_data, _ = incident_client.create_incident( + alert_group.web_title_cache if alert_group.web_title_cache else DEFAULT_BACKUP_TITLE, + severity=severity, + attachCaption=ATTACHMENT_CAPTION, + attachURL=alert_group.web_link, + ) + except IncidentAPIException as e: + logger.error(f"Error creating new incident: {e.msg}") + if ERROR_SEVERITY_NOT_FOUND.lower() in e.msg.lower() and severity != DEFAULT_INCIDENT_SEVERITY: + # invalid severity, retry with default severity + declare_incident.apply_async( + args=(alert_group_pk, escalation_policy_pk), + kwargs={"severity": DEFAULT_INCIDENT_SEVERITY}, + ) + return + # else raise (and retry) + raise + else: + _attach_alert_group_to_incident( + alert_group, incident_data["incidentID"], incident_data["title"], escalation_policy + ) diff --git a/engine/apps/alerts/tests/factories.py b/engine/apps/alerts/tests/factories.py index f07ef90046..7d34717110 100644 --- a/engine/apps/alerts/tests/factories.py +++ b/engine/apps/alerts/tests/factories.py @@ -8,6 +8,7 @@ AlertReceiveChannelConnection, ChannelFilter, CustomButton, + DeclaredIncident, EscalationChain, EscalationPolicy, Invitation, @@ -91,3 +92,8 @@ class Meta: class UserNotificationBundleFactory(factory.DjangoModelFactory): class Meta: model = UserNotificationBundle + + +class 
DeclaredIncidentFactory(factory.DjangoModelFactory): + class Meta: + model = DeclaredIncident diff --git a/engine/apps/alerts/tests/test_alert_group_log_record.py b/engine/apps/alerts/tests/test_alert_group_log_record.py index dbc668dc97..9dfaa84c35 100644 --- a/engine/apps/alerts/tests/test_alert_group_log_record.py +++ b/engine/apps/alerts/tests/test_alert_group_log_record.py @@ -2,7 +2,8 @@ import pytest -from apps.alerts.models import AlertGroupLogRecord +from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy +from apps.schedules.models import OnCallScheduleWeb @pytest.mark.django_db @@ -37,3 +38,138 @@ def test_trigger_update_signal( with patch("apps.alerts.tasks.send_update_log_report_signal") as mock_update_log_signal: alert_group.log_records.create(type=log_type) mock_update_log_signal.apply_async.assert_called_once() + + +@pytest.mark.django_db +@pytest.mark.parametrize( + "for_slack, html, substitute_with_tag, expected", + [ + (True, False, False, 'with escalation chain "Escalation name"'), + (False, True, False, 'with escalation chain "Escalation name"'), + (False, False, True, 'with escalation chain "{{escalation_chain}}'), + ], +) +def test_log_record_escalation_chain_link( + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_escalation_chain, + make_channel_filter, + make_alert_group, + for_slack, + html, + substitute_with_tag, + expected, +): + organization, _ = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + escalation_chain = make_escalation_chain(organization, name="Escalation name") + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + alert_group = make_alert_group(alert_receive_channel, channel_filter=channel_filter) + alert_group.raw_escalation_snapshot = alert_group.build_raw_escalation_snapshot() + + log = alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_ROUTE_ASSIGNED, + ) 
+ + log_line = log.rendered_log_line_action(for_slack=for_slack, html=html, substitute_with_tag=substitute_with_tag) + assert expected in log_line + + log_data = log.render_log_line_json() + escalation_chain_data = log_data.get("escalation_chain") + assert escalation_chain_data == {"pk": escalation_chain.public_primary_key, "title": escalation_chain.name} + + +@pytest.mark.django_db +@pytest.mark.parametrize( + "for_slack, html, substitute_with_tag, expected", + [ + (True, False, False, "Notify on-call from Schedule 'Schedule name'"), + (False, True, False, "Notify on-call from Schedule 'Schedule name'"), + (False, False, True, "Notify on-call from Schedule {{schedule}}"), + ], +) +def test_log_record_schedule_link( + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, + make_schedule, + make_escalation_chain, + make_escalation_policy, + for_slack, + html, + substitute_with_tag, + expected, +): + organization, _ = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + schedule = make_schedule(organization, schedule_class=OnCallScheduleWeb, name="Schedule name") + escalation_chain = make_escalation_chain(organization, name="Escalation name") + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + escalation_policy = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_NOTIFY_SCHEDULE, + notify_schedule=schedule, + ) + + log = alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_ESCALATION_TRIGGERED, + step_specific_info={"schedule_name": schedule.name}, + escalation_policy=escalation_policy, + ) + + log_line = log.rendered_log_line_action(for_slack=for_slack, html=html, substitute_with_tag=substitute_with_tag) + assert expected in log_line + + log_data = 
log.render_log_line_json() + schedule_data = log_data.get("schedule") + assert schedule_data == {"pk": schedule.public_primary_key, "title": schedule.name} + + +@pytest.mark.django_db +@pytest.mark.parametrize( + "for_slack, html, substitute_with_tag, expected", + [ + (True, False, False, "outgoing webhook `Webhook name`"), + (False, True, False, "outgoing webhook `Webhook name`"), + (False, False, True, "outgoing webhook `{{webhook}}`"), + ], +) +def test_log_record_webhook_link( + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_channel_filter, + make_alert_group, + make_custom_webhook, + make_escalation_chain, + make_escalation_policy, + for_slack, + html, + substitute_with_tag, + expected, +): + organization, _ = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + webhook = make_custom_webhook(organization, name="Webhook name") + escalation_chain = make_escalation_chain(organization, name="Escalation name") + channel_filter = make_channel_filter(alert_receive_channel, escalation_chain=escalation_chain) + escalation_policy = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK, + custom_webhook=webhook, + ) + + log = alert_group.log_records.create( + type=AlertGroupLogRecord.TYPE_CUSTOM_WEBHOOK_TRIGGERED, + step_specific_info={"webhook_id": webhook.public_primary_key, "webhook_name": webhook.name}, + escalation_policy=escalation_policy, + ) + + log_line = log.rendered_log_line_action(for_slack=for_slack, html=html, substitute_with_tag=substitute_with_tag) + assert expected in log_line + + log_data = log.render_log_line_json() + webhook_data = log_data.get("webhook") + assert webhook_data == {"pk": webhook.public_primary_key, "title": webhook.name} diff --git a/engine/apps/alerts/tests/test_declare_incident.py 
b/engine/apps/alerts/tests/test_declare_incident.py new file mode 100644 index 0000000000..47b931f445 --- /dev/null +++ b/engine/apps/alerts/tests/test_declare_incident.py @@ -0,0 +1,335 @@ +from unittest.mock import patch + +import httpretty +import pytest + +from apps.alerts.models import AlertGroupLogRecord, DeclaredIncident, EscalationPolicy +from apps.alerts.tasks.declare_incident import ( + ATTACHMENT_CAPTION, + DEFAULT_BACKUP_TITLE, + DEFAULT_INCIDENT_SEVERITY, + ERROR_SEVERITY_NOT_FOUND, + MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT, + declare_incident, +) +from common.incident_api.client import IncidentAPIException + + +@pytest.fixture +def setup_alert_group_and_escalation_step( + make_organization, + make_alert_receive_channel, + make_alert_group, + make_channel_filter, + make_escalation_chain, + make_escalation_policy, +): + def _setup_alert_group_and_escalation_step(is_default_route=False, already_declared_incident=False): + organization = make_organization(grafana_url="https://stack.grafana.net", api_token="token") + alert_receive_channel = make_alert_receive_channel(organization=organization) + escalation_chain = make_escalation_chain(organization) + declare_incident_step = make_escalation_policy( + escalation_chain=escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT, + ) + channel_filter = make_channel_filter( + alert_receive_channel, + escalation_chain=escalation_chain, + is_default=is_default_route, + ) + alert_group = make_alert_group( + alert_receive_channel=alert_receive_channel, + channel_filter=channel_filter, + ) + declared_incident = None + if already_declared_incident: + declared_incident = DeclaredIncident.objects.create( + incident_id="123", + organization=organization, + channel_filter=channel_filter, + ) + + return alert_group, declare_incident_step, declared_incident + + return _setup_alert_group_and_escalation_step + + +@pytest.mark.django_db +def 
test_declare_incident_default_route(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(is_default_route=True) + + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + # check triggered log + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_FAILED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info is None + assert log_record.reason == "Declare incident step is not enabled for default routes" + assert log_record.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_ok(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + mock_create_incident.assert_called_with( + DEFAULT_BACKUP_TITLE, + severity=DEFAULT_INCIDENT_SEVERITY, + attachCaption=ATTACHMENT_CAPTION, + attachURL=alert_group.web_link, + ) + + alert_group.refresh_from_db() + + # check declared incident + assert alert_group.declared_incident.incident_id == "123" + assert alert_group.declared_incident.organization == alert_group.channel.organization + assert alert_group.declared_incident.channel_filter == alert_group.channel_filter + # check triggered log + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == 
declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": "123", "incident_title": "Incident"} + assert log_record.reason == "incident declared" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_set_severity(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + severity = "critical" + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None + declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity) + + mock_create_incident.assert_called_with( + DEFAULT_BACKUP_TITLE, severity=severity, attachCaption=ATTACHMENT_CAPTION, attachURL=alert_group.web_link + ) + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_set_severity_from_label(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + expected_severity = "minor" + # set alert group label + alert_group.labels.create( + organization=alert_group.channel.organization, key_name="severity", value_name=expected_severity + ) + severity = EscalationPolicy.SEVERITY_SET_FROM_LABEL + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.return_value = {"incidentID": "123", "title": "Incident"}, None + declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity) + + mock_create_incident.assert_called_with( + DEFAULT_BACKUP_TITLE, + severity=expected_severity, + attachCaption=ATTACHMENT_CAPTION, + 
attachURL=alert_group.web_link, + ) + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_invalid_severity_fallback(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + severity = "INVALID" + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + with patch.object(declare_incident, "apply_async") as mock_declare_incident_apply_async: + mock_create_incident.side_effect = IncidentAPIException( + status=500, url="some-url", msg=ERROR_SEVERITY_NOT_FOUND + ) + declare_incident(alert_group.pk, declare_incident_step.pk, severity=severity) + + # create call failing with invalid severity + mock_create_incident.assert_called_with( + DEFAULT_BACKUP_TITLE, severity=severity, attachCaption=ATTACHMENT_CAPTION, attachURL=alert_group.web_link + ) + # new task is queued with default severity instead + mock_declare_incident_apply_async.assert_called_with( + args=(alert_group.pk, declare_incident_step.pk), kwargs={"severity": DEFAULT_INCIDENT_SEVERITY} + ) + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_attach_alert_group(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + incident_id = existing_open_incident.incident_id + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity: + mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None + mock_add_activity.return_value = {"activityItemID": "111"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + + # 
check declared incident + assert alert_group.declared_incident == existing_open_incident + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_resolved_update(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + incident_id = existing_open_incident.incident_id + new_incident_id = "333" + assert new_incident_id != incident_id + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_get_incident.return_value = { + "incidentID": incident_id, + "title": "Incident1", + "status": "resolved", + }, None + mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident2"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + existing_open_incident.refresh_from_db() + + assert existing_open_incident.is_active is False + # check declared incident + assert alert_group.declared_incident != existing_open_incident + assert alert_group.declared_incident.incident_id == new_incident_id + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == 
EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": new_incident_id, "incident_title": "Incident2"} + assert log_record.reason == "incident declared" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_declare_incident_attach_alert_group_skip_incident_update( + setup_alert_group_and_escalation_step, make_alert_group +): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + alert_receive_channel = alert_group.channel + channel_filter = alert_group.channel_filter + incident_id = existing_open_incident.incident_id + + # attach max alert groups to incident + for _ in range(MAX_ATTACHED_ALERT_GROUPS_PER_INCIDENT): + ag = make_alert_group(alert_receive_channel=alert_receive_channel, channel_filter=channel_filter) + ag.declared_incident = existing_open_incident + ag.save() + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity: + mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + assert not mock_add_activity.called + + alert_group.refresh_from_db() + + # check declared incident + assert alert_group.declared_incident == existing_open_incident + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert 
log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_get_existing_incident_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + mock_get_incident.side_effect = IncidentAPIException(status=500, url="some-url") + with pytest.raises(IncidentAPIException): + declare_incident(alert_group.pk, declare_incident_step.pk) + + # but if incident was not found, a new one should be created + incident_id = existing_open_incident.incident_id + new_incident_id = "333" + assert new_incident_id != incident_id + + with patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_get_incident.side_effect = IncidentAPIException(status=404, url="some-url") + mock_create_incident.return_value = {"incidentID": new_incident_id, "title": "Incident"}, None + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + + # check declared incident + assert alert_group.declared_incident != existing_open_incident + assert alert_group.declared_incident.incident_id == new_incident_id + assert alert_group.declared_incident.organization == alert_group.channel.organization + assert alert_group.declared_incident.channel_filter == alert_group.channel_filter + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_attach_alert_group_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, existing_open_incident = setup_alert_group_and_escalation_step( + already_declared_incident=True + ) + incident_id = existing_open_incident.incident_id + + with 
patch("common.incident_api.client.IncidentAPIClient.get_incident") as mock_get_incident: + with patch("common.incident_api.client.IncidentAPIClient.add_activity") as mock_add_activity: + mock_get_incident.return_value = {"incidentID": incident_id, "title": "Incident", "status": "active"}, None + mock_add_activity.side_effect = IncidentAPIException(status=500, url="some-url") + declare_incident(alert_group.pk, declare_incident_step.pk) + + alert_group.refresh_from_db() + + # incident attachment failed, but DB is still updated + assert alert_group.declared_incident == existing_open_incident + log_record = alert_group.log_records.last() + assert log_record.type == log_record.TYPE_ESCALATION_TRIGGERED + assert log_record.escalation_policy == declare_incident_step + assert log_record.escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert log_record.step_specific_info == {"incident_id": incident_id, "incident_title": "Incident"} + assert log_record.reason == "attached to existing incident" + assert log_record.escalation_error_code is None + + +@pytest.mark.django_db +@httpretty.activate(verbose=True, allow_net_connect=False) +def test_create_incident_error(setup_alert_group_and_escalation_step): + alert_group, declare_incident_step, _ = setup_alert_group_and_escalation_step(already_declared_incident=False) + + with patch("common.incident_api.client.IncidentAPIClient.create_incident") as mock_create_incident: + mock_create_incident.side_effect = IncidentAPIException(status=500, url="some-url") + with pytest.raises(IncidentAPIException): + declare_incident(alert_group.pk, declare_incident_step.pk) diff --git a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py index 8a3eef6008..8882a37012 100644 --- a/engine/apps/alerts/tests/test_escalation_policy_snapshot.py +++ b/engine/apps/alerts/tests/test_escalation_policy_snapshot.py @@ -690,3 +690,52 @@ def test_notify_team_members( 
(user_2.pk, alert_group.pk), expected_kwargs, immutable=True ) assert mock_execute.signature.call_count == 2 + + +@pytest.mark.django_db +def test_escalation_step_declare_incident( + escalation_step_test_setup, + make_escalation_policy, +): + organization, _, _, channel_filter, alert_group, reason = escalation_step_test_setup + + declare_incident_step = make_escalation_policy( + escalation_chain=channel_filter.escalation_chain, + escalation_policy_step=EscalationPolicy.STEP_DECLARE_INCIDENT, + ) + escalation_policy_snapshot = get_escalation_policy_snapshot_from_model(declare_incident_step) + expected_eta = timezone.now() + timezone.timedelta(seconds=NEXT_ESCALATION_DELAY) + with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks: + with patch( + "apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot.is_declare_incident_step_enabled", + return_value=True, + ): + result = escalation_policy_snapshot.execute(alert_group, reason) + expected_result = EscalationPolicySnapshot.StepExecutionResultData( + eta=result.eta, + stop_escalation=False, + pause_escalation=False, + start_from_beginning=False, + ) + assert ( + expected_eta + timezone.timedelta(seconds=15) + > result.eta + > expected_eta - timezone.timedelta(seconds=15) + ) + assert result == expected_result + assert not alert_group.log_records.exists() + mocked_execute_tasks.assert_called_once() + with patch.object(EscalationPolicySnapshot, "_execute_tasks") as mocked_execute_tasks: + with patch( + "apps.alerts.escalation_snapshot.snapshot_classes.escalation_policy_snapshot.is_declare_incident_step_enabled", + return_value=False, + ): + escalation_policy_snapshot.execute(alert_group, reason) + mocked_execute_tasks.assert_not_called() + assert alert_group.log_records.exists() + log_record = alert_group.log_records.get() + assert log_record.type == AlertGroupLogRecord.TYPE_ESCALATION_FAILED + assert ( + log_record.escalation_error_code + == 
AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED + ) diff --git a/engine/apps/alerts/tests/test_escalation_snapshot.py b/engine/apps/alerts/tests/test_escalation_snapshot.py index fc279a53dd..84d3b0f496 100644 --- a/engine/apps/alerts/tests/test_escalation_snapshot.py +++ b/engine/apps/alerts/tests/test_escalation_snapshot.py @@ -41,6 +41,7 @@ def test_raw_escalation_snapshot(escalation_snapshot_test_setup): "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "from_time": None, "to_time": None, "num_alerts_in_window": None, @@ -60,6 +61,7 @@ def test_raw_escalation_snapshot(escalation_snapshot_test_setup): "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "from_time": None, "to_time": None, "num_alerts_in_window": None, @@ -79,6 +81,7 @@ def test_raw_escalation_snapshot(escalation_snapshot_test_setup): "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "from_time": notify_if_time_step.from_time.isoformat(), "to_time": notify_if_time_step.to_time.isoformat(), "num_alerts_in_window": None, diff --git a/engine/apps/alerts/tests/test_paging.py b/engine/apps/alerts/tests/test_paging.py index 0528782d0e..d6bad7a2bc 100644 --- a/engine/apps/alerts/tests/test_paging.py +++ b/engine/apps/alerts/tests/test_paging.py @@ -312,3 +312,25 @@ def _title(middle_portion: str) -> str: assert _construct_title(from_user, team, multiple_users) == _title( f"{team.name}, {user1.username}, {user2.username} and {user3.username}" ) + + +@pytest.mark.django_db +def test_direct_paging_title_and_message_are_html_escaped(make_organization, make_user_for_organization): + dirty_input = "<script>alert('hacked');</script>" + clean_input = "&lt;script&gt;alert(&#x27;hacked&#x27;);&lt;/script&gt;" + + organization = make_organization() + from_user = make_user_for_organization(organization) + other_user = make_user_for_organization(organization) + + direct_paging(organization,
from_user, dirty_input, dirty_input, users=[(other_user, False)]) + + # alert group created + alert_groups = AlertGroup.objects.all() + assert alert_groups.count() == 1 + ag = alert_groups.get() + alert = ag.alerts.get() + + assert ag.web_title_cache == clean_input + assert alert.title == clean_input + assert alert.message == clean_input diff --git a/engine/apps/alerts/utils.py b/engine/apps/alerts/utils.py index abf6b24cde..5317c22b3f 100644 --- a/engine/apps/alerts/utils.py +++ b/engine/apps/alerts/utils.py @@ -1,3 +1,11 @@ +import typing + +from django.conf import settings + +if typing.TYPE_CHECKING: + from apps.user_management.models import Organization + + def render_relative_timeline(log_created_at, alert_group_started_at): time_delta = log_created_at - alert_group_started_at seconds = int(time_delta.total_seconds()) @@ -12,3 +20,7 @@ def render_relative_timeline(log_created_at, alert_group_started_at): return "%dm%ds" % (minutes, seconds) else: return "%ds" % (seconds,) + + +def is_declare_incident_step_enabled(organization: "Organization") -> bool: + return organization.is_grafana_incident_enabled and settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED diff --git a/engine/apps/api/permissions.py b/engine/apps/api/permissions.py index 771139935c..e0120de567 100644 --- a/engine/apps/api/permissions.py +++ b/engine/apps/api/permissions.py @@ -9,15 +9,14 @@ from rest_framework.views import APIView from rest_framework.viewsets import ViewSet, ViewSetMixin +from common.constants.plugin_ids import PluginID from common.utils import getattrd if typing.TYPE_CHECKING: from apps.user_management.models import User -ACTION_PREFIX = "grafana-oncall-app" RBAC_PERMISSIONS_ATTR = "rbac_permissions" RBAC_OBJECT_PERMISSIONS_ATTR = "rbac_object_permissions" -BASIC_ROLE_PERMISSIONS_ATTR = "basic_role_permissions" ViewSetOrAPIView = typing.Union[ViewSet, APIView] @@ -67,6 +66,7 @@ class Resources(enum.Enum): OTHER_SETTINGS = "other-settings" ADMIN = "admin" + LABEL = "label" class 
Actions(enum.Enum): @@ -78,6 +78,8 @@ class Actions(enum.Enum): UPDATE_SETTINGS = "update-settings" DIRECT_PAGING = "direct-paging" + CREATE = "create" + class LegacyAccessControlRole(enum.IntEnum): ADMIN = 0 @@ -91,15 +93,20 @@ def choices(cls): class LegacyAccessControlCompatiblePermission: - def __init__(self, resource: Resources, action: Actions, fallback_role: LegacyAccessControlRole) -> None: - self.value = f"{ACTION_PREFIX}.{resource.value}:{action.value}" + def __init__( + self, + resource: Resources, + action: Actions, + fallback_role: LegacyAccessControlRole, + prefix: str = PluginID.ONCALL, + ) -> None: + self.value = f"{prefix}.{resource.value}:{action.value}" self.fallback_role = fallback_role LegacyAccessControlCompatiblePermissions = typing.List[LegacyAccessControlCompatiblePermission] RBACPermissionsAttribute = typing.Dict[str, LegacyAccessControlCompatiblePermissions] RBACObjectPermissionsAttribute = typing.Dict[permissions.BasePermission, typing.List[str]] -BasicRolePermissionsAttribute = typing.Dict[str, LegacyAccessControlRole] def get_view_action(request: AuthenticatedRequest, view: ViewSetOrAPIView) -> str: @@ -119,24 +126,14 @@ def get_most_authorized_role(permissions: LegacyAccessControlCompatiblePermissio return min({p.fallback_role for p in permissions}, key=lambda r: r.value) -def user_is_authorized( - user: "User", - required_permissions: LegacyAccessControlCompatiblePermissions, - required_basic_role_permission: LegacyAccessControlRole = None, -) -> bool: +def user_is_authorized(user: "User", required_permissions: LegacyAccessControlCompatiblePermissions) -> bool: """ - This function checks whether `user` has all necessary permissions. If `required_basic_role_permission` is set, - it only checks the basic user role, otherwise it checks whether `user` has all permissions in - `required_permissions`. + This function checks whether `user` has all necessary permissions specified in `required_permissions`. 
RBAC permissions are used if RBAC is enabled for the organization, otherwise the fallback basic role is checked. - user - The user to check permissions for - required_permissions - A list of permissions that a user must have to be considered authorized - required_basic_role_permission - Min basic role user must have to be considered authorized (used in cases when - it's needed to check ONLY the basic user role, otherwise `required_permissions` should be used) + `user` - The user to check permissions for + `required_permissions` - A list of permissions that a user must have to be considered authorized """ - if required_basic_role_permission is not None: - return user.role <= required_basic_role_permission.value if user.organization.is_rbac_permissions_enabled: user_permissions = [u["action"] for u in user.permissions] required_permission_values = [p.value for p in required_permissions] @@ -250,6 +247,17 @@ class Permissions: Resources.OTHER_SETTINGS, Actions.WRITE, LegacyAccessControlRole.ADMIN ) + # NOTE: we don't currently add the label delete permission here because we don't currently use this in OnCall + LABEL_CREATE = LegacyAccessControlCompatiblePermission( + Resources.LABEL, Actions.CREATE, LegacyAccessControlRole.EDITOR, prefix=PluginID.LABELS + ) + LABEL_READ = LegacyAccessControlCompatiblePermission( + Resources.LABEL, Actions.READ, LegacyAccessControlRole.VIEWER, prefix=PluginID.LABELS + ) + LABEL_WRITE = LegacyAccessControlCompatiblePermission( + Resources.LABEL, Actions.WRITE, LegacyAccessControlRole.EDITOR, prefix=PluginID.LABELS + ) + # mypy complains about "Liskov substitution principle" here because request is `AuthenticatedRequest` object # and not rest_framework.request.Request # https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides @@ -301,40 +309,6 @@ def has_object_permission(self, request: AuthenticatedRequest, view: ViewSetOrAP return True -class BasicRolePermission(permissions.BasePermission): - """Checks only basic 
user role permissions, regardless of whether RBAC is enabled for the organization""" - - # mypy complains about "Liskov substitution principle" here because request is `AuthenticatedRequest` object - # and not rest_framework.request.Request - # https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides - def has_permission(self, request: AuthenticatedRequest, view: ViewSetOrAPIView) -> bool: # type: ignore[override] - # the django-debug-toolbar UI makes OPTIONS calls. Without this statement the debug UI can't gather the - # necessary info it needs to work properly - if settings.DEBUG and request.method == "OPTIONS": - return True - action = get_view_action(request, view) - - basic_role_permissions: typing.Optional[BasicRolePermissionsAttribute] = getattr( - view, BASIC_ROLE_PERMISSIONS_ATTR, None - ) - - # first check that the basic_role_permissions dict attribute is defined - assert ( - basic_role_permissions is not None - ), f"Must define a {BASIC_ROLE_PERMISSIONS_ATTR} dict on the ViewSet that is consuming the role class" - - action_required_permissions: LegacyAccessControlRole = basic_role_permissions.get(action, None) - - # next check that the action in question is defined within the basic_role_permissions dict attribute - assert ( - action_required_permissions is not None - ), f"""Each action must be defined within the {BASIC_ROLE_PERMISSIONS_ATTR} dict on the ViewSet""" - - return user_is_authorized( - request.user, required_permissions=[], required_basic_role_permission=action_required_permissions - ) - - ALL_PERMISSION_NAMES = [perm for perm in dir(RBACPermission.Permissions) if not perm.startswith("_")] ALL_PERMISSION_CLASSES = [ getattr(RBACPermission.Permissions, permission_name) for permission_name in ALL_PERMISSION_NAMES diff --git a/engine/apps/api/serializers/alert_group.py b/engine/apps/api/serializers/alert_group.py index d4b2e65d5e..c0882658fb 100644 --- a/engine/apps/api/serializers/alert_group.py +++ 
b/engine/apps/api/serializers/alert_group.py @@ -2,7 +2,6 @@ import logging import typing -from django.conf import settings from django.core.cache import cache from django.db.models import Prefetch from django.utils import timezone @@ -133,26 +132,23 @@ class AlertGroupListSerializer( labels = AlertGroupLabelSerializer(many=True, read_only=True) - PREFETCH_RELATED: list[str | Prefetch] = [ + PREFETCH_RELATED = [ "dependent_alert_groups", "log_records__author", "labels", + Prefetch( + "slack_messages", + queryset=SlackMessage.objects.select_related("_slack_team_identity").order_by("created_at")[:1], + to_attr="prefetched_slack_messages", + ), + Prefetch( + "telegram_messages", + queryset=TelegramMessage.objects.filter( + chat_id__startswith="-", message_type=TelegramMessage.ALERT_GROUP_MESSAGE + ).order_by("id")[:1], + to_attr="prefetched_telegram_messages", + ), ] - if settings.ALERT_GROUP_LIST_TRY_PREFETCH: - PREFETCH_RELATED += [ - Prefetch( - "slack_messages", - queryset=SlackMessage.objects.select_related("_slack_team_identity").order_by("created_at")[:1], - to_attr="prefetched_slack_messages", - ), - Prefetch( - "telegram_messages", - queryset=TelegramMessage.objects.filter( - chat_id__startswith="-", message_type=TelegramMessage.ALERT_GROUP_MESSAGE - ).order_by("id")[:1], - to_attr="prefetched_telegram_messages", - ), - ] SELECT_RELATED = [ "channel__organization", diff --git a/engine/apps/api/serializers/escalation_policy.py b/engine/apps/api/serializers/escalation_policy.py index 75f3628488..f8b0270de8 100644 --- a/engine/apps/api/serializers/escalation_policy.py +++ b/engine/apps/api/serializers/escalation_policy.py @@ -3,6 +3,7 @@ from rest_framework import serializers from apps.alerts.models import EscalationChain, EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.schedules.models import OnCallSchedule from apps.slack.models import SlackUserGroup from apps.user_management.models import Team, User @@ -24,6 +25,7 @@ 
NUM_ALERTS_IN_WINDOW = "num_alerts_in_window" NUM_MINUTES_IN_WINDOW = "num_minutes_in_window" CUSTOM_WEBHOOK_TRIGGER = "custom_webhook" +SEVERITY = "severity" STEP_TYPE_TO_RELATED_FIELD_MAP = { EscalationPolicy.STEP_WAIT: [WAIT_DELAY], @@ -35,6 +37,7 @@ EscalationPolicy.STEP_NOTIFY_IF_TIME: [FROM_TIME, TO_TIME], EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: [NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW], EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK: [CUSTOM_WEBHOOK_TRIGGER], + EscalationPolicy.STEP_DECLARE_INCIDENT: [SEVERITY], } @@ -81,6 +84,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, serializers.ModelSerializer) allow_null=True, filter_field="organization", ) + severity = serializers.CharField(required=False, allow_null=True) class Meta: model = EscalationPolicy @@ -99,6 +103,7 @@ class Meta: "notify_schedule", "notify_to_group", "notify_to_team_members", + "severity", "important", ] @@ -123,6 +128,7 @@ def validate(self, data): NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW, CUSTOM_WEBHOOK_TRIGGER, + SEVERITY, ] step = data.get("step") @@ -151,6 +157,8 @@ def validate_step(self, step_type): raise serializers.ValidationError("Invalid step value") if step_type in EscalationPolicy.SLACK_INTEGRATION_REQUIRED_STEPS and organization.slack_team_identity is None: raise serializers.ValidationError("Invalid escalation step type: step is Slack-specific") + if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization): + raise serializers.ValidationError("Invalid escalation step type: step is not enabled") return step_type def to_representation(self, instance): @@ -214,6 +222,7 @@ def _drop_not_step_type_related_fields(step, validated_data): NUM_ALERTS_IN_WINDOW, NUM_MINUTES_IN_WINDOW, CUSTOM_WEBHOOK_TRIGGER, + SEVERITY, ] for f in STEP_TYPE_TO_RELATED_FIELD_MAP.get(step, []): diff --git a/engine/apps/api/tests/test_alert_group.py b/engine/apps/api/tests/test_alert_group.py index a015fccc3e..13080a155e 
100644 --- a/engine/apps/api/tests/test_alert_group.py +++ b/engine/apps/api/tests/test_alert_group.py @@ -975,6 +975,38 @@ def test_get_filter_labels( assert response.json()["results"][0]["pk"] == alert_groups[0].public_primary_key +@pytest.mark.django_db +def test_get_filter_by_related_incident( + alert_group_internal_api_setup, make_declared_incident, make_alert_group, make_user_auth_headers +): + user, token, alert_groups = alert_group_internal_api_setup + + alert_group = alert_groups[0] + declared_incident = make_declared_incident("1", alert_group.channel.organization, alert_group.channel_filter) + alert_group.declared_incident = declared_incident + alert_group.save() + + client = APIClient() + url = reverse("api-internal:alertgroup-list") + response = client.get( + url + "?has_related_incident=true", + format="json", + **make_user_auth_headers(user, token), + ) + + assert response.status_code == status.HTTP_200_OK + assert len(response.data["results"]) == 1 + + response = client.get( + url + "?has_related_incident=false", + format="json", + **make_user_auth_headers(user, token), + ) + + assert response.status_code == status.HTTP_200_OK + assert len(response.data["results"]) == 3 + + @pytest.mark.django_db def test_get_title_search( settings, diff --git a/engine/apps/api/tests/test_escalation_policy.py b/engine/apps/api/tests/test_escalation_policy.py index e6908d7a0f..9ef3dfbd23 100644 --- a/engine/apps/api/tests/test_escalation_policy.py +++ b/engine/apps/api/tests/test_escalation_policy.py @@ -10,6 +10,7 @@ from apps.alerts.models import EscalationPolicy from apps.api.permissions import LegacyAccessControlRole +from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIException @pytest.fixture() @@ -651,8 +652,13 @@ def test_create_escalation_policy_with_no_important_version( make_escalation_chain, step, make_user_auth_headers, + settings, ): organization, user, _, _ = make_organization_and_user_with_slack_identities() + # make sure 
declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() _, token = make_token_for_organization(organization) escalation_chain = make_escalation_chain(organization) @@ -828,6 +834,7 @@ def test_escalation_policy_switch_importance( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": True, "wait_delay": None, } @@ -885,6 +892,7 @@ def test_escalation_policy_filter_by_user( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": False, }, { @@ -902,6 +910,7 @@ def test_escalation_policy_filter_by_user( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": False, }, ] @@ -967,6 +976,7 @@ def test_escalation_policy_filter_by_slack_channel( "notify_schedule": None, "notify_to_group": None, "notify_to_team_members": None, + "severity": None, "important": False, }, ] @@ -997,3 +1007,88 @@ def test_escalation_policy_escalation_options_webhooks( returned_options = [option["value"] for option in response.json()] assert EscalationPolicy.STEP_TRIGGER_CUSTOM_WEBHOOK in returned_options + + +@pytest.mark.django_db +def test_escalation_policy_severity_options( + make_organization_and_user_with_plugin_token, + make_user_auth_headers, +): + organization, user, token = make_organization_and_user_with_plugin_token() + organization.is_grafana_labels_enabled = False + organization.save() + + client = APIClient() + url = reverse("api-internal:escalation_policy-severity-options") + + # without labels enabled + available_severities = [ + {"severityID": "abc", "orgID": "1", "displayLabel": "Pending", "level": -1}, + {"severityID": "def", "orgID": "1", "displayLabel": "Critical", "level": 1}, + ] + with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities: + 
mock_get_severities.return_value = available_severities, None + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + + expected_options = [{"value": s["displayLabel"], "display_name": s["displayLabel"]} for s in available_severities] + assert response.json() == expected_options + + # failing request does not break; fallback to default option only + with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities: + mock_get_severities.side_effect = IncidentAPIException(status=404, url="some-url") + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + + fallback_options = [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}] + assert response.json() == fallback_options + + # labels enabled + organization.is_grafana_labels_enabled = True + organization.save() + + with patch("common.incident_api.client.IncidentAPIClient.get_severities") as mock_get_severities: + mock_get_severities.return_value = available_severities, None + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + # include set from label option + expected_options = [ + { + "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL, + "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE, + } + ] + expected_options + assert response.json() == expected_options + + +@pytest.mark.django_db +def test_create_escalation_policy_declare_incident( + escalation_policy_internal_api_setup, make_user_auth_headers, settings +): + token, escalation_chain, _, user, _ = escalation_policy_internal_api_setup + organization = escalation_chain.organization + client = APIClient() + url = reverse("api-internal:escalation_policy-list") + + data = { + "step": EscalationPolicy.STEP_DECLARE_INCIDENT, + "severity": "critical", + "escalation_chain": escalation_chain.public_primary_key, + } + + response = client.post(url, data, format="json", **make_user_auth_headers(user, 
token)) + assert response.status_code == status.HTTP_400_BAD_REQUEST + + # make sure declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() + + response = client.post(url, data, format="json", **make_user_auth_headers(user, token)) + assert response.status_code == status.HTTP_201_CREATED + escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) + assert escalation_policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert escalation_policy.severity == "critical" + + url = reverse("api-internal:escalation_policy-detail", kwargs={"pk": escalation_policy.public_primary_key}) + response = client.get(url, format="json", **make_user_auth_headers(user, token)) + response_data = response.json() + assert response_data["step"] == EscalationPolicy.STEP_DECLARE_INCIDENT + assert response_data["severity"] == "critical" diff --git a/engine/apps/api/tests/test_labels.py b/engine/apps/api/tests/test_labels.py index a0bccb3afc..2c36363edf 100644 --- a/engine/apps/api/tests/test_labels.py +++ b/engine/apps/api/tests/test_labels.py @@ -23,9 +23,8 @@ def test_labels_get_keys( mocked_get_labels_keys, make_organization_and_user_with_plugin_token, make_user_auth_headers, - make_alert_receive_channel, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:get_keys") response = client.get(url, format="json", **make_user_auth_headers(user, token)) @@ -49,7 +48,7 @@ def test_get_update_key_get( make_organization_and_user_with_plugin_token, make_user_auth_headers, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:get_update_key", kwargs={"key_id": "keyid123"}) response = 
client.get(url, format="json", **make_user_auth_headers(user, token)) @@ -73,7 +72,7 @@ def test_get_update_key_put( make_organization_and_user_with_plugin_token, make_user_auth_headers, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:get_update_key", kwargs={"key_id": "keyid123"}) data = {"name": "team"} @@ -98,7 +97,7 @@ def test_add_value( make_organization_and_user_with_plugin_token, make_user_auth_headers, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:add_value", kwargs={"key_id": "keyid123"}) data = {"name": "yolo"} @@ -123,7 +122,7 @@ def test_rename_value( make_organization_and_user_with_plugin_token, make_user_auth_headers, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:get_update_value", kwargs={"key_id": "keyid123", "value_id": "valueid123"}) data = {"name": "yolo"} @@ -148,7 +147,7 @@ def test_get_value( make_organization_and_user_with_plugin_token, make_user_auth_headers, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:get_update_value", kwargs={"key_id": "keyid123", "value_id": "valueid123"}) response = client.get(url, format="json", **make_user_auth_headers(user, token)) @@ -172,7 +171,7 @@ def test_labels_create_label( make_organization_and_user_with_plugin_token, make_user_auth_headers, ): - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = 
reverse("api-internal:create_label") data = {"key": {"name": "team"}, "values": [{"name": "yolo"}]} @@ -192,7 +191,7 @@ def test_labels_feature_false( ): settings.FEATURE_LABELS_ENABLED_FOR_ALL = False - organization, user, token = make_organization_and_user_with_plugin_token() + _, user, token = make_organization_and_user_with_plugin_token() client = APIClient() url = reverse("api-internal:get_keys") @@ -240,7 +239,7 @@ def test_labels_permissions_get_actions( role, expected_status, ): - organization, user, token = make_organization_and_user_with_plugin_token(role) + _, user, token = make_organization_and_user_with_plugin_token(role) client = APIClient() with patch("apps.api.views.labels.LabelsViewSet.get_keys", return_value=Response(status=status.HTTP_200_OK)): url = reverse("api-internal:get_keys") @@ -274,7 +273,7 @@ def test_labels_permissions_create_update_actions( role, expected_status, ): - organization, user, token = make_organization_and_user_with_plugin_token(role) + _, user, token = make_organization_and_user_with_plugin_token(role) client = APIClient() with patch("apps.api.views.labels.LabelsViewSet.rename_key", return_value=Response(status=status.HTTP_200_OK)): url = reverse("api-internal:get_update_key", kwargs={"key_id": "keyid123"}) diff --git a/engine/apps/api/tests/test_permissions.py b/engine/apps/api/tests/test_permissions.py index eef1c5dfaf..46bf59fb3f 100644 --- a/engine/apps/api/tests/test_permissions.py +++ b/engine/apps/api/tests/test_permissions.py @@ -5,10 +5,7 @@ from rest_framework.viewsets import ViewSetMixin from apps.api.permissions import ( - BASIC_ROLE_PERMISSIONS_ATTR, RBAC_PERMISSIONS_ATTR, - BasicRolePermission, - BasicRolePermissionsAttribute, GrafanaAPIPermission, HasRBACPermissions, IsOwner, @@ -60,7 +57,6 @@ def __init__( action: str, rbac_permissions: typing.Optional[RBACPermissionsAttribute] = None, rbac_object_permissions: typing.Optional[RBACObjectPermissionsAttribute] = None, - basic_role_permissions: 
typing.Optional[BasicRolePermissionsAttribute] = None, ) -> None: super().__init__() self.action = action @@ -69,8 +65,6 @@ def __init__( self.rbac_permissions = rbac_permissions if rbac_object_permissions: self.rbac_object_permissions = rbac_object_permissions - if basic_role_permissions: - self.basic_role_permissions = basic_role_permissions class MockedAPIView(APIView): @@ -78,7 +72,6 @@ def __init__( self, rbac_permissions: typing.Optional[RBACPermissionsAttribute] = None, rbac_object_permissions: typing.Optional[RBACObjectPermissionsAttribute] = None, - basic_role_permissions: typing.Optional[BasicRolePermissionsAttribute] = None, ) -> None: super().__init__() @@ -86,8 +79,6 @@ def __init__( self.rbac_permissions = rbac_permissions if rbac_object_permissions: self.rbac_object_permissions = rbac_object_permissions - if basic_role_permissions: - self.basic_role_permissions = basic_role_permissions @pytest.mark.parametrize( @@ -460,142 +451,3 @@ def __init__(self, schedule: MockedSchedule) -> None: assert PermClass.has_object_permission(request, None, thingy) is True assert PermClass.has_object_permission(MockedRequest(MockedUser([])), None, thingy) is False - - -@pytest.mark.parametrize( - "role,required_role,org_has_rbac_enabled,expected_result", - [ - ( - LegacyAccessControlRole.VIEWER, - LegacyAccessControlRole.VIEWER, - True, - True, - ), - ( - LegacyAccessControlRole.VIEWER, - LegacyAccessControlRole.VIEWER, - False, - True, - ), - ( - LegacyAccessControlRole.ADMIN, - LegacyAccessControlRole.VIEWER, - True, - True, - ), - ( - LegacyAccessControlRole.ADMIN, - LegacyAccessControlRole.VIEWER, - False, - True, - ), - ( - LegacyAccessControlRole.VIEWER, - LegacyAccessControlRole.ADMIN, - True, - False, - ), - ( - LegacyAccessControlRole.VIEWER, - LegacyAccessControlRole.ADMIN, - False, - False, - ), - ], -) -def test_user_is_authorized_basic_role( - role, - required_role, - org_has_rbac_enabled, - expected_result, -) -> None: - user = MockedUser([], 
org_has_rbac_enabled=org_has_rbac_enabled, basic_role=role) - assert user_is_authorized(user, [], required_role) == expected_result - - -class TestBasicRolePermission: - def test_has_permission_works_on_a_viewset_view(self) -> None: - required_role = LegacyAccessControlRole.VIEWER - - action = "hello" - viewset = MockedViewSet( - action=action, - basic_role_permissions={ - action: required_role, - }, - ) - - user_with_permission = MockedUser([], basic_role=required_role) - user_without_permission = MockedUser([], basic_role=LegacyAccessControlRole.NONE) - - assert ( - BasicRolePermission().has_permission(MockedRequest(user_with_permission), viewset) is True - ), "it works on a viewset when the user does have permission" - - assert ( - BasicRolePermission().has_permission(MockedRequest(user_without_permission), viewset) is False - ), "it works on a viewset when the user does have permission" - - def test_has_permission_works_on_an_apiview_view(self) -> None: - required_role = LegacyAccessControlRole.VIEWER - - method = "hello" - apiview = MockedAPIView( - basic_role_permissions={ - method: required_role, - }, - ) - - user_with_permission = MockedUser([], basic_role=required_role) - user_without_permission = MockedUser([], basic_role=LegacyAccessControlRole.NONE) - - class Request(MockedRequest): - def __init__(self, user: typing.Optional[MockedUser] = None) -> None: - super().__init__(user, method) - - assert ( - BasicRolePermission().has_permission(Request(user_with_permission), apiview) is True - ), "it works on an APIView when the user has permission" - - assert ( - BasicRolePermission().has_permission(Request(user_without_permission), apiview) is False - ), "it works on an APIView when the user does not have permission" - - def test_has_permission_throws_assertion_error_if_developer_forgets_to_specify_basic_role_permissions(self) -> None: - action_slash_method = "hello" - error_msg = f"Must define a {BASIC_ROLE_PERMISSIONS_ATTR} dict on the ViewSet that is 
consuming the role class" - - viewset = MockedViewSet(action_slash_method) - apiview = MockedAPIView() - - with pytest.raises(AssertionError, match=error_msg): - BasicRolePermission().has_permission(MockedRequest(), viewset) - - with pytest.raises(AssertionError, match=error_msg): - BasicRolePermission().has_permission(MockedRequest(method=action_slash_method), apiview) - - def test_has_permission_throws_assertion_error_if_developer_forgets_to_specify_an_action_in_basic_role_permissions( - self, - ) -> None: - action_slash_method = "hello" - other_action_role_permissions = {"bonjour": LegacyAccessControlRole.VIEWER} - error_msg = f"""Each action must be defined within the {BASIC_ROLE_PERMISSIONS_ATTR} dict on the ViewSet""" - - viewset = MockedViewSet(action_slash_method, basic_role_permissions=other_action_role_permissions) - apiview = MockedAPIView(basic_role_permissions=other_action_role_permissions) - - with pytest.raises(AssertionError, match=error_msg): - BasicRolePermission().has_permission(MockedRequest(), viewset) - - with pytest.raises(AssertionError, match=error_msg): - BasicRolePermission().has_permission(MockedRequest(method=action_slash_method), apiview) - - def test_has_object_permission_returns_true(self) -> None: - action = "hello" - - request = MockedRequest(None, action) - apiview = MockedAPIView() - viewset = MockedViewSet(action) - - assert BasicRolePermission().has_object_permission(request, apiview, None) is True - assert BasicRolePermission().has_object_permission(request, viewset, None) is True diff --git a/engine/apps/api/views/alert_group.py b/engine/apps/api/views/alert_group.py index 35ef836a68..5b158d1228 100644 --- a/engine/apps/api/views/alert_group.py +++ b/engine/apps/api/views/alert_group.py @@ -1,9 +1,8 @@ -import typing from datetime import timedelta from django.conf import settings from django.core.exceptions import ObjectDoesNotExist -from django.db.models import Count, Max, Q +from django.db.models import Q from django.utils 
import timezone from django_filters import rest_framework as filters from drf_spectacular.utils import extend_schema, inline_serializer @@ -15,9 +14,10 @@ from rest_framework.response import Response from apps.alerts.constants import ActionSource -from apps.alerts.models import Alert, AlertGroup, AlertReceiveChannel, EscalationChain, ResolutionNote +from apps.alerts.models import AlertGroup, AlertReceiveChannel, EscalationChain, ResolutionNote from apps.alerts.paging import unpage_user from apps.alerts.tasks import delete_alert_group, send_update_resolution_note_signal +from apps.alerts.utils import is_declare_incident_step_enabled from apps.api.errors import AlertGroupAPIError from apps.api.label_filtering import parse_label_query from apps.api.permissions import RBACPermission @@ -36,7 +36,12 @@ ModelFieldFilterMixin, MultipleChoiceCharFilter, ) -from common.api_helpers.mixins import PreviewTemplateMixin, PublicPrimaryKeyMixin, TeamFilteringMixin +from common.api_helpers.mixins import ( + AlertGroupEnrichingMixin, + PreviewTemplateMixin, + PublicPrimaryKeyMixin, + TeamFilteringMixin, +) from common.api_helpers.paginators import AlertGroupCursorPaginator @@ -116,6 +121,7 @@ class AlertGroupFilter(DateRangeFilterMixin, ModelFieldFilterMixin, filters.Filt ) with_resolution_note = filters.BooleanFilter(method="filter_with_resolution_note") mine = filters.BooleanFilter(method="filter_mine") + has_related_incident = filters.BooleanFilter(field_name="declared_incident", lookup_expr="isnull", exclude=True) def filter_status(self, queryset, name, value): if not value: @@ -255,6 +261,7 @@ def get_search_fields(self, view, request): class AlertGroupView( + AlertGroupEnrichingMixin, PreviewTemplateMixin, AlertGroupTeamFilteringMixin, PublicPrimaryKeyMixin[AlertGroup], @@ -354,19 +361,8 @@ def get_queryset(self, ignore_filtering_by_available_teams=False): labels__value_name=value, ) - queryset = queryset.only("id") - return queryset - def paginate_queryset(self, queryset): - 
""" - All SQL joins (select_related and prefetch_related) will be performed AFTER pagination, so it only joins tables - for 25 alert groups, not the whole table. - """ - alert_groups = super().paginate_queryset(queryset) - alert_groups = self.enrich(alert_groups) - return alert_groups - def get_object(self): obj = super().get_object() obj = self.enrich([obj])[0] @@ -432,48 +428,6 @@ def retrieve(self, request, *args, **kwargs): """ return super().retrieve(request, *args, **kwargs) - def enrich(self, alert_groups: typing.List[AlertGroup]) -> typing.List[AlertGroup]: - """ - This method performs select_related and prefetch_related (using setup_eager_loading) as well as in-memory joins - to add additional info like alert_count and last_alert for every alert group efficiently. - We need the last_alert because it's used by AlertGroupWebRenderer. - """ - - # enrich alert groups with select_related and prefetch_related - alert_group_pks = [alert_group.pk for alert_group in alert_groups] - queryset = AlertGroup.objects.filter(pk__in=alert_group_pks).order_by("-started_at") - - queryset = self.get_serializer_class().setup_eager_loading(queryset) - alert_groups = list(queryset) - - # get info on alerts count and last alert ID for every alert group - alerts_info = ( - Alert.objects.values("group_id") - .filter(group_id__in=alert_group_pks) - .annotate(alerts_count=Count("group_id"), last_alert_id=Max("id")) - ) - alerts_info_map = {info["group_id"]: info for info in alerts_info} - - # fetch last alerts for every alert group - last_alert_ids = [info["last_alert_id"] for info in alerts_info_map.values()] - last_alerts = Alert.objects.filter(pk__in=last_alert_ids) - for alert in last_alerts: - # link group back to alert - alert.group = [alert_group for alert_group in alert_groups if alert_group.pk == alert.group_id][0] - alerts_info_map[alert.group_id].update({"last_alert": alert}) - - # add additional "alerts_count" and "last_alert" fields to every alert group - for alert_group 
in alert_groups: - try: - alert_group.last_alert = alerts_info_map[alert_group.pk]["last_alert"] - alert_group.alerts_count = alerts_info_map[alert_group.pk]["alerts_count"] - except KeyError: - # alert group has no alerts - alert_group.last_alert = None - alert_group.alerts_count = 0 - - return alert_groups - def destroy(self, request, *args, **kwargs): instance = self.get_object() delete_alert_group.apply_async((instance.pk, request.user.pk)) @@ -767,6 +721,7 @@ def filters(self, request): """ Retrieve a list of valid filter options that can be used to filter alert groups """ + organization = self.request.auth.organization api_root = "/api/internal/v1/" default_day_range = 30 @@ -852,7 +807,7 @@ def filters(self, request): filter_options = [{"name": "search", "type": "search", "description": description}] + filter_options - if is_labels_feature_enabled(self.request.auth.organization): + if is_labels_feature_enabled(organization): filter_options.append( { "name": "label", @@ -861,6 +816,15 @@ def filters(self, request): } ) + if is_declare_incident_step_enabled(organization): + filter_options.append( + { + "name": "has_related_incident", + "type": "boolean", + "default": "true", + } + ) + return Response(filter_options) @extend_schema( diff --git a/engine/apps/api/views/escalation_policy.py b/engine/apps/api/views/escalation_policy.py index 945b634825..4dc32be079 100644 --- a/engine/apps/api/views/escalation_policy.py +++ b/engine/apps/api/views/escalation_policy.py @@ -1,3 +1,5 @@ +import logging + from django.conf import settings from django.db.models import Q from rest_framework.decorators import action @@ -5,6 +7,7 @@ from rest_framework.response import Response from apps.alerts.models import EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.api.permissions import RBACPermission from apps.api.serializers.escalation_policy import ( EscalationPolicyCreateSerializer, @@ -19,9 +22,12 @@ TeamFilteringMixin, 
UpdateSerializerMixin, ) +from common.incident_api.client import DEFAULT_INCIDENT_SEVERITY, IncidentAPIClient, IncidentAPIException from common.insight_log import EntityEvent, write_resource_insight_log from common.ordered_model.viewset import OrderedModelViewSet +logger = logging.getLogger(__name__) + class EscalationPolicyView( TeamFilteringMixin, @@ -42,6 +48,7 @@ class EscalationPolicyView( "escalation_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], "delay_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], "num_minutes_in_window_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], + "severity_options": [RBACPermission.Permissions.ESCALATION_CHAINS_READ], "create": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE], "update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE], "partial_update": [RBACPermission.Permissions.ESCALATION_CHAINS_WRITE], @@ -116,6 +123,7 @@ def perform_destroy(self, instance): @action(detail=False, methods=["get"]) def escalation_options(self, request): + grafana_declare_incident_enabled = is_declare_incident_step_enabled(organization=self.request.auth.organization) choices = [] for step in EscalationPolicy.INTERNAL_API_STEPS: verbal = EscalationPolicy.INTERNAL_API_STEPS_TO_VERBAL_MAP[step] @@ -125,6 +133,8 @@ def escalation_options(self, request): slack_integration_required = step in EscalationPolicy.SLACK_INTEGRATION_REQUIRED_STEPS if slack_integration_required and not settings.FEATURE_SLACK_INTEGRATION_ENABLED: continue + if step == EscalationPolicy.STEP_DECLARE_INCIDENT and not grafana_declare_incident_enabled: + continue choices.append( { "value": step, @@ -151,3 +161,25 @@ def num_minutes_in_window_options(self, request): {"value": choice[0], "display_name": choice[1]} for choice in EscalationPolicy.WEB_DURATION_CHOICES_MINUTES ] return Response(choices) + + @action(detail=False, methods=["get"]) + def severity_options(self, request): + organization = self.request.auth.organization + 
choices = [] + if organization.is_grafana_labels_enabled: + choices = [ + { + "value": EscalationPolicy.SEVERITY_SET_FROM_LABEL, + "display_name": EscalationPolicy.SEVERITY_SET_FROM_LABEL_DISPLAY_VALUE, + } + ] + incident_client = IncidentAPIClient(organization.grafana_url, organization.api_token) + try: + severities, _ = incident_client.get_severities() + choices += [ + {"value": severity["displayLabel"], "display_name": severity["displayLabel"]} for severity in severities + ] + except IncidentAPIException as e: + logger.error(f"Error getting severities: {e.msg}") + choices += [{"value": DEFAULT_INCIDENT_SEVERITY, "display_name": DEFAULT_INCIDENT_SEVERITY}] + return Response(choices) diff --git a/engine/apps/api/views/labels.py b/engine/apps/api/views/labels.py index 4ab74bb356..d27215f2e3 100644 --- a/engine/apps/api/views/labels.py +++ b/engine/apps/api/views/labels.py @@ -7,7 +7,7 @@ from rest_framework.response import Response from rest_framework.viewsets import ViewSet -from apps.api.permissions import BasicRolePermission, LegacyAccessControlRole +from apps.api.permissions import RBACPermission from apps.api.serializers.labels import ( LabelKeySerializer, LabelOptionSerializer, @@ -35,16 +35,16 @@ class LabelsViewSet(LabelsFeatureFlagViewSet): Proxy requests to labels-app to create/update labels """ - permission_classes = (IsAuthenticated, BasicRolePermission) + permission_classes = (IsAuthenticated, RBACPermission) authentication_classes = (PluginAuthentication,) - basic_role_permissions = { - "get_keys": LegacyAccessControlRole.VIEWER, - "get_key": LegacyAccessControlRole.VIEWER, - "get_value": LegacyAccessControlRole.VIEWER, - "rename_key": LegacyAccessControlRole.EDITOR, - "create_label": LegacyAccessControlRole.EDITOR, - "add_value": LegacyAccessControlRole.EDITOR, - "rename_value": LegacyAccessControlRole.EDITOR, + rbac_permissions = { + "create_label": [RBACPermission.Permissions.LABEL_CREATE], + "rename_key": [RBACPermission.Permissions.LABEL_WRITE], 
+ "add_value": [RBACPermission.Permissions.LABEL_WRITE], + "rename_value": [RBACPermission.Permissions.LABEL_WRITE], + "get_keys": [RBACPermission.Permissions.LABEL_READ], + "get_key": [RBACPermission.Permissions.LABEL_READ], + "get_value": [RBACPermission.Permissions.LABEL_READ], } @extend_schema(responses=LabelKeySerializer(many=True)) @@ -160,11 +160,11 @@ class AlertGroupLabelsViewSet(LabelsFeatureFlagViewSet): Alert group labels are stored in the database, not in the label repo. """ - permission_classes = (IsAuthenticated, BasicRolePermission) + permission_classes = (IsAuthenticated, RBACPermission) authentication_classes = (PluginAuthentication,) - basic_role_permissions = { - "get_keys": LegacyAccessControlRole.VIEWER, - "get_key": LegacyAccessControlRole.VIEWER, + rbac_permissions = { + "get_keys": [RBACPermission.Permissions.ALERT_GROUPS_READ], + "get_key": [RBACPermission.Permissions.ALERT_GROUPS_READ], } @extend_schema(responses=LabelKeySerializer(many=True)) diff --git a/engine/apps/grafana_plugin/helpers/client.py b/engine/apps/grafana_plugin/helpers/client.py index 4ed9c34692..a5c1131c38 100644 --- a/engine/apps/grafana_plugin/helpers/client.py +++ b/engine/apps/grafana_plugin/helpers/client.py @@ -8,7 +8,8 @@ from django.conf import settings from rest_framework import status -from apps.api.permissions import ACTION_PREFIX, GrafanaAPIPermission +from apps.api.permissions import GrafanaAPIPermission +from common.constants.plugin_ids import PluginID logger = logging.getLogger(__name__) @@ -160,11 +161,9 @@ def request_headers(self) -> APIRequestHeaders: class GrafanaAPIClient(APIClient): - GRAFANA_INCIDENT_PLUGIN = "grafana-incident-app" GRAFANA_INCIDENT_PLUGIN_BACKEND_URL_KEY = "backendUrl" - GRAFANA_LABELS_PLUGIN = "grafana-labels-app" - USER_PERMISSION_ENDPOINT = f"api/access-control/users/permissions/search?actionPrefix={ACTION_PREFIX}" + USER_PERMISSION_ENDPOINT = f"api/access-control/users/permissions/search?actionPrefix={PluginID.ONCALL}" 
MIN_GRAFANA_TOKEN_LENGTH = 16 @@ -305,10 +304,10 @@ def get_grafana_plugin_settings(self, recipient: str) -> APIClientResponse["Graf return self.api_get(f"api/plugins/{recipient}/settings") def get_grafana_incident_plugin_settings(self) -> APIClientResponse["GrafanaAPIClient.Types.PluginSettings"]: - return self.get_grafana_plugin_settings(self.GRAFANA_INCIDENT_PLUGIN) + return self.get_grafana_plugin_settings(PluginID.INCIDENT) def get_grafana_labels_plugin_settings(self) -> APIClientResponse["GrafanaAPIClient.Types.PluginSettings"]: - return self.get_grafana_plugin_settings(self.GRAFANA_LABELS_PLUGIN) + return self.get_grafana_plugin_settings(PluginID.LABELS) def get_service_account(self, login: str) -> APIClientResponse["GrafanaAPIClient.Types.ServiceAccountResponse"]: return self.api_get(f"api/serviceaccounts/search?query={login}") diff --git a/engine/apps/labels/client.py b/engine/apps/labels/client.py index 0e9a63dd4a..3310694d3e 100644 --- a/engine/apps/labels/client.py +++ b/engine/apps/labels/client.py @@ -5,6 +5,8 @@ import requests from django.conf import settings +from common.constants.plugin_ids import PluginID + if typing.TYPE_CHECKING: from apps.labels.types import LabelKey, LabelOption, LabelValue @@ -33,7 +35,7 @@ def __str__(self): class LabelsAPIClient: - LABELS_API_URL = "/api/plugins/grafana-labels-app/resources/v1/labels/" + LABELS_API_URL = f"/api/plugins/{PluginID.LABELS}/resources/v1/labels/" def __init__(self, api_url: str, api_token: str) -> None: self.api_token = api_token diff --git a/engine/apps/public_api/serializers/alert_groups.py b/engine/apps/public_api/serializers/alert_groups.py index 07f1d0fbc6..5218bd1305 100644 --- a/engine/apps/public_api/serializers/alert_groups.py +++ b/engine/apps/public_api/serializers/alert_groups.py @@ -1,7 +1,11 @@ +from django.db.models import Prefetch from rest_framework import serializers from apps.alerts.models import AlertGroup from apps.api.serializers.alert_group import 
AlertGroupLabelSerializer +from apps.public_api.serializers.alerts import AlertSerializer +from apps.slack.models import SlackMessage +from apps.telegram.models import TelegramMessage from common.api_helpers.custom_fields import TeamPrimaryKeyRelatedField, UserIdField from common.api_helpers.mixins import EagerLoadingMixin @@ -18,9 +22,31 @@ class AlertGroupSerializer(EagerLoadingMixin, serializers.ModelSerializer): acknowledged_by = UserIdField(read_only=True, source="acknowledged_by_user") resolved_by = UserIdField(read_only=True, source="resolved_by_user") labels = AlertGroupLabelSerializer(many=True, read_only=True) + last_alert = serializers.SerializerMethodField() - SELECT_RELATED = ["channel", "channel_filter", "slack_message", "channel__organization", "channel__team"] - PREFETCH_RELATED = ["labels"] + SELECT_RELATED = [ + "channel", + "channel_filter", + "channel__organization", + "channel__team", + "acknowledged_by_user", + "resolved_by_user", + ] + PREFETCH_RELATED = [ + "labels", + Prefetch( + "slack_messages", + queryset=SlackMessage.objects.select_related("_slack_team_identity").order_by("created_at")[:1], + to_attr="prefetched_slack_messages", + ), + Prefetch( + "telegram_messages", + queryset=TelegramMessage.objects.filter( + chat_id__startswith="-", message_type=TelegramMessage.ALERT_GROUP_MESSAGE + ).order_by("id")[:1], + to_attr="prefetched_telegram_messages", + ), + ] class Meta: model = AlertGroup @@ -40,14 +66,12 @@ class Meta: "title", "permalinks", "silenced_at", + "last_alert", ] def get_title(self, obj): return obj.web_title_cache - def get_alerts_count(self, obj): - return obj.alerts.count() - def get_state(self, obj): return obj.state @@ -56,3 +80,20 @@ def get_route_id(self, obj): return obj.channel_filter.public_primary_key else: return None + + def get_last_alert(self, obj): + if hasattr(obj, "last_alert"): # could be set by AlertGroupEnrichingMixin.enrich + last_alert = obj.last_alert + else: + last_alert = 
obj.alerts.order_by("-created_at").first() + + if last_alert is None: + return None + + return AlertSerializer(last_alert).data + + def get_alerts_count(self, obj): + if hasattr(obj, "alerts_count"): # could be set by AlertGroupEnrichingMixin.enrich + return obj.alerts_count + + return obj.alerts.count() diff --git a/engine/apps/public_api/serializers/escalation_policies.py b/engine/apps/public_api/serializers/escalation_policies.py index ba40ff3030..54fb35addb 100644 --- a/engine/apps/public_api/serializers/escalation_policies.py +++ b/engine/apps/public_api/serializers/escalation_policies.py @@ -5,6 +5,7 @@ from rest_framework import fields, serializers from apps.alerts.models import EscalationChain, EscalationPolicy +from apps.alerts.utils import is_declare_incident_step_enabled from apps.schedules.models import OnCallSchedule from apps.slack.models import SlackUserGroup from apps.user_management.models import Team, User @@ -72,6 +73,7 @@ class EscalationPolicySerializer(EagerLoadingMixin, OrderedModelSerializer): required=False, source="custom_webhook", ) + severity = serializers.CharField(required=False) important = serializers.BooleanField(required=False) TIME_FORMAT = "%H:%M:%SZ" @@ -101,6 +103,7 @@ class Meta: "notify_if_time_to", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] PREFETCH_RELATED = ["notify_to_users_queue"] @@ -120,6 +123,9 @@ def validate_type(self, step_type): if step_type == EscalationPolicy.STEP_FINAL_NOTIFYALL and organization.slack_team_identity is None: raise BadRequest(detail="Invalid escalation step type: step is Slack-specific") + if step_type == EscalationPolicy.STEP_DECLARE_INCIDENT and not is_declare_incident_step_enabled(organization): + raise BadRequest("Invalid escalation step type: step is not enabled") + return step_type def create(self, validated_data): @@ -163,6 +169,7 @@ def _get_field_to_represent(self, step, result): "notify_if_time_to", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] 
if step == EscalationPolicy.STEP_WAIT: fields_to_remove.remove("duration") @@ -190,6 +197,8 @@ def _get_field_to_represent(self, step, result): elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: fields_to_remove.remove("num_alerts_in_window") fields_to_remove.remove("num_minutes_in_window") + elif step == EscalationPolicy.STEP_DECLARE_INCIDENT: + fields_to_remove.remove("severity") if ( step in EscalationPolicy.DEFAULT_TO_IMPORTANT_STEP_MAPPING @@ -213,6 +222,7 @@ def _correct_validated_data(self, validated_data): "to_time", "num_alerts_in_window", "num_minutes_in_window", + "severity", ] step = validated_data.get("step") important = validated_data.pop("important", None) @@ -243,6 +253,8 @@ def _correct_validated_data(self, validated_data): elif step == EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: validated_data_fields_to_remove.remove("num_alerts_in_window") validated_data_fields_to_remove.remove("num_minutes_in_window") + elif step == EscalationPolicy.STEP_DECLARE_INCIDENT: + validated_data_fields_to_remove.remove("severity") for field in validated_data_fields_to_remove: validated_data.pop(field, None) @@ -299,5 +311,7 @@ def update(self, instance, validated_data): if step != EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: instance.num_alerts_in_window = None instance.num_minutes_in_window = None + if step != EscalationPolicy.STEP_DECLARE_INCIDENT: + instance.severity = None return super().update(instance, validated_data) diff --git a/engine/apps/public_api/tests/test_alert_groups.py b/engine/apps/public_api/tests/test_alert_groups.py index 758b0d9924..71421cd318 100644 --- a/engine/apps/public_api/tests/test_alert_groups.py +++ b/engine/apps/public_api/tests/test_alert_groups.py @@ -71,6 +71,12 @@ def user_pk_or_none(alert_group, user_field): "web": alert_group.web_link, }, "silenced_at": silenced_at, + "last_alert": { + "id": alert_group.alerts.last().public_primary_key, + "alert_group_id": 
alert_group.public_primary_key, + "created_at": alert_group.alerts.last().created_at.isoformat().replace("+00:00", "Z"), + "payload": alert_group.channel.config.example_payload, + }, } ) return { @@ -110,7 +116,7 @@ def alert_group_public_api_setup( make_alert(alert_group=grafana_alert_group_default_route, raw_request_data=grafana.config.example_payload) make_alert(alert_group=grafana_alert_group_non_default_route, raw_request_data=grafana.config.example_payload) - make_alert(alert_group=formatted_webhook_alert_group, raw_request_data=grafana.config.example_payload) + make_alert(alert_group=formatted_webhook_alert_group, raw_request_data=formatted_webhook.config.example_payload) integrations = grafana, formatted_webhook alert_groups = ( diff --git a/engine/apps/public_api/tests/test_escalation.py b/engine/apps/public_api/tests/test_escalation.py index 0fad2f71ee..5c4e0e772a 100644 --- a/engine/apps/public_api/tests/test_escalation.py +++ b/engine/apps/public_api/tests/test_escalation.py @@ -73,6 +73,12 @@ def test_escalation_new_alert_group( "web": f"a/grafana-oncall-app/alert-groups/{ag.public_primary_key}", }, "silenced_at": None, + "last_alert": { + "id": ag.alerts.last().public_primary_key, + "alert_group_id": ag.public_primary_key, + "created_at": ag.alerts.last().created_at.isoformat().replace("+00:00", "Z"), + "payload": ag.alerts.last().raw_request_data, + }, } alert = ag.alerts.get() diff --git a/engine/apps/public_api/tests/test_escalation_policies.py b/engine/apps/public_api/tests/test_escalation_policies.py index 9cf961acc6..e1d478da89 100644 --- a/engine/apps/public_api/tests/test_escalation_policies.py +++ b/engine/apps/public_api/tests/test_escalation_policies.py @@ -463,3 +463,43 @@ def test_update_escalation_policy_using_notify_team_members( escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) serializer = EscalationPolicySerializer(escalation_policy) assert response.data == serializer.data + + 
+@pytest.mark.django_db +def test_create_escalation_policy_declare_incident( + make_organization_and_user_with_token, + escalation_policies_setup, + settings, +): + organization, user, token = make_organization_and_user_with_token() + escalation_chain, _, _ = escalation_policies_setup(organization, user) + + data_for_create = { + "escalation_chain_id": escalation_chain.public_primary_key, + "type": "declare_incident", + "position": 0, + "severity": "critical", + } + + client = APIClient() + url = reverse("api-public:escalation_policies-list") + response = client.post(url, data=data_for_create, format="json", HTTP_AUTHORIZATION=token) + assert response.status_code == status.HTTP_400_BAD_REQUEST + + # make sure declare incident step is enabled + settings.FEATURE_DECLARE_INCIDENT_STEP_ENABLED = True + organization.is_grafana_incident_enabled = True + organization.save() + + response = client.post(url, data=data_for_create, format="json", HTTP_AUTHORIZATION=token) + assert response.status_code == status.HTTP_201_CREATED + + escalation_policy = EscalationPolicy.objects.get(public_primary_key=response.data["id"]) + assert escalation_policy.step == EscalationPolicy.STEP_DECLARE_INCIDENT + assert escalation_policy.severity == "critical" + + url = reverse("api-public:escalation_policies-detail", kwargs={"pk": escalation_policy.public_primary_key}) + response = client.get(url, format="json", HTTP_AUTHORIZATION=token) + response_data = response.json() + assert response_data["type"] == EscalationPolicy.PUBLIC_STEP_CHOICES_MAP[EscalationPolicy.STEP_DECLARE_INCIDENT] + assert response_data["severity"] == "critical" diff --git a/engine/apps/public_api/views/alert_groups.py b/engine/apps/public_api/views/alert_groups.py index c1b5fe1d54..738219d428 100644 --- a/engine/apps/public_api/views/alert_groups.py +++ b/engine/apps/public_api/views/alert_groups.py @@ -8,7 +8,7 @@ from rest_framework.viewsets import GenericViewSet from apps.alerts.constants import ActionSource -from 
apps.alerts.models import AlertGroup +from apps.alerts.models import AlertGroup, AlertReceiveChannel from apps.alerts.tasks import delete_alert_group, wipe from apps.api.label_filtering import parse_label_query from apps.auth_token.auth import ApiTokenAuthentication @@ -23,7 +23,7 @@ DateRangeFilterMixin, get_team_queryset, ) -from common.api_helpers.mixins import RateLimitHeadersMixin +from common.api_helpers.mixins import AlertGroupEnrichingMixin, RateLimitHeadersMixin from common.api_helpers.paginators import FiftyPageSizePaginator @@ -49,7 +49,12 @@ class AlertGroupFilters(ByTeamModelFieldFilterMixin, DateRangeFilterMixin, filte class AlertGroupView( - RateLimitHeadersMixin, mixins.ListModelMixin, mixins.RetrieveModelMixin, mixins.DestroyModelMixin, GenericViewSet + AlertGroupEnrichingMixin, + RateLimitHeadersMixin, + mixins.ListModelMixin, + mixins.RetrieveModelMixin, + mixins.DestroyModelMixin, + GenericViewSet, ): authentication_classes = (ApiTokenAuthentication,) permission_classes = (IsAuthenticated,) @@ -64,18 +69,23 @@ class AlertGroupView( filterset_class = AlertGroupFilters def get_queryset(self): + # no select_related or prefetch_related is used at this point, it will be done on paginate_queryset. 
+ route_id = self.request.query_params.get("route_id", None) integration_id = self.request.query_params.get("integration_id", None) state = self.request.query_params.get("state", None) - queryset = AlertGroup.objects.filter( - channel__organization=self.request.auth.organization, - ).order_by("-started_at") + alert_receive_channels_qs = AlertReceiveChannel.objects_with_deleted.filter( + organization_id=self.request.auth.organization.id + ) + if integration_id: + alert_receive_channels_qs = alert_receive_channels_qs.filter(public_primary_key=integration_id) + + alert_receive_channels_ids = list(alert_receive_channels_qs.values_list("id", flat=True)) + queryset = AlertGroup.objects.filter(channel__in=alert_receive_channels_ids).order_by("-started_at") if route_id: queryset = queryset.filter(channel_filter__public_primary_key=route_id) - if integration_id: - queryset = queryset.filter(channel__public_primary_key=integration_id) if state: choices = dict(AlertGroup.STATUS_CHOICES) try: @@ -112,9 +122,11 @@ def get_object(self): public_primary_key = self.kwargs["pk"] try: - return AlertGroup.objects.filter( + obj = AlertGroup.objects.filter( channel__organization=self.request.auth.organization, ).get(public_primary_key=public_primary_key) + obj = self.enrich([obj])[0] + return obj except AlertGroup.DoesNotExist: raise NotFound diff --git a/engine/common/api_helpers/mixins.py b/engine/common/api_helpers/mixins.py index d6e31c4261..552aaade45 100644 --- a/engine/common/api_helpers/mixins.py +++ b/engine/common/api_helpers/mixins.py @@ -4,7 +4,7 @@ from django.core.exceptions import ObjectDoesNotExist from django.db import models -from django.db.models import Q +from django.db.models import Count, Max, Q from django.utils.functional import cached_property from drf_spectacular.utils import extend_schema, inline_serializer from rest_framework import serializers, status @@ -21,6 +21,7 @@ AlertWebTemplater, TemplateLoader, ) +from apps.alerts.models import Alert, AlertGroup 
from apps.base.messaging import get_messaging_backends from common.api_helpers.exceptions import BadRequest from common.jinja_templater import apply_jinja_template @@ -411,3 +412,56 @@ def instance_context(self) -> InstanceContext: else: instance_context = None return instance_context + + +class AlertGroupEnrichingMixin: + def paginate_queryset(self, queryset): + """ + All SQL joins (select_related and prefetch_related) will be performed AFTER pagination, so it only joins tables + for one page of alert groups, not the whole table. + """ + alert_groups = super().paginate_queryset(queryset.only("id")) + alert_groups = self.enrich(alert_groups) + return alert_groups + + def enrich(self, alert_groups: typing.List[AlertGroup]) -> typing.List[AlertGroup]: + """ + This method performs select_related and prefetch_related (using setup_eager_loading) as well as in-memory joins + to add additional info like alerts_count and last_alert for every alert group efficiently. + We need the last_alert because it's used by AlertGroupWebRenderer.
+ """ + + # enrich alert groups with select_related and prefetch_related + alert_group_pks = [alert_group.pk for alert_group in alert_groups] + queryset = AlertGroup.objects.filter(pk__in=alert_group_pks).order_by("-started_at") + + queryset = self.get_serializer_class().setup_eager_loading(queryset) + alert_groups = list(queryset) + + # get info on alerts count and last alert ID for every alert group + alerts_info = ( + Alert.objects.values("group_id") + .filter(group_id__in=alert_group_pks) + .annotate(alerts_count=Count("group_id"), last_alert_id=Max("id")) + ) + alerts_info_map = {info["group_id"]: info for info in alerts_info} + + # fetch last alerts for every alert group + last_alert_ids = [info["last_alert_id"] for info in alerts_info_map.values()] + last_alerts = Alert.objects.filter(pk__in=last_alert_ids) + for alert in last_alerts: + # link group back to alert + alert.group = [alert_group for alert_group in alert_groups if alert_group.pk == alert.group_id][0] + alerts_info_map[alert.group_id].update({"last_alert": alert}) + + # add additional "alerts_count" and "last_alert" fields to every alert group + for alert_group in alert_groups: + try: + alert_group.last_alert = alerts_info_map[alert_group.pk]["last_alert"] + alert_group.alerts_count = alerts_info_map[alert_group.pk]["alerts_count"] + except KeyError: + # alert group has no alerts + alert_group.last_alert = None + alert_group.alerts_count = 0 + + return alert_groups diff --git a/engine/common/constants/plugin_ids.py b/engine/common/constants/plugin_ids.py new file mode 100644 index 0000000000..666c30e21c --- /dev/null +++ b/engine/common/constants/plugin_ids.py @@ -0,0 +1,7 @@ +class PluginID: + ONCALL = "grafana-oncall-app" + IRM = "grafana-irm-app" + + INCIDENT = "grafana-incident-app" + LABELS = "grafana-labels-app" + ML = "grafana-ml-app" diff --git a/engine/common/incident_api/client.py b/engine/common/incident_api/client.py index 41d2b28c06..7013b45a3c 100644 --- 
a/engine/common/incident_api/client.py +++ b/engine/common/incident_api/client.py @@ -5,6 +5,8 @@ import requests from django.conf import settings +from common.constants.plugin_ids import PluginID + class IncidentDetails(typing.TypedDict): # https://grafana.com/docs/grafana-cloud/alerting-and-irm/irm/incident/api/reference/#getincidentresponse @@ -73,13 +75,13 @@ def __str__(self): TIMEOUT = 5 -DEFAULT_INCIDENT_SEVERITY = "pending" +DEFAULT_INCIDENT_SEVERITY = "Pending" DEFAULT_INCIDENT_STATUS = "active" DEFAULT_ACTIVITY_KIND = "userNote" class IncidentAPIClient: - INCIDENT_BASE_PATH = "/api/plugins/grafana-incident-app/resources/" + INCIDENT_BASE_PATH = f"/api/plugins/{PluginID.INCIDENT}/resources/" def __init__(self, api_url: str, api_token: str) -> None: self.api_token = api_token @@ -90,18 +92,15 @@ def _request_headers(self): return {"User-Agent": settings.GRAFANA_COM_USER_AGENT, "Authorization": f"Bearer {self.api_token}"} def _check_response(self, response: requests.models.Response): - message = None + message = "" - if 400 <= response.status_code < 500: + if response.status_code >= 400: try: error_data = response.json() message = error_data.get("error", response.reason) except JSONDecodeError: message = response.reason - elif 500 <= response.status_code < 600: - message = response.reason - if message: raise IncidentAPIException( status=response.status_code, url=response.request.url, diff --git a/engine/common/incident_api/tests/test_client.py b/engine/common/incident_api/tests/test_client.py index 2fffad1daf..2ba6840d5b 100644 --- a/engine/common/incident_api/tests/test_client.py +++ b/engine/common/incident_api/tests/test_client.py @@ -182,9 +182,6 @@ def test_error_handling(endpoint, client_method_name, args): client_method = getattr(client, client_method_name) client_method(*args) assert excinfo.value.status == error_code - expected_error = ( - response_data["error"] if error_code == status.HTTP_400_BAD_REQUEST else "Internal Server Error" - ) - assert 
excinfo.value.msg == expected_error + assert excinfo.value.msg == response_data["error"] assert excinfo.value.url == url assert excinfo.value.method == "POST" diff --git a/engine/common/utils.py b/engine/common/utils.py index 395a1306bd..dc20b9eceb 100644 --- a/engine/common/utils.py +++ b/engine/common/utils.py @@ -249,7 +249,7 @@ def clean_markup(text): def escape_html(text): - return html.escape(text, quote=False) + return html.escape(text, quote=False) if text else text def urlize_with_respect_to_a(html): diff --git a/engine/conftest.py b/engine/conftest.py index 9cb362ada6..8e18b316c0 100644 --- a/engine/conftest.py +++ b/engine/conftest.py @@ -32,6 +32,7 @@ AlertReceiveChannelFactory, ChannelFilterFactory, CustomActionFactory, + DeclaredIncidentFactory, EscalationChainFactory, EscalationPolicyFactory, InvitationFactory, @@ -40,7 +41,6 @@ UserNotificationBundleFactory, ) from apps.api.permissions import ( - ACTION_PREFIX, GrafanaAPIPermission, LegacyAccessControlCompatiblePermission, LegacyAccessControlRole, @@ -111,6 +111,7 @@ TestAdvancedWebhookPreset, TestWebhookPreset, ) +from common.constants.plugin_ids import PluginID register(OrganizationFactory) register(UserFactory) @@ -355,11 +356,30 @@ class PluginJSON(typing.TypedDict): with open("../grafana-plugin/src/plugin.json") as fp: plugin_json: PluginJSON = json.load(fp) + # NOTE: we need to manually add grafana-labels-app permissions here since these + # are granted to basic roles via the grafana-labels-app itself, and not + # ../grafana-plugin/src/plugin.json + # + # However, we do sync these permissions into our backend. 
See + # https://github.com/grafana/irm/pull/200 for more details + # + # We don't currently add the label delete permission here because we don't currently + # use this in OnCall role_mapping: RoleMapping = { LegacyAccessControlRole.NONE: [], - LegacyAccessControlRole.VIEWER: [], - LegacyAccessControlRole.EDITOR: [], - LegacyAccessControlRole.ADMIN: [], + LegacyAccessControlRole.VIEWER: [ + RBACPermission.Permissions.LABEL_READ, + ], + LegacyAccessControlRole.EDITOR: [ + RBACPermission.Permissions.LABEL_READ, + RBACPermission.Permissions.LABEL_WRITE, + RBACPermission.Permissions.LABEL_CREATE, + ], + LegacyAccessControlRole.ADMIN: [ + RBACPermission.Permissions.LABEL_READ, + RBACPermission.Permissions.LABEL_WRITE, + RBACPermission.Permissions.LABEL_CREATE, + ], } all_permission_classes: typing.Dict[str, LegacyAccessControlCompatiblePermission] = { @@ -377,7 +397,7 @@ class PluginJSON(typing.TypedDict): action = permission["action"] permission_class = None - if action.startswith(ACTION_PREFIX): + if action.startswith(PluginID.ONCALL): permission_class = all_permission_classes[action] if permission_class: @@ -1093,3 +1113,13 @@ def _make_user_notification_bundle(user, notification_channel, important=False, ) return _make_user_notification_bundle + + +@pytest.fixture +def make_declared_incident(): + def _make_declared_incident(incident_id, organization, channel_filter): + return DeclaredIncidentFactory( + incident_id=incident_id, organization=organization, channel_filter=channel_filter + ) + + return _make_declared_incident diff --git a/engine/settings/base.py b/engine/settings/base.py index 9f7f64184d..4f0859f0f2 100644 --- a/engine/settings/base.py +++ b/engine/settings/base.py @@ -8,7 +8,7 @@ from firebase_admin import credentials, initialize_app from common.api_helpers.custom_ratelimit import getenv_custom_ratelimit -from common.utils import getenv_boolean, getenv_integer, getenv_list +from common.utils import getenv_boolean, getenv_float, getenv_integer, 
getenv_list VERSION = "dev-oss" SEND_ANONYMOUS_USAGE_STATS = getenv_boolean("SEND_ANONYMOUS_USAGE_STATS", default=True) @@ -75,6 +75,7 @@ FEATURE_ALERT_GROUP_SEARCH_ENABLED = getenv_boolean("FEATURE_ALERT_GROUP_SEARCH_ENABLED", default=True) FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS = getenv_integer("FEATURE_ALERT_GROUP_SEARCH_CUTOFF_DAYS", default=None) FEATURE_NOTIFICATION_BUNDLE_ENABLED = getenv_boolean("FEATURE_NOTIFICATION_BUNDLE_ENABLED", default=True) +FEATURE_DECLARE_INCIDENT_STEP_ENABLED = getenv_boolean("FEATURE_DECLARE_INCIDENT_STEP_ENABLED", default=False) TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID") TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET") @@ -206,7 +207,6 @@ class DatabaseTypes: ALERT_GROUPS_DISABLE_PREFER_ORDERING_INDEX = DATABASE_TYPE == DatabaseTypes.MYSQL and getenv_boolean( "ALERT_GROUPS_DISABLE_PREFER_ORDERING_INDEX", default=False ) -ALERT_GROUP_LIST_TRY_PREFETCH = getenv_boolean("ALERT_GROUP_LIST_TRY_PREFETCH", default=False) # Redis REDIS_USERNAME = os.getenv("REDIS_USERNAME", "") @@ -835,7 +835,7 @@ class BrokerTypes: JINJA_RESULT_MAX_LENGTH = os.getenv("JINJA_RESULT_MAX_LENGTH", 50000) # Log inbound/outbound calls as slow=1 if they exceed threshold -SLOW_THRESHOLD_SECONDS = 2.0 +SLOW_THRESHOLD_SECONDS = getenv_float("SLOW_THRESHOLD_SECONDS", 2.0) # Email messaging backend EMAIL_BACKEND = "django.core.mail.backends.smtp.EmailBackend" diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index ed58be1ab5..29309a7196 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -94,6 +94,7 @@ # CRITICAL "apps.alerts.tasks.acknowledge_reminder.acknowledge_reminder_task": {"queue": "critical"}, "apps.alerts.tasks.acknowledge_reminder.unacknowledge_timeout_task": {"queue": "critical"}, + "apps.alerts.tasks.declare_incident.declare_incident": {"queue": "critical"}, "apps.alerts.tasks.distribute_alert.send_alert_create_signal": {"queue": 
"critical"}, "apps.alerts.tasks.escalate_alert_group.escalate_alert_group": {"queue": "critical"}, "apps.alerts.tasks.invite_user_to_join_incident.invite_user_to_join_incident": {"queue": "critical"}, diff --git a/grafana-plugin/CHANGELOG.md b/grafana-plugin/CHANGELOG.md index f66f1b73de..585b848c9a 100644 --- a/grafana-plugin/CHANGELOG.md +++ b/grafana-plugin/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## [1.9.28](https://github.com/grafana/irm/compare/grafana-oncall-app-v1.9.27...grafana-oncall-app-v1.9.28) (2024-10-01) + + +### Bug Fixes + +* disallow oncall schedule rotation layer/overrides CUD form submissions more than once ([#193](https://github.com/grafana/irm/issues/193)) ([73ae1c7](https://github.com/grafana/irm/commit/73ae1c7d78474b42b9eb4305416828afeb04fa3a)) + + +### Miscellaneous Chores + +* implement merged IRM module.tsx ([#182](https://github.com/grafana/irm/issues/182)) ([995b573](https://github.com/grafana/irm/commit/995b5732493aabc226cd62b9ca52a1e582ef5878)) + ## [1.9.27](https://github.com/grafana/irm/compare/grafana-oncall-app-v1.9.26...grafana-oncall-app-v1.9.27) (2024-09-26) diff --git a/grafana-plugin/package.json b/grafana-plugin/package.json index c4a9b55d26..a960904928 100644 --- a/grafana-plugin/package.json +++ b/grafana-plugin/package.json @@ -1,6 +1,6 @@ { "name": "grafana-oncall-app", - "version": "1.9.27", + "version": "1.9.28", "description": "Grafana OnCall Plugin", "scripts": { "lint": "eslint --ext .js,.jsx,.ts,.tsx --max-warnings=20 ./src ./e2e-tests", diff --git a/grafana-plugin/src/components/Policy/EscalationPolicy.tsx b/grafana-plugin/src/components/Policy/EscalationPolicy.tsx index 7f341eb94c..aee19dad1d 100644 --- a/grafana-plugin/src/components/Policy/EscalationPolicy.tsx +++ b/grafana-plugin/src/components/Policy/EscalationPolicy.tsx @@ -131,6 +131,8 @@ class _EscalationPolicy extends React.Component { return this.renderNumAlertsInWindow(); case 'num_minutes_in_window': return this.renderNumMinutesInWindowOptions(); + 
case 'severity': + return this.renderSeverities(); default: console.warn('Unknown escalation step placeholder'); return ''; @@ -248,6 +250,34 @@ class _EscalationPolicy extends React.Component { ); } + renderSeverities() { + const { + data, + isDisabled, + theme, + store: { escalationPolicyStore }, + } = this.props; + const styles = getEscalationPolicyStyles(theme); + const { severity } = data; + + return ( + +