diff --git a/engine/apps/alerts/tasks/notify_user.py b/engine/apps/alerts/tasks/notify_user.py index 7ad2bb4986..88f46f6e9c 100644 --- a/engine/apps/alerts/tasks/notify_user.py +++ b/engine/apps/alerts/tasks/notify_user.py @@ -26,6 +26,9 @@ from apps.user_management.models import User +RETRY_TIMEOUT_HOURS = 1 + + def schedule_send_bundled_notification_task( user_notification_bundle: "UserNotificationBundle", alert_group: "AlertGroup" ): @@ -445,10 +448,29 @@ def perform_notification(log_record_pk, use_default_notification_policy_fallback try: TelegramToUserConnector.notify_user(user, alert_group, notification_policy) except RetryAfter as e: - countdown = getattr(e, "retry_after", 3) - raise perform_notification.retry( - (log_record_pk, use_default_notification_policy_fallback), countdown=countdown, exc=e - ) + task_logger.exception(f"Telegram API rate limit exceeded. Retry after {e.retry_after} seconds.") + # check how much time has passed since log record was created + # to prevent eternal loop of restarting perform_notification task + if timezone.now() < log_record.created_at + timezone.timedelta(hours=RETRY_TIMEOUT_HOURS): + countdown = getattr(e, "retry_after", 3) + perform_notification.apply_async( + (log_record_pk, use_default_notification_policy_fallback), countdown=countdown + ) + else: + task_logger.debug( + f"telegram notification for alert_group {alert_group.pk} failed because of rate limit" + ) + UserNotificationPolicyLogRecord( + author=user, + type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED, + notification_policy=notification_policy, + reason="Telegram rate limit exceeded", + alert_group=alert_group, + notification_step=notification_policy.step, + notification_channel=notification_channel, + notification_error_code=UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_IN_TELEGRAM_RATELIMIT, + ).save() + return elif notification_channel == UserNotificationPolicy.NotificationChannel.SLACK: # TODO: refactor checking the possibility of sending a notification in slack @@ -516,13 +538,12 @@ def perform_notification(log_record_pk, use_default_notification_policy_fallback ).save() return - retry_timeout_hours = 1 if alert_group.slack_message: alert_group.slack_message.send_slack_notification(user, alert_group, notification_policy) task_logger.debug(f"Finished send_slack_notification for alert_group {alert_group.pk}.") # check how much time has passed since log record was created # to prevent eternal loop of restarting perform_notification task - elif timezone.now() < log_record.created_at + timezone.timedelta(hours=retry_timeout_hours): + elif timezone.now() < log_record.created_at + timezone.timedelta(hours=RETRY_TIMEOUT_HOURS): task_logger.debug( f"send_slack_notification for alert_group {alert_group.pk} failed because slack message " f"does not exist. Restarting perform_notification." @@ -534,7 +555,7 @@ def perform_notification(log_record_pk, use_default_notification_policy_fallback else: task_logger.debug( f"send_slack_notification for alert_group {alert_group.pk} failed because slack message " - f"after {retry_timeout_hours} hours still does not exist" + f"after {RETRY_TIMEOUT_HOURS} hours still does not exist" ) UserNotificationPolicyLogRecord( author=user, diff --git a/engine/apps/alerts/tests/test_notify_user.py b/engine/apps/alerts/tests/test_notify_user.py index 7124f957d1..a24003dfe9 100644 --- a/engine/apps/alerts/tests/test_notify_user.py +++ b/engine/apps/alerts/tests/test_notify_user.py @@ -360,12 +360,30 @@ def test_perform_notification_telegram_retryafter_error( countdown = 15 exc = RetryAfter(countdown) with patch.object(TelegramToUserConnector, "notify_user", side_effect=exc) as mock_notify_user: - with pytest.raises(RetryAfter): + with patch.object(perform_notification, "apply_async") as mock_apply_async: perform_notification(log_record.pk, False) mock_notify_user.assert_called_once_with(user, alert_group, user_notification_policy) + # task is rescheduled using the countdown value from the exception + mock_apply_async.assert_called_once_with((log_record.pk, False), countdown=countdown) assert alert_group.personal_log_records.last() == log_record + # but if the log was too old, skip and create a failed log record + log_record.created_at = timezone.now() - timezone.timedelta(minutes=90) + log_record.save() + with patch.object(TelegramToUserConnector, "notify_user", side_effect=exc) as mock_notify_user: + with patch.object(perform_notification, "apply_async") as mock_apply_async: + perform_notification(log_record.pk, False) + mock_notify_user.assert_called_once_with(user, alert_group, user_notification_policy) + assert not mock_apply_async.called + last_log_record = UserNotificationPolicyLogRecord.objects.last() + assert last_log_record.type == UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED + assert last_log_record.reason == "Telegram rate limit exceeded" + assert ( + last_log_record.notification_error_code + == UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_IN_TELEGRAM_RATELIMIT + ) + @patch("apps.base.models.UserNotificationPolicy.get_default_fallback_policy") @patch("apps.base.tests.messaging_backend.TestOnlyBackend.notify_user") diff --git a/engine/apps/base/models/user_notification_policy_log_record.py b/engine/apps/base/models/user_notification_policy_log_record.py index 3d7ab44e42..80185a8454 100644 --- a/engine/apps/base/models/user_notification_policy_log_record.py +++ b/engine/apps/base/models/user_notification_policy_log_record.py @@ -106,7 +106,8 @@ class UserNotificationPolicyLogRecord(models.Model): ERROR_NOTIFICATION_TELEGRAM_USER_IS_DEACTIVATED, ERROR_NOTIFICATION_MOBILE_USER_HAS_NO_ACTIVE_DEVICE, ERROR_NOTIFICATION_FORMATTING_ERROR, - ) = range(29) + ERROR_NOTIFICATION_IN_TELEGRAM_RATELIMIT, + ) = range(30) # for this errors we want to send message to general log channel ERRORS_TO_SEND_IN_SLACK_CHANNEL = [ @@ -304,6 +305,10 @@ def render_log_line_action(self, for_slack=False, substitute_author_with_tag=Fal result += f"failed to notify {user_verbal} in Slack, because channel is archived" elif self.notification_error_code == UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_IN_SLACK_RATELIMIT: result += f"failed to notify {user_verbal} in Slack due to Slack rate limit" + elif ( + self.notification_error_code == UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_IN_TELEGRAM_RATELIMIT + ): + result += f"failed to notify {user_verbal} in Telegram due to Telegram rate limit" elif self.notification_error_code == UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_FORBIDDEN: result += f"failed to notify {user_verbal}, not allowed" elif (