diff --git a/instance/management/commands/kill_zombies.py b/instance/management/commands/kill_zombies.py index d7b52178c..e3610067a 100644 --- a/instance/management/commands/kill_zombies.py +++ b/instance/management/commands/kill_zombies.py @@ -71,16 +71,18 @@ def handle(self, *args, **options): if not nova_servers: self.log('No servers found in region {}.'.format(self.region)) - return + return None self.log('Found {} unterminated servers in region {}.'.format(len(nova_servers), self.region)) # Scan each server for the zombieness. death_count = sum(1 for srv in nova_servers if self.not_zombie_or_die(srv)) if self.dry_run: - self.log("Would have terminated {} zombies if this weren't a dry run.".format(death_count)) + result = "Would have terminated {} zombies if this weren't a dry run.".format(death_count) else: - self.log("Terminated {} zombies.".format(death_count)) + result = "Terminated {} zombies.".format(death_count) + self.log(result) + return result def not_zombie_or_die(self, nova_server): """ diff --git a/instance/tasks.py b/instance/tasks.py index 56fbbe836..f2e03327c 100644 --- a/instance/tasks.py +++ b/instance/tasks.py @@ -23,11 +23,13 @@ # Imports ##################################################################### from datetime import datetime -import logging from typing import Optional, List +import logging from django.conf import settings from django.contrib.auth import get_user_model +from django.core.management import call_command, CommandError +from django.core.mail import EmailMessage from django.db import connection from django.db.models import Q from django.utils import timezone @@ -47,6 +49,9 @@ logger = logging.getLogger(__name__) +# Constants ################################################################## + +KILL_ZOMBIES_CRON_SCHEDULE = settings.KILL_ZOMBIES_SCHEDULE.split() # Tasks ####################################################################### @@ -339,3 +344,115 @@ def cleanup_old_betatest_users(): users_to_delete = queryset.filter(is_active=False, profile__modified__lte=delete_cutoff) logger.info('Deleting %s users.', users_to_delete.count()) users_to_delete.delete() + + +class KillZombiesRunner: + """ + Helper class to run `kill_zombies_periodically` + """ + + def __init__(self): + self.region: str = settings.OPENSTACK_REGION + self.recipient: list = [email for _, email in settings.ADMINS] + self.threshold: int = settings.KILL_ZOMBIES_WARNING_THRESHOLD + + def send_email(self, subject: str, body: str) -> int: + """ + Utility method for sending emails + """ + if self.recipient: + email = EmailMessage( + subject, body, settings.DEFAULT_FROM_EMAIL, self.recipient + ) + email.send() + else: + logging.info("Email recipient is undefined. Skipping email.") + + def trigger_warning(self, num_zombies): + """ + Warns when more than `threshold` VMs will be terminated + """ + subject = "Zombie instances are over the current threshold ({})".format( + self.threshold + ) + body = ( + "The number of zombie OpenStack VMs ({}) in the {} region " + "is over the KILL_ZOMBIES_WARNING_THRESHOLD ({}).\n" + "These instances will be terminated using the `kill_zombies` command." + ).format(num_zombies, self.region, self.threshold) + logging.info( + "%s\nSending an email notification to: %s", + body, self.recipient + ) + self.send_email(subject, body) + + def get_zombie_servers_count(self, stdout: str) -> int: + """ + If there are servers to terminate, a `kill_zombies` dry run will warn: + "Would have terminated {} zombies" to stdout. + """ + num_instances = 0 + if "No servers found in region" in stdout: + return num_instances + try: + num_instances = int(stdout.split("Would have terminated")[1].split()[0]) + except (IndexError, ValueError): + logger.info("Received unexpected input from dry run. Defaulting to zero") + return num_instances + + def run(self): + """ + Main method for running the task + """ + try: + dry_run_output, output = "", "" + dry_run_output = call_command( + "kill_zombies", + region=self.region, + dry_run=True, + ) + num_of_zombies = self.get_zombie_servers_count(dry_run_output) + if num_of_zombies > self.threshold: + self.trigger_warning(num_of_zombies) + if num_of_zombies != 0: + output = call_command("kill_zombies", region=self.region) + logging.info( + "Task `kill_zombies_periodically` ran successfully " + "and terminated %s zombies", num_of_zombies + ) + else: + logging.info( + "Found zero zombies to terminate. " + "Task `kill_zombies_periodically` ran successfully." + ) + except CommandError: + logger.error( + "Task `kill_zombies` command failed. Sending notification email" + ) + subject = "Terminate zombie instances command failed" + body = ( + "Scheduled execution of `kill_zombies` command failed. " + "Log entries are displayed below:" + "%s\n%s" + ) % (dry_run_output, output) + self.send_email(subject, body) + return False + return True + + +if settings.KILL_ZOMBIES_ENABLED: + @db_periodic_task(crontab( + minute=KILL_ZOMBIES_CRON_SCHEDULE[0], + hour=KILL_ZOMBIES_CRON_SCHEDULE[1], + day=KILL_ZOMBIES_CRON_SCHEDULE[2], + month=KILL_ZOMBIES_CRON_SCHEDULE[3], + day_of_week=KILL_ZOMBIES_CRON_SCHEDULE[4], + )) + def kill_zombies_task(): + """ + A wrapper for `kill_zombies_periodically` so it only + exists when KILL_ZOMBIES_ENABLED is true + """ + task_runner = KillZombiesRunner() + logging.info("Executing periodic task `kill_zombies_periodically`") + task_runner.run() diff --git a/instance/tests/management/test_kill_zombies.py b/instance/tests/management/test_kill_zombies.py new file mode 100644 index 000000000..1437a3e29 --- /dev/null +++ b/instance/tests/management/test_kill_zombies.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +# +# OpenCraft -- tools to aid developing and hosting free software projects +# Copyright (C) 2015-2019 OpenCraft +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +""" +Tests for the 'kill_zombies' management command. +""" +# Imports ##################################################################### + +from unittest.mock import patch, call +from django.core import mail +from django.core.management import CommandError +from django.test import TestCase, override_settings +from instance.tasks import KillZombiesRunner + +# Tests ####################################################################### + +# pylint: disable=bad-continuation + + +@override_settings( + KILL_ZOMBIES_ENABLED=True, + ADMINS=( + ("admin1", "admin1@localhost"), + ("admin2", "admin2@localhost"), + ), + OPENSTACK_REGION="some-region" +) +class KillZombiesPeriodicallyTestCase(TestCase): + """ + Test cases for the `kill_zombies_task` periodic task. + """ + @override_settings(KILL_ZOMBIES_ENABLED=False) + @patch("instance.tasks.call_command") + def test_toggle_task_false(self, mock_command): + """ + Tests that the task can't be run if disabled + through env variables + """ + with self.assertRaises(ImportError): + from instance.tasks import kill_zombies_task + kill_zombies_task() + self.assertFalse(mock_command.called) + + @patch("instance.tasks.call_command") + def test_set_env_variables(self, mock_command): + """ + Tests that the task passes env variables to + the kill_zombies command call + """ + test_runner = KillZombiesRunner() + test_runner.run() + first_call = call.mock_command("kill_zombies", region="some-region", dry_run=True) + self.assertEqual(mock_command.mock_calls[0], first_call) + mock_command.assert_called_with("kill_zombies", region="some-region") + + @patch.object(KillZombiesRunner, "trigger_warning") + def test_get_zombie_servers_count(self, mock_trigger): + """ + Tests that KillZombiesRunner.get_zombie_servers_count + works as expected + """ + test_runner = KillZombiesRunner() + no_instances = """ + 2021-02-26 14:16:12 | Starting kill_zombies + 2021-02-26 14:16:12 | No servers found in region some-region. + """ + some_instances = """ + 2021-02-26 14:16:12 | Starting kill_zombies + 2021-02-26 14:16:12 | Found 32 unterminated servers in region some-region. + 2021-02-26 14:16:12 | Would have terminated 32 zombies if this weren't a dry run. + """ + bad_output = "qwerty" + self.assertEqual(test_runner.get_zombie_servers_count(no_instances), 0) + self.assertEqual(test_runner.get_zombie_servers_count(some_instances), 32) + self.assertEqual(test_runner.get_zombie_servers_count(bad_output), 0) + self.assertEqual(test_runner.get_zombie_servers_count(""), 0) + + @override_settings(KILL_ZOMBIES_WARNING_THRESHOLD=3) + @patch.object(KillZombiesRunner, "get_zombie_servers_count", return_value=4) + @patch.object(KillZombiesRunner, "trigger_warning") + @patch("instance.tasks.call_command") + def test_trigger_warning_if_over_threshold( + self, + mock_get_zombie_servers_count, + mock_trigger_warning, + mock_command + ): + """ + Tests that trigger_warning is called when the number + of zombies to delete is over the threshold + """ + test_runner = KillZombiesRunner() + test_runner.run() + self.assertTrue(mock_get_zombie_servers_count.called) + self.assertTrue(mock_trigger_warning.called) + + @patch.object(KillZombiesRunner, "get_zombie_servers_count", return_value=0) + @patch("instance.tasks.call_command") + def test_do_nothing_if_zero( + self, + mock_get_zombie_servers_count, + mock_command + ): + """ + Tests that only a dry_run is executed when + the number of zombies to terminate is zero. + """ + test_runner = KillZombiesRunner() + test_runner.run() + mock_get_zombie_servers_count.assert_called_with( + "kill_zombies", + region="some-region", + dry_run=True + ) + + @override_settings( + DEFAULT_FROM_EMAIL="from@site.com", + ADMINS=(("admin3", "admin3@localhost"), ("admin4", "admin4@localhost")) + ) + @patch("instance.tasks.call_command") + def test_send_warning_email(self, mock_command): + """ + Tests that trigger_warning sends an email when called + """ + test_runner = KillZombiesRunner() + test_runner.trigger_warning(5) + self.assertEqual(len(mail.outbox), 1) + self.assertIn( + "is over the KILL_ZOMBIES_WARNING_THRESHOLD", + mail.outbox[0].body + ) + self.assertEqual(mail.outbox[0].from_email, "from@site.com") + self.assertEqual(mail.outbox[0].to, ["admin3@localhost", "admin4@localhost"]) + + @override_settings( + DEFAULT_FROM_EMAIL="from@site.com", + ADMINS=(("admin3", "admin3@localhost"),) + ) + @patch.object(KillZombiesRunner, "get_zombie_servers_count", return_value=4) + @patch.object(KillZombiesRunner, "trigger_warning") + @patch("instance.tasks.call_command") + def test_send_email_on_failure( + self, + mock_get_zombie_servers_count, + mock_trigger_warning, + mock_command + ): + """ + When call_command fails, it raises a CommandError. + Tests that an email is sent when CommandError is raised + """ + mock_command.side_effect = CommandError() + test_runner = KillZombiesRunner() + test_runner.run() + self.assertEqual(len(mail.outbox), 1) + self.assertIn( + "Scheduled execution of `kill_zombies` command failed", + mail.outbox[0].body + ) + self.assertEqual(mail.outbox[0].from_email, "from@site.com") + self.assertEqual(mail.outbox[0].to, ["admin3@localhost"]) diff --git a/opencraft/settings.py b/opencraft/settings.py index 79d50f0fd..48a259f36 100644 --- a/opencraft/settings.py +++ b/opencraft/settings.py @@ -910,6 +910,12 @@ # Format is ' ' like normal crontabs TRIAL_INSTANCES_REPORT_SCHEDULE = env('TRIAL_INSTANCES_REPORT_SCHEDULE', default='0 2 1 * *') +# Kill Zombies Periodic task ################################################# + +KILL_ZOMBIES_ENABLED = env("KILL_ZOMBIES_ENABLED", default=False) +KILL_ZOMBIES_SCHEDULE = env("KILL_ZOMBIES_SCHEDULE", default="0 0 * * *") +KILL_ZOMBIES_WARNING_THRESHOLD = env("KILL_ZOMBIES_WARNING_THRESHOLD", default=10) + # User inactive and delete #################################################### CLEANUP_OLD_BETATEST_USERS = env('CLEANUP_OLD_BETATEST_USERS', default=True)