Skip to content

Commit

Permalink
HostMetrics: Hard auto-cleanup (ansible#14255)
Browse files Browse the repository at this point in the history
Fix host metric settings

Cleanup_host_metric command with default params

Fix order of host metric cleanups
  • Loading branch information
slemrmartin authored and djyasin committed Sep 11, 2024
1 parent e76f632 commit 8e6b1d1
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 68 deletions.
24 changes: 12 additions & 12 deletions awx/main/management/commands/cleanup_host_metrics.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
from awx.main.models import HostMetric
from django.core.management.base import BaseCommand
from django.conf import settings
from awx.main.tasks.host_metrics import HostMetricTask


class Command(BaseCommand):
"""
Run soft-deleting of HostMetrics
This command provides cleanup task for HostMetric model.
There are two modes, which run in following order:
- soft cleanup
- - Perform soft-deletion of all host metrics last automated 12 months ago or before.
This is the same as issuing a DELETE request to /api/v2/host_metrics/N/ for all host metrics that match the criteria.
- - updates columns delete, deleted_counter and last_deleted
- hard cleanup
- - Permanently erase from the database all host metrics last automated 36 months ago or before.
This operation happens after the soft deletion has finished.
"""

help = 'Run soft-deleting of HostMetrics'

def add_arguments(self, parser):
parser.add_argument('--months-ago', type=int, dest='months-ago', action='store', help='Threshold in months for soft-deleting')
help = 'Run soft and hard-deletion of HostMetrics'

def handle(self, *args, **options):
months_ago = options.get('months-ago') or None

if not months_ago:
months_ago = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)

HostMetric.cleanup_task(months_ago)
HostMetricTask().cleanup(soft_threshold=settings.CLEANUP_HOST_METRICS_SOFT_THRESHOLD, hard_threshold=settings.CLEANUP_HOST_METRICS_HARD_THRESHOLD)
18 changes: 0 additions & 18 deletions awx/main/models/inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import os.path
from urllib.parse import urljoin

import dateutil.relativedelta
import yaml

# Django
Expand Down Expand Up @@ -890,23 +889,6 @@ def soft_restore(self):
self.deleted = False
self.save(update_fields=['deleted'])

@classmethod
def cleanup_task(cls, months_ago):
try:
months_ago = int(months_ago)
if months_ago <= 0:
raise ValueError()

last_automation_before = now() - dateutil.relativedelta.relativedelta(months=months_ago)

logger.info(f'cleanup_host_metrics: soft-deleting records last automated before {last_automation_before}')
HostMetric.active_objects.filter(last_automation__lt=last_automation_before).update(
deleted=True, deleted_counter=models.F('deleted_counter') + 1, last_deleted=now()
)
settings.CLEANUP_HOST_METRICS_LAST_TS = now()
except (TypeError, ValueError):
logger.error(f"cleanup_host_metrics: months_ago({months_ago}) has to be a positive integer value")


class HostMetricSummaryMonthly(models.Model):
"""
Expand Down
10 changes: 10 additions & 0 deletions awx/main/tasks/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from django.utils.timezone import now
from rest_framework.fields import DateTimeField


def is_run_threshold_reached(setting, threshold_seconds):
last_time = DateTimeField().to_internal_value(setting) if setting else None
if not last_time:
return True
else:
return (now() - last_time).total_seconds() > threshold_seconds
75 changes: 66 additions & 9 deletions awx/main/tasks/host_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,90 @@
import logging

from django.conf import settings
from django.db.models import Count
from django.db.models import Count, F
from django.db.models.functions import TruncMonth
from django.utils.timezone import now
from rest_framework.fields import DateTimeField
from awx.main.dispatch import get_task_queuename
from awx.main.dispatch.publish import task
from awx.main.models.inventory import HostMetric, HostMetricSummaryMonthly
from awx.main.tasks.helpers import is_run_threshold_reached
from awx.conf.license import get_license

logger = logging.getLogger('awx.main.tasks.host_metric_summary_monthly')
logger = logging.getLogger('awx.main.tasks.host_metrics')


@task(queue=get_task_queuename)
def cleanup_host_metrics():
if is_run_threshold_reached(getattr(settings, 'CLEANUP_HOST_METRICS_LAST_TS', None), getattr(settings, 'CLEANUP_HOST_METRICS_INTERVAL', 30) * 86400):
logger.info(f"Executing cleanup_host_metrics, last ran at {getattr(settings, 'CLEANUP_HOST_METRICS_LAST_TS', '---')}")
HostMetricTask().cleanup(
soft_threshold=getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12),
hard_threshold=getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36),
)
logger.info("Finished cleanup_host_metrics")


@task(queue=get_task_queuename)
def host_metric_summary_monthly():
"""Run cleanup host metrics summary monthly task each week"""
if _is_run_threshold_reached(
getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', None), getattr(settings, 'HOST_METRIC_SUMMARY_TASK_INTERVAL', 7) * 86400
):
if is_run_threshold_reached(getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', None), getattr(settings, 'HOST_METRIC_SUMMARY_TASK_INTERVAL', 7) * 86400):
logger.info(f"Executing host_metric_summary_monthly, last ran at {getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', '---')}")
HostMetricSummaryMonthlyTask().execute()
logger.info("Finished host_metric_summary_monthly")


def _is_run_threshold_reached(setting, threshold_seconds):
last_time = DateTimeField().to_internal_value(setting) if setting else DateTimeField().to_internal_value('1970-01-01')
class HostMetricTask:
"""
This class provides cleanup task for HostMetric model.
There are two modes:
- soft cleanup (updates columns delete, deleted_counter and last_deleted)
- hard cleanup (deletes from the db)
"""

def cleanup(self, soft_threshold=None, hard_threshold=None):
"""
Main entrypoint, runs either soft cleanup, hard cleanup or both
:param soft_threshold: (int)
:param hard_threshold: (int)
"""
if hard_threshold is not None:
self.hard_cleanup(hard_threshold)
if soft_threshold is not None:
self.soft_cleanup(soft_threshold)

settings.CLEANUP_HOST_METRICS_LAST_TS = now()

@staticmethod
def soft_cleanup(threshold=None):
if threshold is None:
threshold = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)

try:
threshold = int(threshold)
except (ValueError, TypeError) as e:
raise type(e)("soft_threshold has to be convertible to number") from e

last_automation_before = now() - relativedelta(months=threshold)
rows = HostMetric.active_objects.filter(last_automation__lt=last_automation_before).update(
deleted=True, deleted_counter=F('deleted_counter') + 1, last_deleted=now()
)
logger.info(f'cleanup_host_metrics: soft-deleted records last automated before {last_automation_before}, affected rows: {rows}')

@staticmethod
def hard_cleanup(threshold=None):
if threshold is None:
threshold = getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36)

try:
threshold = int(threshold)
except (ValueError, TypeError) as e:
raise type(e)("hard_threshold has to be convertible to number") from e

return (now() - last_time).total_seconds() > threshold_seconds
last_deleted_before = now() - relativedelta(months=threshold)
queryset = HostMetric.objects.filter(deleted=True, last_deleted__lt=last_deleted_before)
rows = queryset.delete()
logger.info(f'cleanup_host_metrics: hard-deleted records which were soft deleted before {last_deleted_before}, affected rows: {rows[0]}')


class HostMetricSummaryMonthlyTask:
Expand Down
29 changes: 2 additions & 27 deletions awx/main/tasks/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
Inventory,
SmartInventoryMembership,
Job,
HostMetric,
convert_jsonfields,
)
from awx.main.constants import ACTIVE_STATES
Expand All @@ -64,6 +63,7 @@

from awx.main.utils.reload import stop_local_services
from awx.main.utils.pglock import advisory_lock
from awx.main.tasks.helpers import is_run_threshold_reached
from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper, write_receptor_config
from awx.main.consumers import emit_channel_notification
from awx.main import analytics
Expand Down Expand Up @@ -368,9 +368,7 @@ def send_notifications(notification_list, job_id=None):

@task(queue=get_task_queuename)
def gather_analytics():
from awx.conf.models import Setting

if is_run_threshold_reached(Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_GATHER').first(), settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL):
if is_run_threshold_reached(getattr(settings, 'AUTOMATION_ANALYTICS_LAST_GATHER', None), settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL):
analytics.gather()


Expand Down Expand Up @@ -427,29 +425,6 @@ def cleanup_images_and_files():
_cleanup_images_and_files()


@task(queue=get_task_queuename)
def cleanup_host_metrics():
"""Run cleanup host metrics ~each month"""
# TODO: move whole method to host_metrics in follow-up PR
from awx.conf.models import Setting

if is_run_threshold_reached(
Setting.objects.filter(key='CLEANUP_HOST_METRICS_LAST_TS').first(), getattr(settings, 'CLEANUP_HOST_METRICS_INTERVAL', 30) * 86400
):
months_ago = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)
logger.info("Executing cleanup_host_metrics")
HostMetric.cleanup_task(months_ago)
logger.info("Finished cleanup_host_metrics")


def is_run_threshold_reached(setting, threshold_seconds):
from rest_framework.fields import DateTimeField

last_time = DateTimeField().to_internal_value(setting.value) if setting and setting.value else DateTimeField().to_internal_value('1970-01-01')

return (now() - last_time).total_seconds() > threshold_seconds


@task(queue=get_task_queuename)
def cluster_node_health_check(node):
"""
Expand Down
78 changes: 78 additions & 0 deletions awx/main/tests/functional/commands/test_cleanup_host_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest

from awx.main.tasks.host_metrics import HostMetricTask
from awx.main.models.inventory import HostMetric
from awx.main.tests.factories.fixtures import mk_host_metric
from dateutil.relativedelta import relativedelta
from django.conf import settings
from django.utils import timezone


@pytest.mark.django_db
def test_no_host_metrics():
"""No-crash test"""
assert HostMetric.objects.count() == 0
HostMetricTask().cleanup(soft_threshold=0, hard_threshold=0)
HostMetricTask().cleanup(soft_threshold=24, hard_threshold=42)
assert HostMetric.objects.count() == 0


@pytest.mark.django_db
def test_delete_exception():
"""Crash test"""
with pytest.raises(ValueError):
HostMetricTask().soft_cleanup("")
with pytest.raises(TypeError):
HostMetricTask().hard_cleanup(set())


@pytest.mark.django_db
@pytest.mark.parametrize('threshold', [settings.CLEANUP_HOST_METRICS_SOFT_THRESHOLD, 20])
def test_soft_delete(threshold):
"""Metrics with last_automation < threshold are updated to deleted=True"""
mk_host_metric('host_1', first_automation=ago(months=1), last_automation=ago(months=1), deleted=False)
mk_host_metric('host_2', first_automation=ago(months=1), last_automation=ago(months=1), deleted=True)
mk_host_metric('host_3', first_automation=ago(months=1), last_automation=ago(months=threshold, hours=-1), deleted=False)
mk_host_metric('host_4', first_automation=ago(months=1), last_automation=ago(months=threshold, hours=-1), deleted=True)
mk_host_metric('host_5', first_automation=ago(months=1), last_automation=ago(months=threshold, hours=1), deleted=False)
mk_host_metric('host_6', first_automation=ago(months=1), last_automation=ago(months=threshold, hours=1), deleted=True)
mk_host_metric('host_7', first_automation=ago(months=1), last_automation=ago(months=42), deleted=False)
mk_host_metric('host_8', first_automation=ago(months=1), last_automation=ago(months=42), deleted=True)

assert HostMetric.objects.count() == 8
assert HostMetric.active_objects.count() == 4

for i in range(2):
HostMetricTask().cleanup(soft_threshold=threshold)
assert HostMetric.objects.count() == 8

hostnames = set(HostMetric.objects.filter(deleted=False).order_by('hostname').values_list('hostname', flat=True))
assert hostnames == {'host_1', 'host_3'}


@pytest.mark.django_db
@pytest.mark.parametrize('threshold', [settings.CLEANUP_HOST_METRICS_HARD_THRESHOLD, 20])
def test_hard_delete(threshold):
"""Metrics with last_deleted < threshold and deleted=True are deleted from the db"""
mk_host_metric('host_1', first_automation=ago(months=1), last_deleted=ago(months=1), deleted=False)
mk_host_metric('host_2', first_automation=ago(months=1), last_deleted=ago(months=1), deleted=True)
mk_host_metric('host_3', first_automation=ago(months=1), last_deleted=ago(months=threshold, hours=-1), deleted=False)
mk_host_metric('host_4', first_automation=ago(months=1), last_deleted=ago(months=threshold, hours=-1), deleted=True)
mk_host_metric('host_5', first_automation=ago(months=1), last_deleted=ago(months=threshold, hours=1), deleted=False)
mk_host_metric('host_6', first_automation=ago(months=1), last_deleted=ago(months=threshold, hours=1), deleted=True)
mk_host_metric('host_7', first_automation=ago(months=1), last_deleted=ago(months=42), deleted=False)
mk_host_metric('host_8', first_automation=ago(months=1), last_deleted=ago(months=42), deleted=True)

assert HostMetric.objects.count() == 8
assert HostMetric.active_objects.count() == 4

for i in range(2):
HostMetricTask().cleanup(hard_threshold=threshold)
assert HostMetric.objects.count() == 6

hostnames = set(HostMetric.objects.order_by('hostname').values_list('hostname', flat=True))
assert hostnames == {'host_1', 'host_2', 'host_3', 'host_4', 'host_5', 'host_7'}


def ago(months=0, hours=0):
return timezone.now() - relativedelta(months=months, hours=hours)
4 changes: 2 additions & 2 deletions awx/settings/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@
'receptor_reaper': {'task': 'awx.main.tasks.system.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)},
'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)},
'cleanup_images': {'task': 'awx.main.tasks.system.cleanup_images_and_files', 'schedule': timedelta(hours=3)},
'cleanup_host_metrics': {'task': 'awx.main.tasks.system.cleanup_host_metrics', 'schedule': timedelta(hours=3, minutes=30)},
'cleanup_host_metrics': {'task': 'awx.main.tasks.host_metrics.cleanup_host_metrics', 'schedule': timedelta(hours=3, minutes=30)},
'host_metric_summary_monthly': {'task': 'awx.main.tasks.host_metrics.host_metric_summary_monthly', 'schedule': timedelta(hours=4)},
}

Expand Down Expand Up @@ -1049,7 +1049,7 @@
# - 'unique_managed_hosts': Compliant = automated - deleted hosts (using /api/v2/host_metrics/)
SUBSCRIPTION_USAGE_MODEL = ''

# Host metrics cleanup - last time of the cleanup run (soft-deleting records)
# Host metrics cleanup - last time of the task/command run
CLEANUP_HOST_METRICS_LAST_TS = None
# Host metrics cleanup - minimal interval between two cleanups in days
CLEANUP_HOST_METRICS_INTERVAL = 30 # days
Expand Down

0 comments on commit 8e6b1d1

Please sign in to comment.