From 85e2893bc7489d52383200e6d78482ca6920a367 Mon Sep 17 00:00:00 2001 From: Alexander Burmak Date: Sat, 11 Nov 2023 16:18:49 +0300 Subject: [PATCH 1/3] Reworked "ch-monitoring backup" command --- .pylintrc | 2 +- ch_tools/monrun_checks/ch_backup.py | 165 ++++++++++++++-------------- 2 files changed, 84 insertions(+), 83 deletions(-) diff --git a/.pylintrc b/.pylintrc index bf1ce78f..05c0513c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -121,7 +121,7 @@ argument-naming-style=snake_case # Regular expression matching correct argument names. Overrides argument- # naming-style. If left empty, argument names will be checked with the set # naming style. -argument-rgx=[a-z_][a-z0-9_]{0,30}$ +argument-rgx=[a-z_][a-z0-9_]{0,35}$ # Naming style matching correct attribute names. attr-naming-style=snake_case diff --git a/ch_tools/monrun_checks/ch_backup.py b/ch_tools/monrun_checks/ch_backup.py index 4a2a83dd..82968673 100644 --- a/ch_tools/monrun_checks/ch_backup.py +++ b/ch_tools/monrun_checks/ch_backup.py @@ -5,15 +5,14 @@ import json from datetime import datetime, timedelta, timezone from os.path import exists -from typing import Dict, List, Optional +from typing import Dict, List import click from dateutil.parser import parse as dateutil_parse -from ch_tools.common.backup import BackupConfig, get_backups +from ch_tools.common.backup import get_backups from ch_tools.common.cli.parameters import TimeSpanParamType from ch_tools.common.clickhouse.client import ClickhouseClient -from ch_tools.common.dbaas import DbaasConfig from ch_tools.monrun_checks.exceptions import die RESTORE_CONTEXT_PATH = "/tmp/ch_backup_restore_state.json" @@ -22,41 +21,79 @@ @click.command("backup") @click.option( - "--critical-failed", - "crit_failed", + "--failed-backup-count-crit", + "--failed-backup-count-crit-threshold", + "--critical-failed", # deprecated option name preserved for backward-compatibility + "failed_backup_count_crit_threshold", default=3, - help="Critical threshold for failed backups.", + help="Critical threshold on the number of failed backups. If last backups failed and its count equals or greater " + "than the specified threshold, a crit will be reported.", ) @click.option( - "--critical-absent", - "crit_absent", + "--backup-age-warn", + "--backup-age-warn-threshold", + "backup_age_warn_threshold", + default="1d", + type=TimeSpanParamType(), + help="Warning threshold on age of the last backup. If the last backup is created more than the specified " + "time ago, a warning will be reported.", +) +@click.option( + "--backup-age-crit", + "--backup-age-crit-threshold", + "--critical-absent", # deprecated option name preserved for backward-compatibility + "backup_age_crit_threshold", default="2d", type=TimeSpanParamType(), - help="Critical threshold for absent backups.", + help="Critical threshold on age of the last backup. If the last backup is created more than the specified " + "time ago, a crit will be reported.", ) -def backup_command(crit_failed, crit_absent): +@click.option( + "--backup-count-warn", + "--backup-count-warn-threshold", + "backup_count_warn_threshold", + default=0, + help="Warning threshold on the number of backups. If the number of backups of any status equals or greater than " + "the specified threshold, a warning will be reported.", +) +@click.option( + "--min-uptime", + "min_uptime", + default="2d", + type=TimeSpanParamType(), + help="Minimal ClickHouse uptime enough for backup creation.", +) +def backup_command( + failed_backup_count_crit_threshold, + backup_age_warn_threshold, + backup_age_crit_threshold, + backup_count_warn_threshold, + min_uptime, +): """ Check ClickHouse backups: its state, age and count. - Skip backup checks for recently created clusters. """ - dbaas_config = DbaasConfig.load() - - cluster_created_at = parse_str_datetime(dbaas_config.created_at) - if not check_now_pass_threshold(cluster_created_at): - return - ch_client = ClickhouseClient() - backup_config = BackupConfig.load() backups = get_backups() - check_restored_parts() + check_restored_data() + check_uptime(ch_client, min_uptime) check_valid_backups_exist(backups) - check_last_backup_not_failed(backups, crit=crit_failed) - check_backup_age(ch_client, backups, crit=crit_absent) - check_backup_count(backup_config, backups) + check_last_backup_not_failed(backups, failed_backup_count_crit_threshold) + check_backup_age(backups, backup_age_warn_threshold, backup_age_crit_threshold) + check_backup_count(backups, backup_count_warn_threshold) + + +def check_uptime(ch_client: ClickhouseClient, min_uptime: timedelta) -> None: + """ + Check that ClickHouse uptime enough for backup creation. + """ + uptime = ch_client.get_uptime() + if uptime < min_uptime: + die(0, "OK (forced due to low ClickHouse uptime)") -def check_valid_backups_exist(backups): +def check_valid_backups_exist(backups: List[Dict]) -> None: """ Check that valid backups exist. """ @@ -67,12 +104,10 @@ def check_valid_backups_exist(backups): die(2, "No valid backups found") -def check_last_backup_not_failed(backups, crit=3): +def check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> None: """ Check that the last backup is not failed. Its status must be 'created' or 'creating'. """ - # pylint: disable=chained-comparison - counter = 0 for i, backup in enumerate(backups): state = backup["state"] @@ -86,23 +121,21 @@ def check_last_backup_not_failed(backups, crit=3): if counter == 0: return - status = 2 if crit > 0 and counter >= crit else 1 - if counter > 1: - message = f"Last {counter} backups failed" - else: + if counter == 1: message = "Last backup failed" + else: + message = f"Last {counter} backups failed" + + status = 2 if crit_threshold and counter >= crit_threshold else 1 die(status, message) -def check_backup_age(ch_client, backups, *, age_threshold=timedelta(days=1), crit=None): +def check_backup_age( + backups: List[Dict], warn_threshold: timedelta, crit_threshold: timedelta +) -> None: """ Check that the last backup is not too old. """ - # To avoid false warnings the check is skipped if ClickHouse uptime is less then age threshold. - uptime = ch_client.get_uptime() - if uptime < age_threshold: - return - checking_backup: Dict[str, str] = {} for i, backup in enumerate(backups): state = backup["state"] @@ -114,35 +147,34 @@ def check_backup_age(ch_client, backups, *, age_threshold=timedelta(days=1), cri die(2, "Didn't find a backup to check") backup_age = get_backup_age(checking_backup) - if backup_age < age_threshold: + + if crit_threshold and backup_age >= crit_threshold: + status = 2 + elif warn_threshold and backup_age >= warn_threshold: + status = 1 + else: return if checking_backup["state"] == "creating": message = f"Last backup was started {backup_age.days} days ago" else: message = f"Last backup was created {backup_age.days} days ago" - status = 1 - if crit is not None and uptime >= crit and backup_age >= crit: - status = 2 + die(status, message) -def check_backup_count(config: BackupConfig, backups: List[Dict]) -> None: +def check_backup_count(backups: List[Dict], backup_count_warn_threshold: int) -> None: """ Check that the number of backups is not too large. """ - max_count = ( - config.retain_count + config.deduplication_age_limit.days + 2 - ) # backup-service backup retention delay + backup_count = len(backups) + if backup_count_warn_threshold and backup_count >= backup_count_warn_threshold: + die( + 1, f"Too many backups exist: {backup_count} > {backup_count_warn_threshold}" + ) - count = sum( - 1 for backup in backups if backup.get("labels", {}).get("user_initiator") - ) - if count > max_count: - die(1, f"Too many backups exist: {count} > {max_count}") - -def check_restored_parts() -> None: +def check_restored_data() -> None: """ Check count of failed parts on restore """ @@ -172,34 +204,3 @@ def get_backup_age(backup): """ backup_time = dateutil_parse(backup["start_time"]) return datetime.now(timezone.utc) - backup_time - - -def parse_str_datetime(value: str) -> Optional[datetime]: - """ - Parse input string to datetime. - """ - if value is None: - return None - - try: - # use dateutil.parse instead of datetime.strptime because strptime can't parse timezone with ":" on python 3.6 - return dateutil_parse(value) - except Exception: - return None - - -def check_now_pass_threshold( - date_time: Optional[datetime], hours_threshold: int = 25 -) -> bool: - """ - Check that hours threshold is passed since input date - """ - if date_time is None: - return True - - diff = datetime.now(date_time.tzinfo) - date_time - - days, seconds = diff.days, diff.seconds - diff_in_hours = days * 24 + seconds // (60 * 60) - - return diff_in_hours >= hours_threshold From 133355e819c870475d2cfddccff9efe82db9293f Mon Sep 17 00:00:00 2001 From: Alexander Burmak Date: Mon, 13 Nov 2023 10:55:19 +0300 Subject: [PATCH 2/3] Add suppression by load monitor flag file --- ch_tools/common/result.py | 7 +- ch_tools/monrun_checks/ch_backup.py | 135 +++++++++++++++++++--------- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/ch_tools/common/result.py b/ch_tools/common/result.py index 53d989a5..139bff4c 100644 --- a/ch_tools/common/result.py +++ b/ch_tools/common/result.py @@ -1,5 +1,10 @@ +OK = 0 +WARNING = 1 +CRIT = 2 + + class Result: - def __init__(self, code=0, message="Ok", verbose=""): + def __init__(self, code=OK, message="OK", verbose=""): self.code = code self.message = message self.verbose = verbose diff --git a/ch_tools/monrun_checks/ch_backup.py b/ch_tools/monrun_checks/ch_backup.py index 82968673..a5281aac 100644 --- a/ch_tools/monrun_checks/ch_backup.py +++ b/ch_tools/monrun_checks/ch_backup.py @@ -3,6 +3,8 @@ """ import json +import logging +import os.path from datetime import datetime, timedelta, timezone from os.path import exists from typing import Dict, List @@ -13,8 +15,9 @@ from ch_tools.common.backup import get_backups from ch_tools.common.cli.parameters import TimeSpanParamType from ch_tools.common.clickhouse.client import ClickhouseClient -from ch_tools.monrun_checks.exceptions import die +from ch_tools.common.result import CRIT, OK, WARNING, Result +LOAD_MONITOR_FLAG_PATH = "/tmp/load-monitor-userfault.flag" RESTORE_CONTEXT_PATH = "/tmp/ch_backup_restore_state.json" FAILED_PARTS_THRESHOLD = 10 @@ -73,38 +76,36 @@ def backup_command( """ Check ClickHouse backups: its state, age and count. """ - ch_client = ClickhouseClient() - backups = get_backups() - - check_restored_data() - check_uptime(ch_client, min_uptime) - check_valid_backups_exist(backups) - check_last_backup_not_failed(backups, failed_backup_count_crit_threshold) - check_backup_age(backups, backup_age_warn_threshold, backup_age_crit_threshold) - check_backup_count(backups, backup_count_warn_threshold) + backups, result = _get_backups() + + if result.code == OK: + result = _merge_results( + _check_valid_backups_exist(backups), + _check_last_backup_not_failed(backups, failed_backup_count_crit_threshold), + _check_backup_age( + backups, backup_age_warn_threshold, backup_age_crit_threshold + ), + _check_backup_count(backups, backup_count_warn_threshold), + _check_restored_data(), + ) + _suppress_if_required(result, min_uptime) -def check_uptime(ch_client: ClickhouseClient, min_uptime: timedelta) -> None: - """ - Check that ClickHouse uptime enough for backup creation. - """ - uptime = ch_client.get_uptime() - if uptime < min_uptime: - die(0, "OK (forced due to low ClickHouse uptime)") + return result -def check_valid_backups_exist(backups: List[Dict]) -> None: +def _check_valid_backups_exist(backups: List[Dict]) -> Result: """ Check that valid backups exist. """ for backup in backups: if backup["state"] == "created": - return + return Result(OK) - die(2, "No valid backups found") + return Result(CRIT, "No valid backups found") -def check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> None: +def _check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> Result: """ Check that the last backup is not failed. Its status must be 'created' or 'creating'. """ @@ -119,20 +120,20 @@ def check_last_backup_not_failed(backups: List[Dict], crit_threshold: int) -> No counter += 1 if counter == 0: - return + return Result(OK) if counter == 1: message = "Last backup failed" else: message = f"Last {counter} backups failed" - status = 2 if crit_threshold and counter >= crit_threshold else 1 - die(status, message) + status = CRIT if crit_threshold and counter >= crit_threshold else WARNING + return Result(status, message) -def check_backup_age( +def _check_backup_age( backups: List[Dict], warn_threshold: timedelta, crit_threshold: timedelta -) -> None: +) -> Result: """ Check that the last backup is not too old. """ @@ -144,37 +145,42 @@ def check_backup_age( break if len(checking_backup) == 0: - die(2, "Didn't find a backup to check") + return Result(OK) - backup_age = get_backup_age(checking_backup) + backup_age = _get_backup_age(checking_backup) if crit_threshold and backup_age >= crit_threshold: - status = 2 + status = CRIT elif warn_threshold and backup_age >= warn_threshold: - status = 1 + status = WARNING else: - return + return Result(OK) if checking_backup["state"] == "creating": message = f"Last backup was started {backup_age.days} days ago" else: message = f"Last backup was created {backup_age.days} days ago" - die(status, message) + return Result(status, message) -def check_backup_count(backups: List[Dict], backup_count_warn_threshold: int) -> None: +def _check_backup_count( + backups: List[Dict], backup_count_warn_threshold: int +) -> Result: """ Check that the number of backups is not too large. """ backup_count = len(backups) if backup_count_warn_threshold and backup_count >= backup_count_warn_threshold: - die( - 1, f"Too many backups exist: {backup_count} > {backup_count_warn_threshold}" + return Result( + WARNING, + f"Too many backups exist: {backup_count} > {backup_count_warn_threshold}", ) + return Result(OK) -def check_restored_data() -> None: + +def _check_restored_data() -> Result: """ Check count of failed parts on restore """ @@ -189,18 +195,59 @@ def check_restored_data() -> None: sum(len(table) for table in tables.values()) for tables in context["databases"].values() ) - if failed == 0: - return - failed_percent = int((failed / (failed + restored)) * 100) - die( - 1 if failed_percent < FAILED_PARTS_THRESHOLD else 2, - f"Some parts restore failed: {failed}({failed_percent}%)", - ) + if failed != 0: + failed_percent = int((failed / (failed + restored)) * 100) + status = 1 if failed_percent < FAILED_PARTS_THRESHOLD else 2 + return Result( + status, f"Some parts restore failed: {failed}({failed_percent}%)" + ) + + return Result(OK) -def get_backup_age(backup): +def _suppress_if_required(result: Result, min_uptime: timedelta) -> None: + if result.code == OK: + return + + if os.path.exists(LOAD_MONITOR_FLAG_PATH): + result.code = WARNING + result.message += " (suppressed by load monitor flag file)" + + if _get_uptime() < min_uptime: + result.code = WARNING + result.message += " (suppressed by low ClickHouse uptime)" + + +def _get_uptime() -> timedelta: + try: + return ClickhouseClient().get_uptime() + except Exception: + logging.warning("Failed to get ClickHouse uptime", exc_info=True) + return timedelta() + + +def _get_backups(): + try: + return get_backups(), Result(OK) + except Exception: + logging.exception("Failed to get backups") + return None, Result(CRIT, "Failed to get backups") + + +def _get_backup_age(backup): """ Calculate and return age of ClickHouse backup. """ backup_time = dateutil_parse(backup["start_time"]) return datetime.now(timezone.utc) - backup_time + + +def _merge_results(*results: Result) -> Result: + merged_result = Result() + for result in results: + if result.code > merged_result.code: + merged_result.code = result.code + merged_result.message = result.message + merged_result.verbose = result.verbose + + return merged_result From 2ee722d6539fceb8156e93cad3fa2377e569e1e5 Mon Sep 17 00:00:00 2001 From: Alexander Burmak Date: Mon, 13 Nov 2023 14:35:40 +0300 Subject: [PATCH 3/3] Minor corrections --- ch_tools/monrun_checks/ch_backup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ch_tools/monrun_checks/ch_backup.py b/ch_tools/monrun_checks/ch_backup.py index a5281aac..e52644b2 100644 --- a/ch_tools/monrun_checks/ch_backup.py +++ b/ch_tools/monrun_checks/ch_backup.py @@ -144,7 +144,7 @@ def _check_backup_age( checking_backup = backup break - if len(checking_backup) == 0: + if not checking_backup: return Result(OK) backup_age = _get_backup_age(checking_backup) @@ -197,7 +197,7 @@ def _check_restored_data() -> Result: ) if failed != 0: failed_percent = int((failed / (failed + restored)) * 100) - status = 1 if failed_percent < FAILED_PARTS_THRESHOLD else 2 + status = WARNING if failed_percent < FAILED_PARTS_THRESHOLD else CRIT return Result( status, f"Some parts restore failed: {failed}({failed_percent}%)" )