From 700802bebe580467417cef2888d88e994117acb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Tue, 23 Jan 2024 12:58:58 +0300 Subject: [PATCH] Wait replication sync command --- ch_tools/monrun_checks/ch_replication_lag.py | 4 ++ ch_tools/monrun_checks/ch_replication_sync.py | 42 +++++++++++++++++++ ch_tools/monrun_checks/main.py | 2 + tests/features/monrun.feature | 10 +++++ 4 files changed, 58 insertions(+) create mode 100644 ch_tools/monrun_checks/ch_replication_sync.py diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py index 761ef1d8..b4583100 100644 --- a/ch_tools/monrun_checks/ch_replication_lag.py +++ b/ch_tools/monrun_checks/ch_replication_lag.py @@ -56,6 +56,10 @@ ) @click.pass_context def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose): + return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose) + + +def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0): """ Check for replication lag across replicas. 
Should be: lag >= lag_with_errors, lag >= max_execution
diff --git a/ch_tools/monrun_checks/ch_replication_sync.py b/ch_tools/monrun_checks/ch_replication_sync.py
new file mode 100644
index 00000000..28f8ccab
--- /dev/null
+++ b/ch_tools/monrun_checks/ch_replication_sync.py
@@ -0,0 +1,42 @@
+import time
+
+from click import command, option, pass_context
+
+from ch_tools.common.result import Result
+from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag
+
+
+@command("wait-replication-sync")
+@option(
+    "-s",
+    "--status",
+    type=int,
+    default=0,
+    help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.",
+)
+@option(
+    "-p",
+    "--pause",
+    type=int,
+    default=30,
+    help="Pause between request in seconds, default is 30 seconds.",
+)
+@option(
+    "-t",
+    "--timeout",
+    type=int,
+    default=3 * 24 * 60 * 60,
+    help="Max amount of time to wait, in seconds. Default is 3 days.",
+)
+@pass_context
+def wait_replication_sync_command(ctx, status, pause, timeout):
+    """Wait for ClickHouse server to sync replication with other replicas."""
+    # Use a monotonic clock so wall-clock adjustments cannot move the deadline.
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        res = estimate_replication_lag(ctx)
+        if res.code <= status:  # lower code is healthier: 0 = OK, 1 = WARN, 2 = CRIT
+            return Result(code=0)
+        time.sleep(pause)
+
+    return Result(code=2, message=f"ClickHouse can't sync replica for {timeout} seconds")
diff --git a/ch_tools/monrun_checks/main.py b/ch_tools/monrun_checks/main.py
index 423db95c..fb8b396e 100644
--- a/ch_tools/monrun_checks/main.py
+++ b/ch_tools/monrun_checks/main.py
@@ -28,6 +28,7 @@
 from ch_tools.monrun_checks.ch_log_errors import log_errors_command
 from ch_tools.monrun_checks.ch_ping import ping_command
 from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command
+from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command
 from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command
 from ch_tools.monrun_checks.ch_ro_replica import 
ro_replica_command from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command @@ -135,6 +136,7 @@ def cli(ctx, ensure_monitoring_user): ping_command, log_errors_command, replication_lag_command, + wait_replication_sync_command, system_queues_command, core_dumps_command, dist_tables_command, diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature index 4006986c..b0718c21 100644 --- a/tests/features/monrun.feature +++ b/tests/features/monrun.feature @@ -371,3 +371,13 @@ Feature: ch-monitoring tool """ 2;KazooTimeoutError('Connection time-out') """ + + Scenario: Check wait replication sync + When we execute command on clickhouse01 + """ + ch-monitoring wait-replication-sync -t 10 -p 1 + """ + Then we get response + """ + 0;OK + """