Skip to content

Commit

Permalink
Wait replication sync command
Browse files Browse the repository at this point in the history
  • Loading branch information
kirillgarbar committed Jan 23, 2024
1 parent baaf843 commit 700802b
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 0 deletions.
4 changes: 4 additions & 0 deletions ch_tools/monrun_checks/ch_replication_lag.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
)
@click.pass_context
def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose):
return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose)


def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0):
"""
Check for replication lag across replicas.
Should be: lag >= lag_with_errors, lag >= max_execution
Expand Down
42 changes: 42 additions & 0 deletions ch_tools/monrun_checks/ch_replication_sync.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import time

from click import command, option, pass_context

from ch_tools.common.result import Result
from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag


@command("wait-replication-sync")
@option(
    "-s",
    "--status",
    type=int,
    default=0,
    help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.",
)
@option(
    "-p",
    "--pause",
    type=int,
    default=30,
    help="Pause between request in seconds, default is 30 seconds.",
)
@option(
    "-t",
    "--timeout",
    type=int,
    default=3 * 24 * 60 * 60,
    # The default above is 3 days (3 * 24h), so the help text must say so.
    help="Max amount of time to wait, in seconds. Default is 3 days.",
)
@pass_context
def wait_replication_sync_command(ctx, status, pause, timeout):
    """Wait for ClickHouse server to sync replication with other replicas."""
    # NOTE: the docstring above is rendered by click as the command's --help
    # text, so implementation details are kept in comments instead.
    #
    # Parameters (all supplied by click from the options above):
    #   ctx     - click context, forwarded unchanged to estimate_replication_lag.
    #   status  - worst acceptable status code of the lag check (0/1/2).
    #   pause   - seconds to sleep between consecutive lag checks.
    #   timeout - maximum total number of seconds to keep waiting.
    #
    # Returns Result(code=0) as soon as the lag check reports a code that is
    # no worse than `status`, otherwise Result(code=2, ...) once the timeout
    # has expired.

    # A monotonic clock is used so that wall-clock adjustments (NTP, manual
    # changes) can neither shorten nor extend the waiting period.
    deadline = time.monotonic() + timeout

    while True:
        # The check runs at least once, even for a zero/negative timeout, and
        # one final time right at the deadline.
        res = estimate_replication_lag(ctx)
        if res.code <= status:
            return Result(code=0)

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Never sleep past the deadline: the last pause is shortened so the
        # command does not overshoot the requested timeout.
        time.sleep(min(pause, remaining))

    return Result(code=2, message=f"ClickHouse can't sync replica for {timeout} seconds")
2 changes: 2 additions & 0 deletions ch_tools/monrun_checks/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from ch_tools.monrun_checks.ch_log_errors import log_errors_command
from ch_tools.monrun_checks.ch_ping import ping_command
from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command
from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command
from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command
from ch_tools.monrun_checks.ch_ro_replica import ro_replica_command
from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command
Expand Down Expand Up @@ -135,6 +136,7 @@ def cli(ctx, ensure_monitoring_user):
ping_command,
log_errors_command,
replication_lag_command,
wait_replication_sync_command,
system_queues_command,
core_dumps_command,
dist_tables_command,
Expand Down
10 changes: 10 additions & 0 deletions tests/features/monrun.feature
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,13 @@ Feature: ch-monitoring tool
"""
2;KazooTimeoutError('Connection time-out')
"""

Scenario: Check wait replication sync
When we execute command on clickhouse01
"""
ch-monitoring wait-replication-sync -t 10 -p 1
"""
Then we get response
"""
0;OK
"""

0 comments on commit 700802b

Please sign in to comment.