Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wait replication sync command #91

Merged
merged 9 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ch_tools/monrun_checks/ch_replication_lag.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
)
@click.pass_context
def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose):
return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose)


def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0):
aalexfvk marked this conversation as resolved.
Show resolved Hide resolved
"""
Check for replication lag across replicas.
Should be: lag >= lag_with_errors, lag >= max_execution
Expand Down
42 changes: 42 additions & 0 deletions ch_tools/monrun_checks/ch_replication_sync.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import time

from click import command, option, pass_context

from ch_tools.common.result import Result
from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag


@command("wait-replication-sync")
@option(
"-s",
"--status",
type=int,
default=0,
help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.",
)
@option(
"-p",
"--pause",
type=int,
default=30,
help="Pause between request in seconds, default is 30 seconds.",
)
@option(
"-t",
"--timeout",
type=int,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest using TimeSpanParamType for this option.

Usage example:

@option(
"-g",
"--guard-interval",
"--to-time",
"to_time",
default=DEFAULT_GUARD_INTERVAL,
type=TimeSpanParamType(),
help=("End of inspecting interval in human-friendly format."),
)

default=3 * 24 * 60 * 60,
help="Max amount of time to wait, in seconds. Default is 30 days.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Default is 3 days, right? Not 30.

Also, default values should be added to help messages automatically, so there is no need to write them manually.

)
@pass_context
def wait_replication_sync_command(ctx, status, pause, timeout):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest moving the command to chadmin, and moving the estimate_replication_lag function out of ch_replication_lag.py to somewhere under https://github.com/yandex/ch-tools/blob/master/ch_tools/common

"""Wait for ClickHouse server to sync replication with other replicas."""

deadline = time.time() + timeout
while time.time() < deadline:
res = estimate_replication_lag(ctx)
aalexfvk marked this conversation as resolved.
Show resolved Hide resolved
if res.code <= status:
return Result(code=0)
time.sleep(pause)

return Result(code=2, message=f"ClickHouse can\'t sync replica for {timeout} seconds")
aalexfvk marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 2 additions & 0 deletions ch_tools/monrun_checks/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from ch_tools.monrun_checks.ch_log_errors import log_errors_command
from ch_tools.monrun_checks.ch_ping import ping_command
from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command
from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command
from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command
from ch_tools.monrun_checks.ch_ro_replica import ro_replica_command
from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command
Expand Down Expand Up @@ -135,6 +136,7 @@ def cli(ctx, ensure_monitoring_user):
ping_command,
log_errors_command,
replication_lag_command,
wait_replication_sync_command,
system_queues_command,
core_dumps_command,
dist_tables_command,
Expand Down
10 changes: 10 additions & 0 deletions tests/features/monrun.feature
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,13 @@ Feature: ch-monitoring tool
"""
2;KazooTimeoutError('Connection time-out')
"""

Scenario: Check wait replication sync
When we execute command on clickhouse01
"""
ch-monitoring wait-replication-sync -t 10 -p 1
"""
Then we get response
"""
0;OK
"""
Loading