diff --git a/ch_tools/chadmin/cli/wait_group.py b/ch_tools/chadmin/cli/wait_group.py index f3843446..201a4003 100644 --- a/ch_tools/chadmin/cli/wait_group.py +++ b/ch_tools/chadmin/cli/wait_group.py @@ -7,6 +7,7 @@ from ch_tools.chadmin.cli.chadmin_group import Chadmin from ch_tools.chadmin.internal.clickhouse_disks import S3_METADATA_STORE_PATH +from ch_tools.chadmin.internal.system import match_ch_version from ch_tools.chadmin.internal.table_replica import list_table_replicas from ch_tools.chadmin.internal.utils import execute_query from ch_tools.common import logging @@ -80,6 +81,12 @@ def wait_group(): type=FloatRange(0.0, 100.0), help="Warning threshold in percent of max_replicated_merges_in_queue.", ) +@option( + "--lightweight/--full", + is_flag=True, + default=True, + help="Use SYNC REPLICA with LIGHTWEIGHT option and skip replication lag check.", +) @pass_context def wait_replication_sync_command( ctx, @@ -92,8 +99,19 @@ def wait_replication_sync_command( warn, mwarn, mcrit, + lightweight, ): """Wait for ClickHouse server to sync replication with other replicas.""" + # Lightweight sync is added in 23.4 + try: + if lightweight and not match_ch_version(ctx, "23.4"): + logging.warning( + "Lightweight sync requires version 23.4, will do full sync instead." + ) + lightweight = False + except Exception: + logging.error("Connection error while getting CH version.") + sys.exit(1) start_time = time.time() deadline = start_time + total_timeout.total_seconds() @@ -104,9 +122,13 @@ def wait_replication_sync_command( full_name = f"`{replica['database']}`.`{replica['table']}`" time_left = deadline - time.time() + query = f"SYSTEM SYNC REPLICA {full_name} LIGHTWEIGHT" + if not lightweight: + query = f"SYSTEM SYNC REPLICA {full_name}" + execute_query( ctx, - f"SYSTEM SYNC REPLICA {full_name}", + query, format_=None, timeout=replica_timeout.total_seconds(), settings={"receive_timeout": time_left}, @@ -123,6 +145,9 @@ def wait_replication_sync_command( sys.exit(1) raise + if lightweight: + sys.exit(0) + # Replication lag while time.time() < deadline: res = estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit) diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature index b571c5a3..2382a88f 100644 --- a/tests/features/chadmin.feature +++ b/tests/features/chadmin.feature @@ -118,7 +118,7 @@ Feature: chadmin commands. | --database=db2 | table_01\tReplicatedMergeTree\ntable_02\tMergeTree\ntable_03\tReplicatedMergeTree\ntable_04\tReplicatedMergeTree | | --database=db2 --table=table_01 | table_01\tReplicatedMergeTree\ntable_02\tMergeTree\ntable_03\tMergeTree\ntable_04\tMergeTree | - Scenario: Check wait replication sync + Scenario Outline: Check wait replication sync Given we have executed queries on clickhouse01 """ CREATE DATABASE IF NOT EXISTS test ON CLUSTER 'cluster'; @@ -128,7 +128,7 @@ Feature: chadmin commands. """ When we execute command on clickhouse01 """ - chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 + chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 """ When we execute query on clickhouse01 """ @@ -141,7 +141,7 @@ Feature: chadmin commands. And we sleep for 5 seconds When we try to execute command on clickhouse01 """ - chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 + chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 """ Then it fails with response contains """ @@ -157,11 +157,11 @@ Feature: chadmin commands. """ When we try to execute command on clickhouse01 """ - chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 + chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 """ Then it fails with response contains """ - Connection error while running query. + Connection error """ When we execute command on clickhouse01 """ @@ -169,8 +169,14 @@ Feature: chadmin commands. """ When we execute command on clickhouse01 """ - chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 + chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 """ + Examples: + | options | + | | + | --lightweight | + | --full | + Scenario Outline: Check replica restore ( replicas, workers) Given populated clickhouse with replicated tables on clickhouse01 with db database and table_ prefix