Skip to content

Commit

Permalink
Lightweight sync (#256)
Browse files (browse the repository at this point in the history)
* Lightweight sync replica

* Make lightweight a default option

* Fix dead ch case
  • Loading branch information
kirillgarbar authored Nov 26, 2024
1 parent 4afa499 commit c562fa9
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 7 deletions.
27 changes: 26 additions & 1 deletion ch_tools/chadmin/cli/wait_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from ch_tools.chadmin.cli.chadmin_group import Chadmin
from ch_tools.chadmin.internal.clickhouse_disks import S3_METADATA_STORE_PATH
from ch_tools.chadmin.internal.system import match_ch_version
from ch_tools.chadmin.internal.table_replica import list_table_replicas
from ch_tools.chadmin.internal.utils import execute_query
from ch_tools.common import logging
Expand Down Expand Up @@ -80,6 +81,12 @@ def wait_group():
type=FloatRange(0.0, 100.0),
help="Warning threshold in percent of max_replicated_merges_in_queue.",
)
@option(
"--lightweight/--full",
is_flag=True,
default=True,
help="Use SYNC REPLICA with LIGHTWEIGHT option and skip replication lag check.",
)
@pass_context
def wait_replication_sync_command(
ctx,
Expand All @@ -92,8 +99,19 @@ def wait_replication_sync_command(
warn,
mwarn,
mcrit,
lightweight,
):
"""Wait for ClickHouse server to sync replication with other replicas."""
# The LIGHTWEIGHT mode of SYSTEM SYNC REPLICA was added in ClickHouse 23.4; fall back to a full sync on older servers.
try:
if lightweight and not match_ch_version(ctx, "23.4"):
logging.warning(
"Lightweight sync requires version 23.4, will do full sync instead."
)
lightweight = False
except Exception:
logging.error("Connection error while getting CH version.")
sys.exit(1)

start_time = time.time()
deadline = start_time + total_timeout.total_seconds()
Expand All @@ -104,9 +122,13 @@ def wait_replication_sync_command(
full_name = f"`{replica['database']}`.`{replica['table']}`"
time_left = deadline - time.time()

query = f"SYSTEM SYNC REPLICA {full_name} LIGHTWEIGHT"
if not lightweight:
query = f"SYSTEM SYNC REPLICA {full_name}"

execute_query(
ctx,
f"SYSTEM SYNC REPLICA {full_name}",
query,
format_=None,
timeout=replica_timeout.total_seconds(),
settings={"receive_timeout": time_left},
Expand All @@ -123,6 +145,9 @@ def wait_replication_sync_command(
sys.exit(1)
raise

if lightweight:
sys.exit(0)

# Replication lag
while time.time() < deadline:
res = estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit)
Expand Down
18 changes: 12 additions & 6 deletions tests/features/chadmin.feature
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ Feature: chadmin commands.
| --database=db2 | table_01\tReplicatedMergeTree\ntable_02\tMergeTree\ntable_03\tReplicatedMergeTree\ntable_04\tReplicatedMergeTree |
| --database=db2 --table=table_01 | table_01\tReplicatedMergeTree\ntable_02\tMergeTree\ntable_03\tMergeTree\ntable_04\tMergeTree |

Scenario: Check wait replication sync
Scenario Outline: Check wait replication sync
Given we have executed queries on clickhouse01
"""
CREATE DATABASE IF NOT EXISTS test ON CLUSTER 'cluster';
Expand All @@ -128,7 +128,7 @@ Feature: chadmin commands.
"""
When we execute command on clickhouse01
"""
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 <options>
"""
When we execute query on clickhouse01
"""
Expand All @@ -141,7 +141,7 @@ Feature: chadmin commands.
And we sleep for 5 seconds
When we try to execute command on clickhouse01
"""
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 <options>
"""
Then it fails with response contains
"""
Expand All @@ -157,20 +157,26 @@ Feature: chadmin commands.
"""
When we try to execute command on clickhouse01
"""
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 <options>
"""
Then it fails with response contains
"""
Connection error while running query.
Connection error
"""
When we execute command on clickhouse01
"""
supervisorctl start clickhouse-server
"""
When we execute command on clickhouse01
"""
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4
chadmin wait replication-sync --total-timeout 10 --replica-timeout 3 -p 1 -w 4 <options>
"""
Examples:
| options |
| |
| --lightweight |
| --full |


Scenario Outline: Check replica restore (<replicas_count> replicas, <workers> workers)
Given populated clickhouse with <replicas_count> replicated tables on clickhouse01 with db database and table_ prefix
Expand Down

0 comments on commit c562fa9

Please sign in to comment.