From 700802bebe580467417cef2888d88e994117acb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Tue, 23 Jan 2024 12:58:58 +0300
Subject: [PATCH 1/9] Wait replication sync command

---
 ch_tools/monrun_checks/ch_replication_lag.py  |  4 ++
 ch_tools/monrun_checks/ch_replication_sync.py | 42 +++++++++++++++++++
 ch_tools/monrun_checks/main.py                |  2 +
 tests/features/monrun.feature                 | 10 +++++
 4 files changed, 58 insertions(+)
 create mode 100644 ch_tools/monrun_checks/ch_replication_sync.py

diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py
index 761ef1d8..b4583100 100644
--- a/ch_tools/monrun_checks/ch_replication_lag.py
+++ b/ch_tools/monrun_checks/ch_replication_lag.py
@@ -56,6 +56,10 @@
 )
 @click.pass_context
 def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose):
+    return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose)
+
+
+def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0):
     """
     Check for replication lag across replicas.
     Should be: lag >= lag_with_errors, lag >= max_execution
diff --git a/ch_tools/monrun_checks/ch_replication_sync.py b/ch_tools/monrun_checks/ch_replication_sync.py
new file mode 100644
index 00000000..28f8ccab
--- /dev/null
+++ b/ch_tools/monrun_checks/ch_replication_sync.py
@@ -0,0 +1,42 @@
+import time
+
+from click import command, option, pass_context
+
+from ch_tools.common.result import Result
+from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag
+
+
+@command("wait-replication-sync")
+@option(
+    "-s",
+    "--status",
+    type=int,
+    default=0,
+    help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.",
+)
+@option(
+    "-p",
+    "--pause",
+    type=int,
+    default=30,
+    help="Pause between request in seconds, default is 30 seconds.",
+)
+@option(
+    "-t",
+    "--timeout",
+    type=int,
+    default=3 * 24 * 60 * 60,
+    help="Max amount of time to wait, in seconds. Default is 30 days.",
+)
+@pass_context
+def wait_replication_sync_command(ctx, status, pause, timeout):
+    """Wait for ClickHouse server to sync replication with other replicas."""
+
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        res = estimate_replication_lag(ctx)
+        if res.code <= status:
+            return Result(code=0) 
+        time.sleep(pause)
+
+    return Result(code=2, message=f"ClickHouse can\'t sync replica for {timeout} seconds")
diff --git a/ch_tools/monrun_checks/main.py b/ch_tools/monrun_checks/main.py
index 423db95c..fb8b396e 100644
--- a/ch_tools/monrun_checks/main.py
+++ b/ch_tools/monrun_checks/main.py
@@ -28,6 +28,7 @@
 from ch_tools.monrun_checks.ch_log_errors import log_errors_command
 from ch_tools.monrun_checks.ch_ping import ping_command
 from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command
+from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command
 from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command
 from ch_tools.monrun_checks.ch_ro_replica import ro_replica_command
 from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command
@@ -135,6 +136,7 @@ def cli(ctx, ensure_monitoring_user):
     ping_command,
     log_errors_command,
     replication_lag_command,
+    wait_replication_sync_command,
     system_queues_command,
     core_dumps_command,
     dist_tables_command,
diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature
index 4006986c..b0718c21 100644
--- a/tests/features/monrun.feature
+++ b/tests/features/monrun.feature
@@ -371,3 +371,13 @@ Feature: ch-monitoring tool
     """
     2;KazooTimeoutError('Connection time-out')
     """
+
+  Scenario: Check wait replication sync
+    When we execute command on clickhouse01
+    """
+    ch-monitoring wait-replication-sync -t 10 -p 1
+    """
+    Then we get response
+    """
+    0;OK
+    """

From 9b42451ef3cb508bda358b7df303fd3e62cad4af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Tue, 23 Jan 2024 14:10:50 +0300
Subject: [PATCH 2/9] Test for replication lag

---
 tests/features/monrun.feature | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature
index b0718c21..243c1d02 100644
--- a/tests/features/monrun.feature
+++ b/tests/features/monrun.feature
@@ -139,6 +139,26 @@ Feature: ch-monitoring tool
     """
     0;OK
     """
+    When we execute query on clickhouse01
+    """
+    SYSTEM STOP FETCHES
+    """
+    And we execute query on clickhouse02
+    """
+    INSERT INTO test.table_01 SELECT number FROM numbers(100)
+    """
+    And we execute command on clickhouse02
+    """
+    sleep 5
+    """
+    And we execute command on clickhouse01
+    """
+    ch-monitoring replication-lag -w 4
+    """
+    Then we get response contains
+    """
+    1;
+    """
 
   Scenario: Check System queues size
     When we execute command on clickhouse01

From 456b693f7fb4318908091a78ab3c39ddb143e5db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Tue, 23 Jan 2024 19:17:50 +0300
Subject: [PATCH 3/9] Move wait replication to chadmin

---
 .../cli/wait_replication_sync_command.py}     | 25 ++++++++++---------
 tests/features/chadmin.feature                | 15 +++++++++++
 tests/features/monrun.feature                 | 10 --------
 3 files changed, 28 insertions(+), 22 deletions(-)
 rename ch_tools/{monrun_checks/ch_replication_sync.py => chadmin/cli/wait_replication_sync_command.py} (57%)
 create mode 100644 tests/features/chadmin.feature

diff --git a/ch_tools/monrun_checks/ch_replication_sync.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py
similarity index 57%
rename from ch_tools/monrun_checks/ch_replication_sync.py
rename to ch_tools/chadmin/cli/wait_replication_sync_command.py
index 28f8ccab..81cf1ff8 100644
--- a/ch_tools/monrun_checks/ch_replication_sync.py
+++ b/ch_tools/chadmin/cli/wait_replication_sync_command.py
@@ -1,9 +1,10 @@
 import time
+import sys
 
 from click import command, option, pass_context
 
-from ch_tools.common.result import Result
 from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag
+from ch_tools.common.cli.parameters import TimeSpanParamType
 
 
 @command("wait-replication-sync")
@@ -12,31 +13,31 @@
     "--status",
     type=int,
     default=0,
-    help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.",
+    help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.",
 )
 @option(
     "-p",
     "--pause",
-    type=int,
-    default=30,
-    help="Pause between request in seconds, default is 30 seconds.",
+    type=TimeSpanParamType(),
+    default="30s",
+    help="Pause between requests.",
 )
 @option(
     "-t",
     "--timeout",
-    type=int,
-    default=3 * 24 * 60 * 60,
-    help="Max amount of time to wait, in seconds. Default is 30 days.",
+    type=TimeSpanParamType(),
+    default="3d",
+    help="Max amount of time to wait.",
 )
 @pass_context
 def wait_replication_sync_command(ctx, status, pause, timeout):
     """Wait for ClickHouse server to sync replication with other replicas."""
 
-    deadline = time.time() + timeout
+    deadline = time.time() + timeout.total_seconds()
     while time.time() < deadline:
         res = estimate_replication_lag(ctx)
         if res.code <= status:
-            return Result(code=0) 
-        time.sleep(pause)
+            sys.exit(0)
+        time.sleep(pause.total_seconds())
 
-    return Result(code=2, message=f"ClickHouse can\'t sync replica for {timeout} seconds")
+    sys.exit(1)
diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature
new file mode 100644
index 00000000..175eaa5a
--- /dev/null
+++ b/tests/features/chadmin.feature
@@ -0,0 +1,15 @@
+Feature: chadmin commands.
+
+  Background:
+    Given default configuration
+    And a working s3
+    And a working zookeeper
+    And a working clickhouse on clickhouse01
+    And a working clickhouse on clickhouse02
+
+
+  Scenario: Check wait replication sync
+    When we execute command on clickhouse01
+    """
+    chadmin wait-replication-sync -t 10 -p 1
+    """
diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature
index 243c1d02..7d50b97d 100644
--- a/tests/features/monrun.feature
+++ b/tests/features/monrun.feature
@@ -391,13 +391,3 @@ Feature: ch-monitoring tool
     """
     2;KazooTimeoutError('Connection time-out')
     """
-
-  Scenario: Check wait replication sync
-    When we execute command on clickhouse01
-    """
-    ch-monitoring wait-replication-sync -t 10 -p 1
-    """
-    Then we get response
-    """
-    0;OK
-    """

From 43db5e69d449f1b5955e15fb4d5b93aeb428fcc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Wed, 24 Jan 2024 13:14:18 +0300
Subject: [PATCH 4/9] Move replication lag to common

---
 ch_tools/chadmin/chadmin_cli.py               |   2 +
 .../cli/wait_replication_sync_command.py      |   4 +-
 ch_tools/common/replication_lag.py            | 275 +++++++++++++++++
 ch_tools/monrun_checks/ch_replication_lag.py  | 277 +-----------------
 ch_tools/monrun_checks/main.py                |   2 -
 5 files changed, 281 insertions(+), 279 deletions(-)
 create mode 100644 ch_tools/common/replication_lag.py

diff --git a/ch_tools/chadmin/chadmin_cli.py b/ch_tools/chadmin/chadmin_cli.py
index 1c8fdbef..2cd275eb 100755
--- a/ch_tools/chadmin/chadmin_cli.py
+++ b/ch_tools/chadmin/chadmin_cli.py
@@ -42,6 +42,7 @@
 from ch_tools.chadmin.cli.table_group import table_group
 from ch_tools.chadmin.cli.table_replica_group import table_replica_group
 from ch_tools.chadmin.cli.thread_log_group import thread_log_group
+from ch_tools.chadmin.cli.wait_replication_sync_command import wait_replication_sync_command
 from ch_tools.chadmin.cli.wait_started_command import wait_started_command
 from ch_tools.chadmin.cli.zookeeper_group import zookeeper_group
 from ch_tools.common.cli.context_settings import CONTEXT_SETTINGS
@@ -116,6 +117,7 @@ def cli(ctx, format_, settings, timeout, port, debug):
     list_settings_command,
     restore_replica_command,
     stack_trace_command,
+    wait_replication_sync_command,
     wait_started_command,
 ]
 
diff --git a/ch_tools/chadmin/cli/wait_replication_sync_command.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py
index 81cf1ff8..4cdee1ad 100644
--- a/ch_tools/chadmin/cli/wait_replication_sync_command.py
+++ b/ch_tools/chadmin/cli/wait_replication_sync_command.py
@@ -1,9 +1,10 @@
+import logging
 import time
 import sys
 
 from click import command, option, pass_context
 
-from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag
+from ch_tools.common.replication_lag import estimate_replication_lag
 from ch_tools.common.cli.parameters import TimeSpanParamType
 
 
@@ -40,4 +41,5 @@ def wait_replication_sync_command(ctx, status, pause, timeout):
             sys.exit(0)
         time.sleep(pause.total_seconds())
 
+    logging.error(f"ClickHouse can't sync replica.")
     sys.exit(1)
diff --git a/ch_tools/common/replication_lag.py b/ch_tools/common/replication_lag.py
new file mode 100644
index 00000000..c230c520
--- /dev/null
+++ b/ch_tools/common/replication_lag.py
@@ -0,0 +1,275 @@
+from typing import Any, Dict
+
+from tabulate import tabulate
+
+from ch_tools.common.result import Result
+from ch_tools.common.clickhouse.client.clickhouse_client import clickhouse_client
+
+
+XCRIT=3600
+CRIT=600
+WARN=300
+MWARN=50.0
+MCRIT=90.0
+VERBOSE=0
+
+
+def estimate_replication_lag(ctx, xcrit=XCRIT, crit=CRIT, warn=WARN, mwarn=MWARN, mcrit=MCRIT, verbose=VERBOSE):
+    """
+    Check for replication lag across replicas.
+    Should be: lag >= lag_with_errors, lag >= max_execution
+    """
+    # pylint: disable=too-many-branches,too-many-locals
+    ch_client = clickhouse_client(ctx)
+    lag, lag_with_errors, max_execution, max_merges, chart = get_replication_lag(
+        ch_client
+    )
+
+    msg_verbose = ""
+    msg_verbose_2 = "\n\n"
+
+    if verbose >= 1:
+        verbtab = []
+
+        headers = [
+            "Table",
+            "Lag [s]",
+            "Tasks",
+            "Max task execution [s]",
+            "Non-retrayable errors",
+            "Has user fault errors",
+            "Merges with 1000+ tries",
+        ]
+        for key, item in chart.items():
+            if item.get("multi_replicas", False):
+                tabletab = [
+                    key,
+                    item.get("delay", 0),
+                    item.get("tasks", 0),
+                    item.get("max_execution", 0),
+                    item.get("errors", 0),
+                    item.get("userfault", False),  # key set in get_replication_lag
+                    item.get("retried_merges", 0),
+                ]
+                verbtab.append(tabletab)
+                if verbose >= 2:
+                    exceptions_retrayable = ""
+                    exceptions_non_retrayable = ""
+                    exceptions_ignored = ""
+                    for exception in item.get("exceptions", []):
+                        if exception:
+                            if is_userfault_exception(exception):
+                                exceptions_ignored += "\t" + exception[5:] + "\n"
+                            elif exception.startswith("<pr> "):
+                                exceptions_retrayable += "\t" + exception[5:] + "\n"
+                            else:
+                                exceptions_non_retrayable += "\t" + exception[5:] + "\n"
+                    max_execution_part = (
+                        item.get("max_execution_part", "")
+                        if item.get("max_execution", 0)
+                        else 0
+                    )
+                    if (
+                        exceptions_retrayable
+                        or exceptions_non_retrayable
+                        or exceptions_ignored
+                        or max_execution_part
+                    ):
+                        msg_verbose_2 = msg_verbose_2 + key + ":\n"
+                    if exceptions_non_retrayable:
+                        msg_verbose_2 = (
+                            msg_verbose_2
+                            + "  Non-retrayable errors:\n"
+                            + exceptions_non_retrayable
+                        )
+                    if exceptions_retrayable:
+                        msg_verbose_2 = (
+                            msg_verbose_2
+                            + "  Retrayable errors:\n"
+                            + exceptions_retrayable
+                        )
+                    if exceptions_ignored:
+                        msg_verbose_2 = (
+                            msg_verbose_2
+                            + "  User fault errors:\n"
+                            + exceptions_ignored
+                        )
+                    if max_execution_part:
+                        msg_verbose_2 = (
+                            msg_verbose_2
+                            + "  Result part of task with max execution time: "
+                            + max_execution_part
+                            + "\n"
+                        )
+        msg_verbose = tabulate(verbtab, headers=headers)
+        if verbose >= 2:
+            msg_verbose = msg_verbose + msg_verbose_2
+
+    max_merges_warn_threshold = 1
+    max_merges_crit_threshold = 1
+    if max_merges > 0:
+        max_replicated_merges_in_queue = get_max_replicated_merges_in_queue(ch_client)
+        max_merges_warn_threshold = int(max_replicated_merges_in_queue * mwarn / 100.0)
+        max_merges_crit_threshold = int(max_replicated_merges_in_queue * mcrit / 100.0)
+
+    if lag < warn and max_merges < max_merges_warn_threshold:
+        return Result(code=0, message="OK", verbose=msg_verbose)
+
+    msg = "Max {0} seconds, with errors {1} seconds, max task execution {2} seconds, max merges in queue {3}".format(
+        lag, lag_with_errors, max_execution, max_merges
+    )
+
+    if (
+        lag_with_errors < crit
+        and max_execution < xcrit
+        and max_merges < max_merges_crit_threshold
+    ):
+        return Result(code=1, message=msg, verbose=msg_verbose)
+
+    return Result(code=2, message=msg, verbose=msg_verbose)
+
+
+def get_replication_lag(ch_client):
+    """
+    Get max absolute_delay from system.replicas.
+    """
+
+    tables = get_tables_with_replication_delay(ch_client)
+    chart: Dict[str, Dict[str, Any]] = {}
+    for t in tables:
+        key = "{database}.{table}".format(database=t["database"], table=t["table"])
+        chart[key] = {}
+        chart[key]["delay"] = int(t["absolute_delay"])
+    tables = filter_out_single_replica_tables(ch_client, tables)
+    for t in tables:
+        key = "{database}.{table}".format(database=t["database"], table=t["table"])
+        chart[key]["multi_replicas"] = True
+    tables = count_errors(ch_client, tables, -1)
+
+    max_merges = 0
+    for t in tables:
+        key = "{database}.{table}".format(database=t["database"], table=t["table"])
+        chart[key]["tasks"] = int(t["tasks"])
+        chart[key]["errors"] = int(t["errors"])
+        chart[key]["max_execution"] = int(t["max_execution"])
+        chart[key]["max_execution_part"] = t["max_execution_part"]
+        chart[key]["exceptions"] = t["exceptions"]
+        chart[key]["retried_merges"] = int(t["retried_merges"])
+        max_merges = max(int(t["retried_merges"]), max_merges)
+        for exception in t["exceptions"]:
+            if is_userfault_exception(exception):
+                chart[key]["userfault"] = True
+                break
+
+    lag = 0
+    lag_with_errors = 0
+    max_execution = 0
+    for key, item in chart.items():
+        if item.get("multi_replicas", False):
+            delay = item.get("delay", 0)
+            if delay > lag:
+                lag = delay
+            if (
+                delay > lag_with_errors
+                and item.get("errors", 0) > 0
+                and not item.get("userfault", False)
+            ):
+                lag_with_errors = delay
+            execution = item.get("max_execution", 0)
+            if execution > max_execution:
+                max_execution = execution
+
+    return lag, lag_with_errors, max_execution, max_merges, chart
+
+
+def get_tables_with_replication_delay(ch_client):
+    """
+    Get tables with absolute_delay > 0.
+    """
+    query = "SELECT database, table, zookeeper_path, absolute_delay FROM system.replicas WHERE absolute_delay > 0"
+    return ch_client.query(query=query, format_="JSON")["data"]
+
+
+def filter_out_single_replica_tables(ch_client, tables):
+    if not tables:
+        return tables
+
+    query = """
+        SELECT
+            database,
+            table,
+            zookeeper_path
+        FROM system.replicas
+        WHERE (database, table) IN ({tables})
+        AND total_replicas > 1
+        """.format(
+        tables=",".join(
+            "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables
+        )
+    )
+    return ch_client.query(query=query, format_="JSON")["data"]
+
+
+def count_errors(ch_client, tables, exceptions_limit):
+    """
+    Add count of replication errors.
+    """
+    if not tables:
+        return tables
+
+    limit = "" if exceptions_limit < 0 else "({})".format(exceptions_limit)
+
+    query = """
+        SELECT
+            database,
+            table,
+            count() as tasks,
+            countIf(last_exception != '' AND postpone_reason = '') as errors,
+            max(IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution,
+            groupUniqArray{limit}(IF(last_exception != '', concat(IF(postpone_reason = '', '     ', '<pr> '), last_exception), '')) as exceptions,
+            argMax(new_part_name, IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution_part,
+            countIf(type = 'MERGE_PARTS' and num_tries >= 1000) as retried_merges
+        FROM system.replication_queue
+        WHERE (database, table) IN ({tables})
+        GROUP BY database,table
+        """.format(
+        tables=",".join(
+            "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables
+        ),
+        limit=limit,
+    )
+    return ch_client.query(query=query, format_="JSON")["data"]
+
+
+def is_userfault_exception(exception):
+    """
+    Check if exception was caused by user.
+    Current list:
+      * DB::Exception: Cannot reserve 1.00 MiB, not enough space
+      * DB::Exception: Incorrect data: Sign = -127 (must be 1 or -1)
+    """
+
+    if "DB::Exception: Cannot reserve" in exception and "not enough space" in exception:
+        return True
+    if (
+        "DB::Exception: Incorrect data: Sign" in exception
+        and "(must be 1 or -1)" in exception
+    ):
+        return True
+
+    return False
+
+
+def get_max_replicated_merges_in_queue(ch_client):
+    """
+    Get max_replicated_merges_in_queue value
+    """
+    query = """
+        SELECT value FROM system.merge_tree_settings WHERE name='max_replicated_merges_in_queue'
+    """
+    res = ch_client.query(query=query, format_="JSONCompact")["data"]
+    if not res:
+        return (
+            16  # 16 is default value for 'max_replicated_merges_in_queue' in ClickHouse
+        )
+    return int(res[0][0])
\ No newline at end of file
diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py
index b4583100..53558363 100644
--- a/ch_tools/monrun_checks/ch_replication_lag.py
+++ b/ch_tools/monrun_checks/ch_replication_lag.py
@@ -2,11 +2,7 @@
 from typing import Any, Dict
 
 import click
-from tabulate import tabulate
-
-from ch_tools.common.result import Result
-from ch_tools.monrun_checks.clickhouse_client import ClickhouseClient
-from ch_tools.monrun_checks.clickhouse_info import ClickhouseInfo
+from ch_tools.common.replication_lag import estimate_replication_lag
 
 
 @click.command("replication-lag")
@@ -57,274 +53,3 @@
 @click.pass_context
 def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose):
     return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose)
-
-
-def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0):
-    """
-    Check for replication lag across replicas.
-    Should be: lag >= lag_with_errors, lag >= max_execution
-    """
-    # pylint: disable=too-many-branches,too-many-locals
-    ch_client = ClickhouseClient(ctx)
-    lag, lag_with_errors, max_execution, max_merges, chart = get_replication_lag(
-        ch_client
-    )
-
-    msg_verbose = ""
-    msg_verbose_2 = "\n\n"
-
-    if verbose >= 1:
-        verbtab = []
-
-        headers = [
-            "Table",
-            "Lag [s]",
-            "Tasks",
-            "Max task execution [s]",
-            "Non-retrayable errors",
-            "Has user fault errors",
-            "Merges with 1000+ tries",
-        ]
-        for key, item in chart.items():
-            if item.get("multi_replicas", False):
-                tabletab = [
-                    key,
-                    item.get("delay", 0),
-                    item.get("tasks", 0),
-                    item.get("max_execution", 0),
-                    item.get("errors", 0),
-                    item.get("user_fault", False),
-                    item.get("retried_merges", 0),
-                ]
-                verbtab.append(tabletab)
-                if verbose >= 2:
-                    exceptions_retrayable = ""
-                    exceptions_non_retrayable = ""
-                    exceptions_ignored = ""
-                    for exception in item.get("exceptions", []):
-                        if exception:
-                            if is_userfault_exception(exception):
-                                exceptions_ignored += "\t" + exception[5:] + "\n"
-                            elif exception.startswith("<pr> "):
-                                exceptions_retrayable += "\t" + exception[5:] + "\n"
-                            else:
-                                exceptions_non_retrayable += "\t" + exception[5:] + "\n"
-                    max_execution_part = (
-                        item.get("max_execution_part", "")
-                        if item.get("max_execution", 0)
-                        else 0
-                    )
-                    if (
-                        exceptions_retrayable
-                        or exceptions_non_retrayable
-                        or exceptions_ignored
-                        or max_execution_part
-                    ):
-                        msg_verbose_2 = msg_verbose_2 + key + ":\n"
-                    if exceptions_non_retrayable:
-                        msg_verbose_2 = (
-                            msg_verbose_2
-                            + "  Non-retrayable errors:\n"
-                            + exceptions_non_retrayable
-                        )
-                    if exceptions_retrayable:
-                        msg_verbose_2 = (
-                            msg_verbose_2
-                            + "  Retrayable errors:\n"
-                            + exceptions_retrayable
-                        )
-                    if exceptions_ignored:
-                        msg_verbose_2 = (
-                            msg_verbose_2
-                            + "  User fault errors:\n"
-                            + exceptions_ignored
-                        )
-                    if max_execution_part:
-                        msg_verbose_2 = (
-                            msg_verbose_2
-                            + "  Result part of task with max execution time: "
-                            + max_execution_part
-                            + "\n"
-                        )
-        msg_verbose = tabulate(verbtab, headers=headers)
-        if verbose >= 2:
-            msg_verbose = msg_verbose + msg_verbose_2
-
-    max_merges_warn_threshold = 1
-    max_merges_crit_threshold = 1
-    if max_merges > 0:
-        max_replicated_merges_in_queue = get_max_replicated_merges_in_queue(ch_client)
-        max_merges_warn_threshold = int(max_replicated_merges_in_queue * mwarn / 100.0)
-        max_merges_crit_threshold = int(max_replicated_merges_in_queue * mcrit / 100.0)
-
-    if lag < warn and max_merges < max_merges_warn_threshold:
-        return Result(code=0, message="OK", verbose=msg_verbose)
-
-    msg = "Max {0} seconds, with errors {1} seconds, max task execution {2} seconds, max merges in queue {3}".format(
-        lag, lag_with_errors, max_execution, max_merges
-    )
-
-    try:
-        replica_versions_mismatch = ClickhouseInfo.get_versions_count(ctx) > 1
-        if replica_versions_mismatch:
-            msg += ", ClickHouse versions on replicas mismatch"
-            return Result(code=1, message=msg, verbose=msg_verbose)
-    except Exception:
-        logging.warning("Unable to get version info from replicas", exc_info=True)
-        msg += ", one or more replicas is unavailable"
-        return Result(code=1, message=msg, verbose=msg_verbose)
-
-    if (
-        lag_with_errors < crit
-        and max_execution < xcrit
-        and max_merges < max_merges_crit_threshold
-    ):
-        return Result(code=1, message=msg, verbose=msg_verbose)
-
-    return Result(code=2, message=msg, verbose=msg_verbose)
-
-
-def get_replication_lag(ch_client):
-    """
-    Get max absolute_delay from system.replicas.
-    """
-
-    tables = get_tables_with_replication_delay(ch_client)
-    chart: Dict[str, Dict[str, Any]] = {}
-    for t in tables:
-        key = "{database}.{table}".format(database=t["database"], table=t["table"])
-        chart[key] = {}
-        chart[key]["delay"] = int(t["absolute_delay"])
-    tables = filter_out_single_replica_tables(ch_client, tables)
-    for t in tables:
-        key = "{database}.{table}".format(database=t["database"], table=t["table"])
-        chart[key]["multi_replicas"] = True
-    tables = count_errors(ch_client, tables, -1)
-
-    max_merges = 0
-    for t in tables:
-        key = "{database}.{table}".format(database=t["database"], table=t["table"])
-        chart[key]["tasks"] = int(t["tasks"])
-        chart[key]["errors"] = int(t["errors"])
-        chart[key]["max_execution"] = int(t["max_execution"])
-        chart[key]["max_execution_part"] = t["max_execution_part"]
-        chart[key]["exceptions"] = t["exceptions"]
-        chart[key]["retried_merges"] = int(t["retried_merges"])
-        max_merges = max(int(t["retried_merges"]), max_merges)
-        for exception in t["exceptions"]:
-            if is_userfault_exception(exception):
-                chart[key]["userfault"] = True
-                break
-
-    lag = 0
-    lag_with_errors = 0
-    max_execution = 0
-    for key, item in chart.items():
-        if item.get("multi_replicas", False):
-            delay = item.get("delay", 0)
-            if delay > lag:
-                lag = delay
-            if (
-                delay > lag_with_errors
-                and item.get("errors", 0) > 0
-                and not item.get("userfault", False)
-            ):
-                lag_with_errors = delay
-            execution = item.get("max_execution", 0)
-            if execution > max_execution:
-                max_execution = execution
-
-    return lag, lag_with_errors, max_execution, max_merges, chart
-
-
-def get_tables_with_replication_delay(ch_client):
-    """
-    Get tables with absolute_delay > 0.
-    """
-    query = "SELECT database, table, zookeeper_path, absolute_delay FROM system.replicas WHERE absolute_delay > 0"
-    return ch_client.execute(query, compact=False)
-
-
-def filter_out_single_replica_tables(ch_client, tables):
-    if not tables:
-        return tables
-
-    query = """
-        SELECT
-            database,
-            table,
-            zookeeper_path
-        FROM system.replicas
-        WHERE (database, table) IN ({tables})
-        AND total_replicas > 1
-        """.format(
-        tables=",".join(
-            "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables
-        )
-    )
-    return ch_client.execute(query, False)
-
-
-def count_errors(ch_client, tables, exceptions_limit):
-    """
-    Add count of replication errors.
-    """
-    if not tables:
-        return tables
-
-    limit = "" if exceptions_limit < 0 else "({})".format(exceptions_limit)
-
-    query = """
-        SELECT
-            database,
-            table,
-            count() as tasks,
-            countIf(last_exception != '' AND postpone_reason = '') as errors,
-            max(IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution,
-            groupUniqArray{limit}(IF(last_exception != '', concat(IF(postpone_reason = '', '     ', '<pr> '), last_exception), '')) as exceptions,
-            argMax(new_part_name, IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution_part,
-            countIf(type = 'MERGE_PARTS' and num_tries >= 1000) as retried_merges
-        FROM system.replication_queue
-        WHERE (database, table) IN ({tables})
-        GROUP BY database,table
-        """.format(
-        tables=",".join(
-            "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables
-        ),
-        limit=limit,
-    )
-    return ch_client.execute(query, False)
-
-
-def is_userfault_exception(exception):
-    """
-    Check if exception was caused by user.
-    Current list:
-      * DB::Exception: Cannot reserve 1.00 MiB, not enough space
-      * DB::Exception: Incorrect data: Sign = -127 (must be 1 or -1)
-    """
-
-    if "DB::Exception: Cannot reserve" in exception and "not enough space" in exception:
-        return True
-    if (
-        "DB::Exception: Incorrect data: Sign" in exception
-        and "(must be 1 or -1)" in exception
-    ):
-        return True
-
-    return False
-
-
-def get_max_replicated_merges_in_queue(ch_client):
-    """
-    Get max_replicated_merges_in_queue value
-    """
-    query = """
-        SELECT value FROM system.merge_tree_settings WHERE name='max_replicated_merges_in_queue'
-    """
-    res = ch_client.execute(query, True)
-    if not res:
-        return (
-            16  # 16 is default value for 'max_replicated_merges_in_queue' in ClickHouse
-        )
-    return int(res[0][0])
diff --git a/ch_tools/monrun_checks/main.py b/ch_tools/monrun_checks/main.py
index fb8b396e..423db95c 100644
--- a/ch_tools/monrun_checks/main.py
+++ b/ch_tools/monrun_checks/main.py
@@ -28,7 +28,6 @@
 from ch_tools.monrun_checks.ch_log_errors import log_errors_command
 from ch_tools.monrun_checks.ch_ping import ping_command
 from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command
-from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command
 from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command
 from ch_tools.monrun_checks.ch_ro_replica import ro_replica_command
 from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command
@@ -136,7 +135,6 @@ def cli(ctx, ensure_monitoring_user):
     ping_command,
     log_errors_command,
     replication_lag_command,
-    wait_replication_sync_command,
     system_queues_command,
     core_dumps_command,
     dist_tables_command,

From b6c943bc6a74340e15f826db7f9bd9686e13f48c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Wed, 24 Jan 2024 13:29:03 +0300
Subject: [PATCH 5/9] Formatting

---
 ch_tools/chadmin/chadmin_cli.py               |  4 +++-
 .../cli/wait_replication_sync_command.py      |  4 ++--
 ch_tools/common/replication_lag.py            | 21 ++++++++++---------
 ch_tools/monrun_checks/ch_replication_lag.py  |  2 +-
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/ch_tools/chadmin/chadmin_cli.py b/ch_tools/chadmin/chadmin_cli.py
index 2cd275eb..def8fa66 100755
--- a/ch_tools/chadmin/chadmin_cli.py
+++ b/ch_tools/chadmin/chadmin_cli.py
@@ -42,7 +42,9 @@
 from ch_tools.chadmin.cli.table_group import table_group
 from ch_tools.chadmin.cli.table_replica_group import table_replica_group
 from ch_tools.chadmin.cli.thread_log_group import thread_log_group
-from ch_tools.chadmin.cli.wait_replication_sync_command import wait_replication_sync_command
+from ch_tools.chadmin.cli.wait_replication_sync_command import (
+    wait_replication_sync_command,
+)
 from ch_tools.chadmin.cli.wait_started_command import wait_started_command
 from ch_tools.chadmin.cli.zookeeper_group import zookeeper_group
 from ch_tools.common.cli.context_settings import CONTEXT_SETTINGS
diff --git a/ch_tools/chadmin/cli/wait_replication_sync_command.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py
index 4cdee1ad..8e85f436 100644
--- a/ch_tools/chadmin/cli/wait_replication_sync_command.py
+++ b/ch_tools/chadmin/cli/wait_replication_sync_command.py
@@ -1,11 +1,11 @@
 import logging
-import time
 import sys
+import time
 
 from click import command, option, pass_context
 
-from ch_tools.common.replication_lag import estimate_replication_lag
 from ch_tools.common.cli.parameters import TimeSpanParamType
+from ch_tools.common.replication_lag import estimate_replication_lag
 
 
 @command("wait-replication-sync")
diff --git a/ch_tools/common/replication_lag.py b/ch_tools/common/replication_lag.py
index c230c520..9bac5b85 100644
--- a/ch_tools/common/replication_lag.py
+++ b/ch_tools/common/replication_lag.py
@@ -2,19 +2,20 @@
 
 from tabulate import tabulate
 
-from ch_tools.common.result import Result
 from ch_tools.common.clickhouse.client.clickhouse_client import clickhouse_client
+from ch_tools.common.result import Result
 
-
-XCRIT=3600
-CRIT=600
-WARN=300
-MWARN=50.0
-MCRIT=90.0
-VERBOSE=0
+XCRIT = 3600
+CRIT = 600
+WARN = 300
+MWARN = 50.0
+MCRIT = 90.0
+VERBOSE = 0
 
 
-def estimate_replication_lag(ctx, xcrit=XCRIT, crit=CRIT, warn=WARN, mwarn=MWARN, mcrit=MCRIT, verbose=VERBOSE):
+def estimate_replication_lag(
+    ctx, xcrit=XCRIT, crit=CRIT, warn=WARN, mwarn=MWARN, mcrit=MCRIT, verbose=VERBOSE
+):
     """
     Check for replication lag across replicas.
     Should be: lag >= lag_with_errors, lag >= max_execution
@@ -272,4 +273,4 @@ def get_max_replicated_merges_in_queue(ch_client):
         return (
             16  # 16 is default value for 'max_replicated_merges_in_queue' in ClickHouse
         )
-    return int(res[0][0])
\ No newline at end of file
+    return int(res[0][0])
diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py
index 53558363..2439cfc9 100644
--- a/ch_tools/monrun_checks/ch_replication_lag.py
+++ b/ch_tools/monrun_checks/ch_replication_lag.py
@@ -1,7 +1,7 @@
-import logging
 from typing import Any, Dict
 
 import click
+
 from ch_tools.common.replication_lag import estimate_replication_lag
 
 

From 91b5921f38563dcf5aa209daf2b1020817726b9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Wed, 24 Jan 2024 13:56:44 +0300
Subject: [PATCH 6/9] Wait commands group

---
 ch_tools/chadmin/chadmin_cli.py               |  8 +--
 ...{wait_started_command.py => wait_group.py} | 49 ++++++++++++++++++-
 .../cli/wait_replication_sync_command.py      | 45 -----------------
 .../common/{ => commands}/replication_lag.py  |  0
 ch_tools/monrun_checks/ch_replication_lag.py  |  2 +-
 tests/features/chadmin.feature                |  2 +-
 tests/features/monrun.feature                 |  5 +-
 7 files changed, 52 insertions(+), 59 deletions(-)
 rename ch_tools/chadmin/cli/{wait_started_command.py => wait_group.py} (66%)
 delete mode 100644 ch_tools/chadmin/cli/wait_replication_sync_command.py
 rename ch_tools/common/{ => commands}/replication_lag.py (100%)

diff --git a/ch_tools/chadmin/chadmin_cli.py b/ch_tools/chadmin/chadmin_cli.py
index def8fa66..f7967ddb 100755
--- a/ch_tools/chadmin/chadmin_cli.py
+++ b/ch_tools/chadmin/chadmin_cli.py
@@ -42,10 +42,7 @@
 from ch_tools.chadmin.cli.table_group import table_group
 from ch_tools.chadmin.cli.table_replica_group import table_replica_group
 from ch_tools.chadmin.cli.thread_log_group import thread_log_group
-from ch_tools.chadmin.cli.wait_replication_sync_command import (
-    wait_replication_sync_command,
-)
-from ch_tools.chadmin.cli.wait_started_command import wait_started_command
+from ch_tools.chadmin.cli.wait_group import wait_group
 from ch_tools.chadmin.cli.zookeeper_group import zookeeper_group
 from ch_tools.common.cli.context_settings import CONTEXT_SETTINGS
 from ch_tools.common.cli.locale_resolver import LocaleResolver
@@ -119,8 +116,6 @@ def cli(ctx, format_, settings, timeout, port, debug):
     list_settings_command,
     restore_replica_command,
     stack_trace_command,
-    wait_replication_sync_command,
-    wait_started_command,
 ]
 
 groups: List[Any] = [
@@ -143,6 +138,7 @@ def cli(ctx, format_, settings, timeout, port, debug):
     table_group,
     table_replica_group,
     thread_log_group,
+    wait_group,
     zookeeper_group,
 ]
 
diff --git a/ch_tools/chadmin/cli/wait_started_command.py b/ch_tools/chadmin/cli/wait_group.py
similarity index 66%
rename from ch_tools/chadmin/cli/wait_started_command.py
rename to ch_tools/chadmin/cli/wait_group.py
index 335e7ae4..c1088b54 100644
--- a/ch_tools/chadmin/cli/wait_started_command.py
+++ b/ch_tools/chadmin/cli/wait_group.py
@@ -3,9 +3,11 @@
 import sys
 import time
 
-from click import command, option, pass_context
+from click import group, option, pass_context
 
 from ch_tools.chadmin.internal.utils import execute_query
+from ch_tools.common.cli.parameters import TimeSpanParamType
+from ch_tools.common.commands.replication_lag import estimate_replication_lag
 from ch_tools.common.utils import execute
 
 BASE_TIMEOUT = 600
@@ -13,7 +15,50 @@
 S3_PART_LOAD_SPEED = 0.5  # in data parts per second
 
 
-@command("wait-started")
+@group("wait")
+def wait_group():
+    """Commands to wait until ClickHouse is in a certain state."""
+    pass
+
+
+@wait_group.command("replication-sync")
+@option(
+    "-s",
+    "--status",
+    type=int,
+    default=0,
+    help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.",
+)
+@option(
+    "-p",
+    "--pause",
+    type=TimeSpanParamType(),
+    default="30s",
+    help="Pause between requests.",
+)
+@option(
+    "-t",
+    "--timeout",
+    type=TimeSpanParamType(),
+    default="3d",
+    help="Max amount of time to wait.",
+)
+@pass_context
+def wait_replication_sync_command(ctx, status, pause, timeout):
+    """Wait for ClickHouse server to sync replication with other replicas."""
+
+    deadline = time.time() + timeout.total_seconds()
+    while time.time() < deadline:
+        res = estimate_replication_lag(ctx)
+        if res.code <= status:
+            sys.exit(0)
+        time.sleep(pause.total_seconds())
+
+    logging.error(f"ClickHouse can't sync replica.")
+    sys.exit(1)
+
+
+@wait_group.command("started")
 @option(
     "--timeout",
     type=int,
diff --git a/ch_tools/chadmin/cli/wait_replication_sync_command.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py
deleted file mode 100644
index 8e85f436..00000000
--- a/ch_tools/chadmin/cli/wait_replication_sync_command.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import logging
-import sys
-import time
-
-from click import command, option, pass_context
-
-from ch_tools.common.cli.parameters import TimeSpanParamType
-from ch_tools.common.replication_lag import estimate_replication_lag
-
-
-@command("wait-replication-sync")
-@option(
-    "-s",
-    "--status",
-    type=int,
-    default=0,
-    help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.",
-)
-@option(
-    "-p",
-    "--pause",
-    type=TimeSpanParamType(),
-    default="30s",
-    help="Pause between requests.",
-)
-@option(
-    "-t",
-    "--timeout",
-    type=TimeSpanParamType(),
-    default="3d",
-    help="Max amount of time to wait.",
-)
-@pass_context
-def wait_replication_sync_command(ctx, status, pause, timeout):
-    """Wait for ClickHouse server to sync replication with other replicas."""
-
-    deadline = time.time() + timeout.total_seconds()
-    while time.time() < deadline:
-        res = estimate_replication_lag(ctx)
-        if res.code <= status:
-            sys.exit(0)
-        time.sleep(pause.total_seconds())
-
-    logging.error(f"ClickHouse can't sync replica.")
-    sys.exit(1)
diff --git a/ch_tools/common/replication_lag.py b/ch_tools/common/commands/replication_lag.py
similarity index 100%
rename from ch_tools/common/replication_lag.py
rename to ch_tools/common/commands/replication_lag.py
diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py
index 2439cfc9..396b3e93 100644
--- a/ch_tools/monrun_checks/ch_replication_lag.py
+++ b/ch_tools/monrun_checks/ch_replication_lag.py
@@ -2,7 +2,7 @@
 
 import click
 
-from ch_tools.common.replication_lag import estimate_replication_lag
+from ch_tools.common.commands.replication_lag import estimate_replication_lag
 
 
 @click.command("replication-lag")
diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature
index 175eaa5a..ec88e430 100644
--- a/tests/features/chadmin.feature
+++ b/tests/features/chadmin.feature
@@ -11,5 +11,5 @@ Feature: chadmin commands.
   Scenario: Check wait replication sync
     When we execute command on clickhouse01
     """
-    chadmin wait-replication-sync -t 10 -p 1
+    chadmin wait replication-sync -t 10 -p 1
     """
diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature
index 7d50b97d..da2e359a 100644
--- a/tests/features/monrun.feature
+++ b/tests/features/monrun.feature
@@ -147,10 +147,7 @@ Feature: ch-monitoring tool
     """
     INSERT INTO test.table_01 SELECT number FROM numbers(100)
     """
-    And we execute command on clickhouse02
-    """
-    sleep 5
-    """
+    And we sleep for 5 seconds
     And we execute command on clickhouse01
     """
     ch-monitoring replication-lag -w 4

From 1274de848c6f15a7cb0ed308c183e1716b365285 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Wed, 24 Jan 2024 14:04:41 +0300
Subject: [PATCH 7/9] Lint

---
 ch_tools/chadmin/cli/wait_group.py           | 2 +-
 ch_tools/monrun_checks/ch_replication_lag.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/ch_tools/chadmin/cli/wait_group.py b/ch_tools/chadmin/cli/wait_group.py
index c1088b54..25f17fbb 100644
--- a/ch_tools/chadmin/cli/wait_group.py
+++ b/ch_tools/chadmin/cli/wait_group.py
@@ -54,7 +54,7 @@ def wait_replication_sync_command(ctx, status, pause, timeout):
             sys.exit(0)
         time.sleep(pause.total_seconds())
 
-    logging.error(f"ClickHouse can't sync replica.")
+    logging.error("ClickHouse can't sync replicas.")
     sys.exit(1)
 
 
diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py
index 396b3e93..1cd401e4 100644
--- a/ch_tools/monrun_checks/ch_replication_lag.py
+++ b/ch_tools/monrun_checks/ch_replication_lag.py
@@ -1,5 +1,3 @@
-from typing import Any, Dict
-
 import click
 
 from ch_tools.common.commands.replication_lag import estimate_replication_lag

From fc5f9c0e605aff5a75264ea0f1bbdc2f687e4098 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Wed, 24 Jan 2024 23:00:18 +0300
Subject: [PATCH 8/9] Pass replication-lag args

---
 ch_tools/chadmin/cli/wait_group.py | 54 +++++++++++++++++++++++++++---
 tests/features/chadmin.feature     | 31 ++++++++++++++++-
 2 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/ch_tools/chadmin/cli/wait_group.py b/ch_tools/chadmin/cli/wait_group.py
index 25f17fbb..f979fe81 100644
--- a/ch_tools/chadmin/cli/wait_group.py
+++ b/ch_tools/chadmin/cli/wait_group.py
@@ -3,7 +3,7 @@
 import sys
 import time
 
-from click import group, option, pass_context
+from click import FloatRange, group, option, pass_context
 
 from ch_tools.chadmin.internal.utils import execute_query
 from ch_tools.common.cli.parameters import TimeSpanParamType
@@ -27,7 +27,7 @@ def wait_group():
     "--status",
     type=int,
     default=0,
-    help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.",
+    help="Wait until replication-lag returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.",
 )
 @option(
     "-p",
@@ -43,13 +43,57 @@ def wait_group():
     default="3d",
     help="Max amount of time to wait.",
 )
+@option(
+    "-x",
+    "--exec-critical",
+    "xcrit",
+    type=int,
+    default=3600,
+    help="Critical threshold for one task execution.",
+)
+@option(
+    "-c",
+    "--critical",
+    "crit",
+    type=int,
+    default=600,
+    help="Critical threshold for lag with errors.",
+)
+@option("-w", "--warning", "warn", type=int, default=300, help="Warning threshold.")
+@option(
+    "-M",
+    "--merges-critical",
+    "mcrit",
+    type=FloatRange(0.0, 100.0),
+    default=90.0,
+    help="Critical threshold in percent of max_replicated_merges_in_queue.",
+)
+@option(
+    "-m",
+    "--merges-warning",
+    "mwarn",
+    type=FloatRange(0.0, 100.0),
+    default=50.0,
+    help="Warning threshold in percent of max_replicated_merges_in_queue.",
+)
+@option(
+    "-v",
+    "--verbose",
+    "verbose",
+    type=int,
+    count=True,
+    default=0,
+    help="Show details about lag.",
+)
 @pass_context
-def wait_replication_sync_command(ctx, status, pause, timeout):
-    """Wait for ClickHouse server to sync replication with other replicas."""
+def wait_replication_sync_command(
+    ctx, status, pause, timeout, xcrit, crit, warn, mwarn, mcrit, verbose
+):
+    """Wait for ClickHouse server to sync replication with other replicas using the replication-lag command."""
 
     deadline = time.time() + timeout.total_seconds()
     while time.time() < deadline:
-        res = estimate_replication_lag(ctx)
+        res = estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose)
         if res.code <= status:
             sys.exit(0)
         time.sleep(pause.total_seconds())
diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature
index ec88e430..abcc739f 100644
--- a/tests/features/chadmin.feature
+++ b/tests/features/chadmin.feature
@@ -6,10 +6,39 @@ Feature: chadmin commands.
     And a working zookeeper
     And a working clickhouse on clickhouse01
     And a working clickhouse on clickhouse02
+    Given we have executed queries on clickhouse01
+    """
+    CREATE DATABASE IF NOT EXISTS test ON CLUSTER 'cluster';
+
+    CREATE TABLE IF NOT EXISTS test.table_01 ON CLUSTER 'cluster' (n Int32)
+    ENGINE = ReplicatedMergeTree('/tables/table_01', '{replica}') PARTITION BY n ORDER BY n;
+    """
 
 
   Scenario: Check wait replication sync
     When we execute command on clickhouse01
     """
-    chadmin wait replication-sync -t 10 -p 1
+    chadmin wait replication-sync -t 10 -p 1 -w 4
+    """
+    When we execute query on clickhouse01
+    """
+    SYSTEM STOP FETCHES
+    """
+    And we execute query on clickhouse02
+    """
+    INSERT INTO test.table_01 SELECT number FROM numbers(100)
+    """
+    And we sleep for 5 seconds
+    When we try to execute command on clickhouse01
+    """
+    chadmin wait replication-sync -t 10 -p 1 -w 4
+    """
+    Then it fails
+    When we execute query on clickhouse01
+    """
+    SYSTEM START FETCHES
+    """
+    When we execute command on clickhouse01
+    """
+    chadmin wait replication-sync -t 10 -p 1 -w 4
     """

From 1970cb0f153b31aeed6e0aeb7a52fcb22718e285 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?=
 =?UTF-8?q?=D0=B1=D0=B0=D1=80?= <st087492@student.spbu.ru>
Date: Wed, 24 Jan 2024 23:00:33 +0300
Subject: [PATCH 9/9] Pin requests version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0b493b90..21145ad0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,7 +95,7 @@ pygments = "*"
 pyopenssl = "*"
 python-dateutil = "*"
 pyyaml = "<5.4"
-requests = "*"
+requests = "<2.30.0"
 tabulate = "*"
 tenacity = "*"
 termcolor = "*"