From 700802bebe580467417cef2888d88e994117acb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Tue, 23 Jan 2024 12:58:58 +0300 Subject: [PATCH 1/9] Wait replication sync command --- ch_tools/monrun_checks/ch_replication_lag.py | 4 ++ ch_tools/monrun_checks/ch_replication_sync.py | 42 +++++++++++++++++++ ch_tools/monrun_checks/main.py | 2 + tests/features/monrun.feature | 10 +++++ 4 files changed, 58 insertions(+) create mode 100644 ch_tools/monrun_checks/ch_replication_sync.py diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py index 761ef1d8..b4583100 100644 --- a/ch_tools/monrun_checks/ch_replication_lag.py +++ b/ch_tools/monrun_checks/ch_replication_lag.py @@ -56,6 +56,10 @@ ) @click.pass_context def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose): + return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose) + + +def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0): """ Check for replication lag across replicas. Should be: lag >= lag_with_errors, lag >= max_execution diff --git a/ch_tools/monrun_checks/ch_replication_sync.py b/ch_tools/monrun_checks/ch_replication_sync.py new file mode 100644 index 00000000..28f8ccab --- /dev/null +++ b/ch_tools/monrun_checks/ch_replication_sync.py @@ -0,0 +1,42 @@ +import time + +from click import command, option, pass_context + +from ch_tools.common.result import Result +from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag + + +@command("wait-replication-sync") +@option( + "-s", + "--status", + type=int, + default=0, + help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.", +) +@option( + "-p", + "--pause", + type=int, + default=30, + help="Pause between request in seconds, default is 30 seconds.", +) +@option( + "-t", + "--timeout", + type=int, + default=3 * 24 * 60 * 60, + help="Max amount of time to wait, in seconds. Default is 30 days.", +) +@pass_context +def wait_replication_sync_command(ctx, status, pause, timeout): + """Wait for ClickHouse server to sync replication with other replicas.""" + + deadline = time.time() + timeout + while time.time() < deadline: + res = estimate_replication_lag(ctx) + if res.code <= status: + return Result(code=0) + time.sleep(pause) + + return Result(code=2, message=f"ClickHouse can\'t sync replica for {timeout} seconds") diff --git a/ch_tools/monrun_checks/main.py b/ch_tools/monrun_checks/main.py index 423db95c..fb8b396e 100644 --- a/ch_tools/monrun_checks/main.py +++ b/ch_tools/monrun_checks/main.py @@ -28,6 +28,7 @@ from ch_tools.monrun_checks.ch_log_errors import log_errors_command from ch_tools.monrun_checks.ch_ping import ping_command from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command +from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command from ch_tools.monrun_checks.ch_ro_replica import ro_replica_command from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command @@ -135,6 +136,7 @@ def cli(ctx, ensure_monitoring_user): ping_command, log_errors_command, replication_lag_command, + wait_replication_sync_command, system_queues_command, core_dumps_command, dist_tables_command, diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature index 4006986c..b0718c21 100644 --- a/tests/features/monrun.feature +++ b/tests/features/monrun.feature @@ -371,3 +371,13 @@ Feature: ch-monitoring tool """ 2;KazooTimeoutError('Connection time-out') """ + + Scenario: Check wait replication sync + When we execute command on clickhouse01 + """ + ch-monitoring wait-replication-sync -t 10 -p 1 + """ + Then we get response + """ + 0;OK + """ From 9b42451ef3cb508bda358b7df303fd3e62cad4af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Tue, 23 Jan 2024 14:10:50 +0300 Subject: [PATCH 2/9] Test for replication lag --- tests/features/monrun.feature | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature index b0718c21..243c1d02 100644 --- a/tests/features/monrun.feature +++ b/tests/features/monrun.feature @@ -139,6 +139,26 @@ Feature: ch-monitoring tool """ 0;OK """ + When we execute query on clickhouse01 + """ + SYSTEM STOP FETCHES + """ + And we execute query on clickhouse02 + """ + INSERT INTO test.table_01 SELECT number FROM numbers(100) + """ + And we execute command on clickhouse02 + """ + sleep 5 + """ + And we execute command on clickhouse01 + """ + ch-monitoring replication-lag -w 4 + """ + Then we get response contains + """ + 1; + """ Scenario: Check System queues size When we execute command on clickhouse01 From 456b693f7fb4318908091a78ab3c39ddb143e5db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Tue, 23 Jan 2024 19:17:50 +0300 Subject: [PATCH 3/9] Move wait replication to chadmin --- .../cli/wait_replication_sync_command.py} | 25 ++++++++++--------- tests/features/chadmin.feature | 15 +++++++++++ tests/features/monrun.feature | 10 -------- 3 files changed, 28 insertions(+), 22 deletions(-) rename ch_tools/{monrun_checks/ch_replication_sync.py => chadmin/cli/wait_replication_sync_command.py} (57%) create mode 100644 tests/features/chadmin.feature diff --git a/ch_tools/monrun_checks/ch_replication_sync.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py similarity index 57% rename from ch_tools/monrun_checks/ch_replication_sync.py rename to ch_tools/chadmin/cli/wait_replication_sync_command.py index 28f8ccab..81cf1ff8 100644 --- a/ch_tools/monrun_checks/ch_replication_sync.py +++ b/ch_tools/chadmin/cli/wait_replication_sync_command.py @@ -1,9 +1,10 @@ import time +import sys from click import command, option, pass_context -from ch_tools.common.result import Result from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag +from ch_tools.common.cli.parameters import TimeSpanParamType @command("wait-replication-sync") @@ -12,31 +13,31 @@ "--status", type=int, default=0, - help="Wait until returned status is no worse than given, 0 = OK (default), 1 = WARN, 2 = CRIT.", + help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.", ) @option( "-p", "--pause", - type=int, - default=30, - help="Pause between request in seconds, default is 30 seconds.", + type=TimeSpanParamType(), + default="30s", + help="Pause between requests.", ) @option( "-t", "--timeout", - type=int, - default=3 * 24 * 60 * 60, - help="Max amount of time to wait, in seconds. Default is 30 days.", + type=TimeSpanParamType(), + default="3d", + help="Max amount of time to wait.", ) @pass_context def wait_replication_sync_command(ctx, status, pause, timeout): """Wait for ClickHouse server to sync replication with other replicas.""" - deadline = time.time() + timeout + deadline = time.time() + timeout.total_seconds() while time.time() < deadline: res = estimate_replication_lag(ctx) if res.code <= status: - return Result(code=0) - time.sleep(pause) + sys.exit(0) + time.sleep(pause.total_seconds()) - return Result(code=2, message=f"ClickHouse can\'t sync replica for {timeout} seconds") + sys.exit(1) diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature new file mode 100644 index 00000000..175eaa5a --- /dev/null +++ b/tests/features/chadmin.feature @@ -0,0 +1,15 @@ +Feature: chadmin commands. + + Background: + Given default configuration + And a working s3 + And a working zookeeper + And a working clickhouse on clickhouse01 + And a working clickhouse on clickhouse02 + + + Scenario: Check wait replication sync + When we execute command on clickhouse01 + """ + chadmin wait-replication-sync -t 10 -p 1 + """ diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature index 243c1d02..7d50b97d 100644 --- a/tests/features/monrun.feature +++ b/tests/features/monrun.feature @@ -391,13 +391,3 @@ Feature: ch-monitoring tool """ 2;KazooTimeoutError('Connection time-out') """ - - Scenario: Check wait replication sync - When we execute command on clickhouse01 - """ - ch-monitoring wait-replication-sync -t 10 -p 1 - """ - Then we get response - """ - 0;OK - """ From 43db5e69d449f1b5955e15fb4d5b93aeb428fcc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Wed, 24 Jan 2024 13:14:18 +0300 Subject: [PATCH 4/9] Move replication lag to common --- ch_tools/chadmin/chadmin_cli.py | 2 + .../cli/wait_replication_sync_command.py | 4 +- ch_tools/common/replication_lag.py | 275 +++++++++++++++++ ch_tools/monrun_checks/ch_replication_lag.py | 277 +----------------- ch_tools/monrun_checks/main.py | 2 - 5 files changed, 281 insertions(+), 279 deletions(-) create mode 100644 ch_tools/common/replication_lag.py diff --git a/ch_tools/chadmin/chadmin_cli.py b/ch_tools/chadmin/chadmin_cli.py index 1c8fdbef..2cd275eb 100755 --- a/ch_tools/chadmin/chadmin_cli.py +++ b/ch_tools/chadmin/chadmin_cli.py @@ -42,6 +42,7 @@ from ch_tools.chadmin.cli.table_group import table_group from ch_tools.chadmin.cli.table_replica_group import table_replica_group from ch_tools.chadmin.cli.thread_log_group import thread_log_group +from ch_tools.chadmin.cli.wait_replication_sync_command import wait_replication_sync_command from ch_tools.chadmin.cli.wait_started_command import wait_started_command from ch_tools.chadmin.cli.zookeeper_group import zookeeper_group from ch_tools.common.cli.context_settings import CONTEXT_SETTINGS @@ -116,6 +117,7 @@ def cli(ctx, format_, settings, timeout, port, debug): list_settings_command, restore_replica_command, stack_trace_command, + wait_replication_sync_command, wait_started_command, ] diff --git a/ch_tools/chadmin/cli/wait_replication_sync_command.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py index 81cf1ff8..4cdee1ad 100644 --- a/ch_tools/chadmin/cli/wait_replication_sync_command.py +++ b/ch_tools/chadmin/cli/wait_replication_sync_command.py @@ -1,9 +1,10 @@ +import logging import time import sys from click import command, option, pass_context -from ch_tools.monrun_checks.ch_replication_lag import estimate_replication_lag +from ch_tools.common.replication_lag import estimate_replication_lag from ch_tools.common.cli.parameters import TimeSpanParamType @@ -40,4 +41,5 @@ def wait_replication_sync_command(ctx, status, pause, timeout): sys.exit(0) time.sleep(pause.total_seconds()) + logging.error(f"ClickHouse can't sync replica.") sys.exit(1) diff --git a/ch_tools/common/replication_lag.py b/ch_tools/common/replication_lag.py new file mode 100644 index 00000000..c230c520 --- /dev/null +++ b/ch_tools/common/replication_lag.py @@ -0,0 +1,275 @@ +from typing import Any, Dict + +from tabulate import tabulate + +from ch_tools.common.result import Result +from ch_tools.common.clickhouse.client.clickhouse_client import clickhouse_client + + +XCRIT=3600 +CRIT=600 +WARN=300 +MWARN=50.0 +MCRIT=90.0 +VERBOSE=0 + + +def estimate_replication_lag(ctx, xcrit=XCRIT, crit=CRIT, warn=WARN, mwarn=MWARN, mcrit=MCRIT, verbose=VERBOSE): + """ + Check for replication lag across replicas. + Should be: lag >= lag_with_errors, lag >= max_execution + """ + # pylint: disable=too-many-branches,too-many-locals + ch_client = clickhouse_client(ctx) + lag, lag_with_errors, max_execution, max_merges, chart = get_replication_lag( + ch_client + ) + + msg_verbose = "" + msg_verbose_2 = "\n\n" + + if verbose >= 1: + verbtab = [] + + headers = [ + "Table", + "Lag [s]", + "Tasks", + "Max task execution [s]", + "Non-retrayable errors", + "Has user fault errors", + "Merges with 1000+ tries", + ] + for key, item in chart.items(): + if item.get("multi_replicas", False): + tabletab = [ + key, + item.get("delay", 0), + item.get("tasks", 0), + item.get("max_execution", 0), + item.get("errors", 0), + item.get("user_fault", False), + item.get("retried_merges", 0), + ] + verbtab.append(tabletab) + if verbose >= 2: + exceptions_retrayable = "" + exceptions_non_retrayable = "" + exceptions_ignored = "" + for exception in item.get("exceptions", []): + if exception: + if is_userfault_exception(exception): + exceptions_ignored += "\t" + exception[5:] + "\n" + elif exception.startswith(" "): + exceptions_retrayable += "\t" + exception[5:] + "\n" + else: + exceptions_non_retrayable += "\t" + exception[5:] + "\n" + max_execution_part = ( + item.get("max_execution_part", "") + if item.get("max_execution", 0) + else 0 + ) + if ( + exceptions_retrayable + or exceptions_non_retrayable + or exceptions_ignored + or max_execution_part + ): + msg_verbose_2 = msg_verbose_2 + key + ":\n" + if exceptions_non_retrayable: + msg_verbose_2 = ( + msg_verbose_2 + + " Non-retrayable errors:\n" + + exceptions_non_retrayable + ) + if exceptions_retrayable: + msg_verbose_2 = ( + msg_verbose_2 + + " Retrayable errors:\n" + + exceptions_retrayable + ) + if exceptions_ignored: + msg_verbose_2 = ( + msg_verbose_2 + + " User fault errors:\n" + + exceptions_ignored + ) + if max_execution_part: + msg_verbose_2 = ( + msg_verbose_2 + + " Result part of task with max execution time: " + + max_execution_part + + "\n" + ) + msg_verbose = tabulate(verbtab, headers=headers) + if verbose >= 2: + msg_verbose = msg_verbose + msg_verbose_2 + + max_merges_warn_threshold = 1 + max_merges_crit_threshold = 1 + if max_merges > 0: + max_replicated_merges_in_queue = get_max_replicated_merges_in_queue(ch_client) + max_merges_warn_threshold = int(max_replicated_merges_in_queue * mwarn / 100.0) + max_merges_crit_threshold = int(max_replicated_merges_in_queue * mcrit / 100.0) + + if lag < warn and max_merges < max_merges_warn_threshold: + return Result(code=0, message="OK", verbose=msg_verbose) + + msg = "Max {0} seconds, with errors {1} seconds, max task execution {2} seconds, max merges in queue {3}".format( + lag, lag_with_errors, max_execution, max_merges + ) + + if ( + lag_with_errors < crit + and max_execution < xcrit + and max_merges < max_merges_crit_threshold + ): + return Result(code=1, message=msg, verbose=msg_verbose) + + return Result(code=2, message=msg, verbose=msg_verbose) + + +def get_replication_lag(ch_client): + """ + Get max absolute_delay from system.replicas. + """ + + tables = get_tables_with_replication_delay(ch_client) + chart: Dict[str, Dict[str, Any]] = {} + for t in tables: + key = "{database}.{table}".format(database=t["database"], table=t["table"]) + chart[key] = {} + chart[key]["delay"] = int(t["absolute_delay"]) + tables = filter_out_single_replica_tables(ch_client, tables) + for t in tables: + key = "{database}.{table}".format(database=t["database"], table=t["table"]) + chart[key]["multi_replicas"] = True + tables = count_errors(ch_client, tables, -1) + + max_merges = 0 + for t in tables: + key = "{database}.{table}".format(database=t["database"], table=t["table"]) + chart[key]["tasks"] = int(t["tasks"]) + chart[key]["errors"] = int(t["errors"]) + chart[key]["max_execution"] = int(t["max_execution"]) + chart[key]["max_execution_part"] = t["max_execution_part"] + chart[key]["exceptions"] = t["exceptions"] + chart[key]["retried_merges"] = int(t["retried_merges"]) + max_merges = max(int(t["retried_merges"]), max_merges) + for exception in t["exceptions"]: + if is_userfault_exception(exception): + chart[key]["userfault"] = True + break + + lag = 0 + lag_with_errors = 0 + max_execution = 0 + for key, item in chart.items(): + if item.get("multi_replicas", False): + delay = item.get("delay", 0) + if delay > lag: + lag = delay + if ( + delay > lag_with_errors + and item.get("errors", 0) > 0 + and not item.get("userfault", False) + ): + lag_with_errors = delay + execution = item.get("max_execution", 0) + if execution > max_execution: + max_execution = execution + + return lag, lag_with_errors, max_execution, max_merges, chart + + +def get_tables_with_replication_delay(ch_client): + """ + Get tables with absolute_delay > 0. + """ + query = "SELECT database, table, zookeeper_path, absolute_delay FROM system.replicas WHERE absolute_delay > 0" + return ch_client.query(query=query, format_="JSON")["data"] + + +def filter_out_single_replica_tables(ch_client, tables): + if not tables: + return tables + + query = """ + SELECT + database, + table, + zookeeper_path + FROM system.replicas + WHERE (database, table) IN ({tables}) + AND total_replicas > 1 + """.format( + tables=",".join( + "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables + ) + ) + return ch_client.query(query=query, format_="JSON")["data"] + + +def count_errors(ch_client, tables, exceptions_limit): + """ + Add count of replication errors. + """ + if not tables: + return tables + + limit = "" if exceptions_limit < 0 else "({})".format(exceptions_limit) + + query = """ + SELECT + database, + table, + count() as tasks, + countIf(last_exception != '' AND postpone_reason = '') as errors, + max(IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution, + groupUniqArray{limit}(IF(last_exception != '', concat(IF(postpone_reason = '', ' ', ' '), last_exception), '')) as exceptions, + argMax(new_part_name, IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution_part, + countIf(type = 'MERGE_PARTS' and num_tries >= 1000) as retried_merges + FROM system.replication_queue + WHERE (database, table) IN ({tables}) + GROUP BY database,table + """.format( + tables=",".join( + "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables + ), + limit=limit, + ) + return ch_client.query(query=query, format_="JSON")["data"] + + +def is_userfault_exception(exception): + """ + Check if exception was caused by user. + Current list: + * DB::Exception: Cannot reserve 1.00 MiB, not enough space + * DB::Exception: Incorrect data: Sign = -127 (must be 1 or -1) + """ + + if "DB::Exception: Cannot reserve" in exception and "not enough space" in exception: + return True + if ( + "DB::Exception: Incorrect data: Sign" in exception + and "(must be 1 or -1)" in exception + ): + return True + + return False + + +def get_max_replicated_merges_in_queue(ch_client): + """ + Get max_replicated_merges_in_queue value + """ + query = """ + SELECT value FROM system.merge_tree_settings WHERE name='max_replicated_merges_in_queue' + """ + res = ch_client.query(query=query, format_="JSONCompact")["data"] + if not res: + return ( + 16 # 16 is default value for 'max_replicated_merges_in_queue' in ClickHouse + ) + return int(res[0][0]) \ No newline at end of file diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py index b4583100..53558363 100644 --- a/ch_tools/monrun_checks/ch_replication_lag.py +++ b/ch_tools/monrun_checks/ch_replication_lag.py @@ -2,11 +2,7 @@ from typing import Any, Dict import click -from tabulate import tabulate - -from ch_tools.common.result import Result -from ch_tools.monrun_checks.clickhouse_client import ClickhouseClient -from ch_tools.monrun_checks.clickhouse_info import ClickhouseInfo +from ch_tools.common.replication_lag import estimate_replication_lag @click.command("replication-lag") @@ -57,274 +53,3 @@ @click.pass_context def replication_lag_command(ctx, xcrit, crit, warn, mwarn, mcrit, verbose): return estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose) - - -def estimate_replication_lag(ctx, xcrit=3600, crit=6000, warn=300, mwarn=50.0, mcrit=90.0, verbose=0): - """ - Check for replication lag across replicas. - Should be: lag >= lag_with_errors, lag >= max_execution - """ - # pylint: disable=too-many-branches,too-many-locals - ch_client = ClickhouseClient(ctx) - lag, lag_with_errors, max_execution, max_merges, chart = get_replication_lag( - ch_client - ) - - msg_verbose = "" - msg_verbose_2 = "\n\n" - - if verbose >= 1: - verbtab = [] - - headers = [ - "Table", - "Lag [s]", - "Tasks", - "Max task execution [s]", - "Non-retrayable errors", - "Has user fault errors", - "Merges with 1000+ tries", - ] - for key, item in chart.items(): - if item.get("multi_replicas", False): - tabletab = [ - key, - item.get("delay", 0), - item.get("tasks", 0), - item.get("max_execution", 0), - item.get("errors", 0), - item.get("user_fault", False), - item.get("retried_merges", 0), - ] - verbtab.append(tabletab) - if verbose >= 2: - exceptions_retrayable = "" - exceptions_non_retrayable = "" - exceptions_ignored = "" - for exception in item.get("exceptions", []): - if exception: - if is_userfault_exception(exception): - exceptions_ignored += "\t" + exception[5:] + "\n" - elif exception.startswith(" "): - exceptions_retrayable += "\t" + exception[5:] + "\n" - else: - exceptions_non_retrayable += "\t" + exception[5:] + "\n" - max_execution_part = ( - item.get("max_execution_part", "") - if item.get("max_execution", 0) - else 0 - ) - if ( - exceptions_retrayable - or exceptions_non_retrayable - or exceptions_ignored - or max_execution_part - ): - msg_verbose_2 = msg_verbose_2 + key + ":\n" - if exceptions_non_retrayable: - msg_verbose_2 = ( - msg_verbose_2 - + " Non-retrayable errors:\n" - + exceptions_non_retrayable - ) - if exceptions_retrayable: - msg_verbose_2 = ( - msg_verbose_2 - + " Retrayable errors:\n" - + exceptions_retrayable - ) - if exceptions_ignored: - msg_verbose_2 = ( - msg_verbose_2 - + " User fault errors:\n" - + exceptions_ignored - ) - if max_execution_part: - msg_verbose_2 = ( - msg_verbose_2 - + " Result part of task with max execution time: " - + max_execution_part - + "\n" - ) - msg_verbose = tabulate(verbtab, headers=headers) - if verbose >= 2: - msg_verbose = msg_verbose + msg_verbose_2 - - max_merges_warn_threshold = 1 - max_merges_crit_threshold = 1 - if max_merges > 0: - max_replicated_merges_in_queue = get_max_replicated_merges_in_queue(ch_client) - max_merges_warn_threshold = int(max_replicated_merges_in_queue * mwarn / 100.0) - max_merges_crit_threshold = int(max_replicated_merges_in_queue * mcrit / 100.0) - - if lag < warn and max_merges < max_merges_warn_threshold: - return Result(code=0, message="OK", verbose=msg_verbose) - - msg = "Max {0} seconds, with errors {1} seconds, max task execution {2} seconds, max merges in queue {3}".format( - lag, lag_with_errors, max_execution, max_merges - ) - - try: - replica_versions_mismatch = ClickhouseInfo.get_versions_count(ctx) > 1 - if replica_versions_mismatch: - msg += ", ClickHouse versions on replicas mismatch" - return Result(code=1, message=msg, verbose=msg_verbose) - except Exception: - logging.warning("Unable to get version info from replicas", exc_info=True) - msg += ", one or more replicas is unavailable" - return Result(code=1, message=msg, verbose=msg_verbose) - - if ( - lag_with_errors < crit - and max_execution < xcrit - and max_merges < max_merges_crit_threshold - ): - return Result(code=1, message=msg, verbose=msg_verbose) - - return Result(code=2, message=msg, verbose=msg_verbose) - - -def get_replication_lag(ch_client): - """ - Get max absolute_delay from system.replicas. - """ - - tables = get_tables_with_replication_delay(ch_client) - chart: Dict[str, Dict[str, Any]] = {} - for t in tables: - key = "{database}.{table}".format(database=t["database"], table=t["table"]) - chart[key] = {} - chart[key]["delay"] = int(t["absolute_delay"]) - tables = filter_out_single_replica_tables(ch_client, tables) - for t in tables: - key = "{database}.{table}".format(database=t["database"], table=t["table"]) - chart[key]["multi_replicas"] = True - tables = count_errors(ch_client, tables, -1) - - max_merges = 0 - for t in tables: - key = "{database}.{table}".format(database=t["database"], table=t["table"]) - chart[key]["tasks"] = int(t["tasks"]) - chart[key]["errors"] = int(t["errors"]) - chart[key]["max_execution"] = int(t["max_execution"]) - chart[key]["max_execution_part"] = t["max_execution_part"] - chart[key]["exceptions"] = t["exceptions"] - chart[key]["retried_merges"] = int(t["retried_merges"]) - max_merges = max(int(t["retried_merges"]), max_merges) - for exception in t["exceptions"]: - if is_userfault_exception(exception): - chart[key]["userfault"] = True - break - - lag = 0 - lag_with_errors = 0 - max_execution = 0 - for key, item in chart.items(): - if item.get("multi_replicas", False): - delay = item.get("delay", 0) - if delay > lag: - lag = delay - if ( - delay > lag_with_errors - and item.get("errors", 0) > 0 - and not item.get("userfault", False) - ): - lag_with_errors = delay - execution = item.get("max_execution", 0) - if execution > max_execution: - max_execution = execution - - return lag, lag_with_errors, max_execution, max_merges, chart - - -def get_tables_with_replication_delay(ch_client): - """ - Get tables with absolute_delay > 0. - """ - query = "SELECT database, table, zookeeper_path, absolute_delay FROM system.replicas WHERE absolute_delay > 0" - return ch_client.execute(query, compact=False) - - -def filter_out_single_replica_tables(ch_client, tables): - if not tables: - return tables - - query = """ - SELECT - database, - table, - zookeeper_path - FROM system.replicas - WHERE (database, table) IN ({tables}) - AND total_replicas > 1 - """.format( - tables=",".join( - "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables - ) - ) - return ch_client.execute(query, False) - - -def count_errors(ch_client, tables, exceptions_limit): - """ - Add count of replication errors. - """ - if not tables: - return tables - - limit = "" if exceptions_limit < 0 else "({})".format(exceptions_limit) - - query = """ - SELECT - database, - table, - count() as tasks, - countIf(last_exception != '' AND postpone_reason = '') as errors, - max(IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution, - groupUniqArray{limit}(IF(last_exception != '', concat(IF(postpone_reason = '', ' ', ' '), last_exception), '')) as exceptions, - argMax(new_part_name, IF(is_currently_executing, dateDiff('second', last_attempt_time, now()), 0)) as max_execution_part, - countIf(type = 'MERGE_PARTS' and num_tries >= 1000) as retried_merges - FROM system.replication_queue - WHERE (database, table) IN ({tables}) - GROUP BY database,table - """.format( - tables=",".join( - "('{0}', '{1}')".format(t["database"], t["table"]) for t in tables - ), - limit=limit, - ) - return ch_client.execute(query, False) - - -def is_userfault_exception(exception): - """ - Check if exception was caused by user. - Current list: - * DB::Exception: Cannot reserve 1.00 MiB, not enough space - * DB::Exception: Incorrect data: Sign = -127 (must be 1 or -1) - """ - - if "DB::Exception: Cannot reserve" in exception and "not enough space" in exception: - return True - if ( - "DB::Exception: Incorrect data: Sign" in exception - and "(must be 1 or -1)" in exception - ): - return True - - return False - - -def get_max_replicated_merges_in_queue(ch_client): - """ - Get max_replicated_merges_in_queue value - """ - query = """ - SELECT value FROM system.merge_tree_settings WHERE name='max_replicated_merges_in_queue' - """ - res = ch_client.execute(query, True) - if not res: - return ( - 16 # 16 is default value for 'max_replicated_merges_in_queue' in ClickHouse - ) - return int(res[0][0]) diff --git a/ch_tools/monrun_checks/main.py b/ch_tools/monrun_checks/main.py index fb8b396e..423db95c 100644 --- a/ch_tools/monrun_checks/main.py +++ b/ch_tools/monrun_checks/main.py @@ -28,7 +28,6 @@ from ch_tools.monrun_checks.ch_log_errors import log_errors_command from ch_tools.monrun_checks.ch_ping import ping_command from ch_tools.monrun_checks.ch_replication_lag import replication_lag_command -from ch_tools.monrun_checks.ch_replication_sync import wait_replication_sync_command from ch_tools.monrun_checks.ch_resetup_state import resetup_state_command from ch_tools.monrun_checks.ch_ro_replica import ro_replica_command from ch_tools.monrun_checks.ch_s3_backup_orphaned import orphaned_backups_command @@ -136,7 +135,6 @@ def cli(ctx, ensure_monitoring_user): ping_command, log_errors_command, replication_lag_command, - wait_replication_sync_command, system_queues_command, core_dumps_command, dist_tables_command, From b6c943bc6a74340e15f826db7f9bd9686e13f48c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Wed, 24 Jan 2024 13:29:03 +0300 Subject: [PATCH 5/9] Formatting --- ch_tools/chadmin/chadmin_cli.py | 4 +++- .../cli/wait_replication_sync_command.py | 4 ++-- ch_tools/common/replication_lag.py | 21 ++++++++++--------- ch_tools/monrun_checks/ch_replication_lag.py | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/ch_tools/chadmin/chadmin_cli.py b/ch_tools/chadmin/chadmin_cli.py index 2cd275eb..def8fa66 100755 --- a/ch_tools/chadmin/chadmin_cli.py +++ b/ch_tools/chadmin/chadmin_cli.py @@ -42,7 +42,9 @@ from ch_tools.chadmin.cli.table_group import table_group from ch_tools.chadmin.cli.table_replica_group import table_replica_group from ch_tools.chadmin.cli.thread_log_group import thread_log_group -from ch_tools.chadmin.cli.wait_replication_sync_command import wait_replication_sync_command +from ch_tools.chadmin.cli.wait_replication_sync_command import ( + wait_replication_sync_command, +) from ch_tools.chadmin.cli.wait_started_command import wait_started_command from ch_tools.chadmin.cli.zookeeper_group import zookeeper_group from ch_tools.common.cli.context_settings import CONTEXT_SETTINGS diff --git a/ch_tools/chadmin/cli/wait_replication_sync_command.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py index 4cdee1ad..8e85f436 100644 --- a/ch_tools/chadmin/cli/wait_replication_sync_command.py +++ b/ch_tools/chadmin/cli/wait_replication_sync_command.py @@ -1,11 +1,11 @@ import logging -import time import sys +import time from click import command, option, pass_context -from ch_tools.common.replication_lag import estimate_replication_lag from ch_tools.common.cli.parameters import TimeSpanParamType +from ch_tools.common.replication_lag import estimate_replication_lag @command("wait-replication-sync") diff --git a/ch_tools/common/replication_lag.py b/ch_tools/common/replication_lag.py index c230c520..9bac5b85 100644 --- a/ch_tools/common/replication_lag.py +++ b/ch_tools/common/replication_lag.py @@ -2,19 +2,20 @@ from tabulate import tabulate -from ch_tools.common.result import Result from ch_tools.common.clickhouse.client.clickhouse_client import clickhouse_client +from ch_tools.common.result import Result - -XCRIT=3600 -CRIT=600 -WARN=300 -MWARN=50.0 -MCRIT=90.0 -VERBOSE=0 +XCRIT = 3600 +CRIT = 600 +WARN = 300 +MWARN = 50.0 +MCRIT = 90.0 +VERBOSE = 0 -def estimate_replication_lag(ctx, xcrit=XCRIT, crit=CRIT, warn=WARN, mwarn=MWARN, mcrit=MCRIT, verbose=VERBOSE): +def estimate_replication_lag( + ctx, xcrit=XCRIT, crit=CRIT, warn=WARN, mwarn=MWARN, mcrit=MCRIT, verbose=VERBOSE +): """ Check for replication lag across replicas. Should be: lag >= lag_with_errors, lag >= max_execution @@ -272,4 +273,4 @@ def get_max_replicated_merges_in_queue(ch_client): return ( 16 # 16 is default value for 'max_replicated_merges_in_queue' in ClickHouse ) - return int(res[0][0]) \ No newline at end of file + return int(res[0][0]) diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py index 53558363..2439cfc9 100644 --- a/ch_tools/monrun_checks/ch_replication_lag.py +++ b/ch_tools/monrun_checks/ch_replication_lag.py @@ -1,7 +1,7 @@ -import logging from typing import Any, Dict import click + from ch_tools.common.replication_lag import estimate_replication_lag From 91b5921f38563dcf5aa209daf2b1020817726b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Wed, 24 Jan 2024 13:56:44 +0300 Subject: [PATCH 6/9] Wait commands group --- ch_tools/chadmin/chadmin_cli.py | 8 +-- ...{wait_started_command.py => wait_group.py} | 49 ++++++++++++++++++- .../cli/wait_replication_sync_command.py | 45 ----------------- .../common/{ => commands}/replication_lag.py | 0 ch_tools/monrun_checks/ch_replication_lag.py | 2 +- tests/features/chadmin.feature | 2 +- tests/features/monrun.feature | 5 +- 7 files changed, 52 insertions(+), 59 deletions(-) rename ch_tools/chadmin/cli/{wait_started_command.py => wait_group.py} (66%) delete mode 100644 ch_tools/chadmin/cli/wait_replication_sync_command.py rename ch_tools/common/{ => commands}/replication_lag.py (100%) diff --git a/ch_tools/chadmin/chadmin_cli.py b/ch_tools/chadmin/chadmin_cli.py index def8fa66..f7967ddb 100755 --- a/ch_tools/chadmin/chadmin_cli.py +++ b/ch_tools/chadmin/chadmin_cli.py @@ -42,10 +42,7 @@ from ch_tools.chadmin.cli.table_group import table_group from ch_tools.chadmin.cli.table_replica_group import table_replica_group from ch_tools.chadmin.cli.thread_log_group import thread_log_group -from ch_tools.chadmin.cli.wait_replication_sync_command import ( - wait_replication_sync_command, -) -from ch_tools.chadmin.cli.wait_started_command import wait_started_command +from ch_tools.chadmin.cli.wait_group import wait_group from ch_tools.chadmin.cli.zookeeper_group import zookeeper_group from ch_tools.common.cli.context_settings import CONTEXT_SETTINGS from ch_tools.common.cli.locale_resolver import LocaleResolver @@ -119,8 +116,6 @@ def cli(ctx, format_, settings, timeout, port, debug): list_settings_command, restore_replica_command, stack_trace_command, - wait_replication_sync_command, - wait_started_command, ] groups: List[Any] = [ @@ -143,6 +138,7 @@ def cli(ctx, format_, settings, timeout, port, debug): table_group, table_replica_group, thread_log_group, + wait_group, zookeeper_group, ] diff --git a/ch_tools/chadmin/cli/wait_started_command.py b/ch_tools/chadmin/cli/wait_group.py similarity index 66% rename from ch_tools/chadmin/cli/wait_started_command.py rename to ch_tools/chadmin/cli/wait_group.py index 335e7ae4..c1088b54 100644 --- a/ch_tools/chadmin/cli/wait_started_command.py +++ b/ch_tools/chadmin/cli/wait_group.py @@ -3,9 +3,11 @@ import sys import time -from click import command, option, pass_context +from click import group, option, pass_context from ch_tools.chadmin.internal.utils import execute_query +from ch_tools.common.cli.parameters import TimeSpanParamType +from ch_tools.common.commands.replication_lag import estimate_replication_lag from ch_tools.common.utils import execute BASE_TIMEOUT = 600 @@ -13,7 +15,50 @@ S3_PART_LOAD_SPEED = 0.5 # in data parts per second -@command("wait-started") +@group("wait") +def wait_group(): + """Commands to wait until Clickhouse is in a certain state.""" + pass + + +@wait_group.command("replication-sync") +@option( + "-s", + "--status", + type=int, + default=0, + help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.", +) +@option( + "-p", + "--pause", + type=TimeSpanParamType(), + default="30s", + help="Pause between requests.", +) +@option( + "-t", + "--timeout", + type=TimeSpanParamType(), + default="3d", + help="Max amount of time to wait.", +) +@pass_context +def wait_replication_sync_command(ctx, status, pause, timeout): + """Wait for ClickHouse server to sync replication with other replicas.""" + + deadline = time.time() + timeout.total_seconds() + while time.time() < deadline: + res = estimate_replication_lag(ctx) + if res.code <= status: + sys.exit(0) + time.sleep(pause.total_seconds()) + + logging.error(f"ClickHouse can't sync replica.") + sys.exit(1) + + +@wait_group.command("started") @option( "--timeout", type=int, diff --git a/ch_tools/chadmin/cli/wait_replication_sync_command.py b/ch_tools/chadmin/cli/wait_replication_sync_command.py deleted file mode 100644 index 8e85f436..00000000 --- a/ch_tools/chadmin/cli/wait_replication_sync_command.py +++ /dev/null @@ -1,45 +0,0 @@ -import logging -import sys -import time - -from click import command, option, pass_context - -from ch_tools.common.cli.parameters import TimeSpanParamType -from ch_tools.common.replication_lag import estimate_replication_lag - - -@command("wait-replication-sync") -@option( - "-s", - "--status", - type=int, - default=0, - help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.", -) -@option( - "-p", - "--pause", - type=TimeSpanParamType(), - default="30s", - help="Pause between requests.", -) -@option( - "-t", - "--timeout", - type=TimeSpanParamType(), - default="3d", - help="Max amount of time to wait.", -) -@pass_context -def wait_replication_sync_command(ctx, status, pause, timeout): - """Wait for ClickHouse server to sync replication with other replicas.""" - - deadline = time.time() + timeout.total_seconds() - while time.time() < deadline: - res = estimate_replication_lag(ctx) - if res.code <= status: - sys.exit(0) - time.sleep(pause.total_seconds()) - - logging.error(f"ClickHouse can't sync replica.") - sys.exit(1) diff --git a/ch_tools/common/replication_lag.py b/ch_tools/common/commands/replication_lag.py similarity index 100% rename from ch_tools/common/replication_lag.py rename to ch_tools/common/commands/replication_lag.py diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py index 2439cfc9..396b3e93 100644 --- a/ch_tools/monrun_checks/ch_replication_lag.py +++ b/ch_tools/monrun_checks/ch_replication_lag.py @@ -2,7 +2,7 @@ import click -from ch_tools.common.replication_lag import estimate_replication_lag +from ch_tools.common.commands.replication_lag import estimate_replication_lag @click.command("replication-lag") diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature index 175eaa5a..ec88e430 100644 --- a/tests/features/chadmin.feature +++ b/tests/features/chadmin.feature @@ -11,5 +11,5 @@ Feature: chadmin commands. Scenario: Check wait replication sync When we execute command on clickhouse01 """ - chadmin wait-replication-sync -t 10 -p 1 + chadmin wait replication-sync -t 10 -p 1 """ diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature index 7d50b97d..da2e359a 100644 --- a/tests/features/monrun.feature +++ b/tests/features/monrun.feature @@ -147,10 +147,7 @@ Feature: ch-monitoring tool """ INSERT INTO test.table_01 SELECT number FROM numbers(100) """ - And we execute command on clickhouse02 - """ - sleep 5 - """ + And we sleep for 5 seconds And we execute command on clickhouse01 """ ch-monitoring replication-lag -w 4 From 1274de848c6f15a7cb0ed308c183e1716b365285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Wed, 24 Jan 2024 14:04:41 +0300 Subject: [PATCH 7/9] Lint --- ch_tools/chadmin/cli/wait_group.py | 2 +- ch_tools/monrun_checks/ch_replication_lag.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ch_tools/chadmin/cli/wait_group.py b/ch_tools/chadmin/cli/wait_group.py index c1088b54..25f17fbb 100644 --- a/ch_tools/chadmin/cli/wait_group.py +++ b/ch_tools/chadmin/cli/wait_group.py @@ -54,7 +54,7 @@ def wait_replication_sync_command(ctx, status, pause, timeout): sys.exit(0) time.sleep(pause.total_seconds()) - logging.error(f"ClickHouse can't sync replica.") + logging.error("ClickHouse can't sync replicas.") sys.exit(1) diff --git a/ch_tools/monrun_checks/ch_replication_lag.py b/ch_tools/monrun_checks/ch_replication_lag.py index 396b3e93..1cd401e4 100644 --- a/ch_tools/monrun_checks/ch_replication_lag.py +++ b/ch_tools/monrun_checks/ch_replication_lag.py @@ -1,5 +1,3 @@ -from typing import Any, Dict - import click from ch_tools.common.commands.replication_lag import estimate_replication_lag From fc5f9c0e605aff5a75264ea0f1bbdc2f687e4098 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Wed, 24 Jan 2024 23:00:18 +0300 Subject: [PATCH 8/9] Pass replication-lag args --- ch_tools/chadmin/cli/wait_group.py | 54 +++++++++++++++++++++++++++--- tests/features/chadmin.feature | 31 ++++++++++++++++- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/ch_tools/chadmin/cli/wait_group.py b/ch_tools/chadmin/cli/wait_group.py index 25f17fbb..f979fe81 100644 --- a/ch_tools/chadmin/cli/wait_group.py +++ b/ch_tools/chadmin/cli/wait_group.py @@ -3,7 +3,7 @@ import sys import time -from click import group, option, pass_context +from click import FloatRange, group, option, pass_context from ch_tools.chadmin.internal.utils import execute_query from ch_tools.common.cli.parameters import TimeSpanParamType @@ -27,7 +27,7 @@ def wait_group(): "--status", type=int, default=0, - help="Wait until returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.", + help="Wait until replication-lag returned status is no worse than given, 0 = OK, 1 = WARN, 2 = CRIT.", ) @option( "-p", @@ -43,13 +43,57 @@ def wait_group(): default="3d", help="Max amount of time to wait.", ) +@option( + "-x", + "--exec-critical", + "xcrit", + type=int, + default=3600, + help="Critical threshold for one task execution.", +) +@option( + "-c", + "--critical", + "crit", + type=int, + default=600, + help="Critical threshold for lag with errors.", +) +@option("-w", "--warning", "warn", type=int, default=300, help="Warning threshold.") +@option( + "-M", + "--merges-critical", + "mcrit", + type=FloatRange(0.0, 100.0), + default=90.0, + help="Critical threshold in percent of max_replicated_merges_in_queue.", +) +@option( + "-m", + "--merges-warning", + "mwarn", + type=FloatRange(0.0, 100.0), + default=50.0, + help="Warning threshold in percent of max_replicated_merges_in_queue.", +) +@option( + "-v", + "--verbose", + "verbose", + type=int, + count=True, + default=0, + help="Show details about lag.", +) @pass_context -def wait_replication_sync_command(ctx, status, pause, timeout): - """Wait for ClickHouse server to sync replication with other replicas.""" +def wait_replication_sync_command( + ctx, status, pause, timeout, xcrit, crit, warn, mwarn, mcrit, verbose +): + """Wait for ClickHouse server to sync replication with other replicas using replication-lag command.""" deadline = time.time() + timeout.total_seconds() while time.time() < deadline: - res = estimate_replication_lag(ctx) + res = estimate_replication_lag(ctx, xcrit, crit, warn, mwarn, mcrit, verbose) if res.code <= status: sys.exit(0) time.sleep(pause.total_seconds()) diff --git a/tests/features/chadmin.feature b/tests/features/chadmin.feature index ec88e430..abcc739f 100644 --- a/tests/features/chadmin.feature +++ b/tests/features/chadmin.feature @@ -6,10 +6,39 @@ Feature: chadmin commands. And a working zookeeper And a working clickhouse on clickhouse01 And a working clickhouse on clickhouse02 + Given we have executed queries on clickhouse01 + """ + CREATE DATABASE IF NOT EXISTS test ON CLUSTER 'cluster'; + + CREATE TABLE IF NOT EXISTS test.table_01 ON CLUSTER 'cluster' (n Int32) + ENGINE = ReplicatedMergeTree('/tables/table_01', '{replica}') PARTITION BY n ORDER BY n; + """ Scenario: Check wait replication sync When we execute command on clickhouse01 """ - chadmin wait replication-sync -t 10 -p 1 + chadmin wait replication-sync -t 10 -p 1 -w 4 + """ + When we execute query on clickhouse01 + """ + SYSTEM STOP FETCHES + """ + And we execute query on clickhouse02 + """ + INSERT INTO test.table_01 SELECT number FROM numbers(100) + """ + And we sleep for 5 seconds + When we try to execute command on clickhouse01 + """ + chadmin wait replication-sync -t 10 -p 1 -w 4 + """ + Then it fails + When we execute query on clickhouse01 + """ + SYSTEM START FETCHES + """ + When we execute command on clickhouse01 + """ + chadmin wait replication-sync -t 10 -p 1 -w 4 """ From 1970cb0f153b31aeed6e0aeb7a52fcb22718e285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Wed, 24 Jan 2024 23:00:33 +0300 Subject: [PATCH 9/9] Pin requests version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0b493b90..21145ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ pygments = "*" pyopenssl = "*" python-dateutil = "*" pyyaml = "<5.4" -requests = "*" +requests = "<2.30.0" tabulate = "*" tenacity = "*" termcolor = "*"