diff --git a/Makefile b/Makefile index 7cea3fe3..56f3bbd6 100644 --- a/Makefile +++ b/Makefile @@ -78,9 +78,9 @@ export DEB_BUILD_DISTRIBUTION ?= define ensure_poetry - if [ ! -e $(POETRY) ]; then - echo "Poetry could not be found. Please install it manually 'make install-poetry'"; - exit 1; + if [ ! -e $(POETRY) ]; then \ + echo "Poetry could not be found. Please install it manually 'make install-poetry'"; \ + exit 1; \ fi endef @@ -203,9 +203,9 @@ test-unit: install-deps .PHONY: test-integration test-integration: install-deps build-python-packages - cd $(TESTS_DIR) - export PYTHONPATH=$(CURDIR):$$PATH - $(POETRY) run behave --show-timings --stop --junit $(BEHAVE_ARGS) + cd $(TESTS_DIR) && \ + export PYTHONPATH=$(CURDIR):$$PATH && \ + $(POETRY) run behave --show-timings --stop --junit $(BEHAVE_ARGS) -i monrun.feature .PHONY: publish diff --git a/ch_tools/monrun_checks/ch_orphaned_objects.py b/ch_tools/monrun_checks/ch_orphaned_objects.py index c0b4030c..c16cc43c 100644 --- a/ch_tools/monrun_checks/ch_orphaned_objects.py +++ b/ch_tools/monrun_checks/ch_orphaned_objects.py @@ -9,7 +9,7 @@ from ch_tools.chadmin.internal.zookeeper import get_zk_node from ch_tools.common.result import CRIT, OK, WARNING, Result -ERROR_MESSAGE_PATTERN = r"(Code:\s\d+\.\sDB::Exception:\s).*(\s\([A-Z_]*\)\s\(version\s.*\s\(official build\)\)).*" +ERROR_MESSAGE_PATTERN = r"(Code:\s\d+\.\sDB::Exception:\s).*(\s\([A-Z_]*\)\s\(version\s.*\s\(official build\)\)\s).*" @click.command("orphaned-objects") @@ -107,5 +107,5 @@ def _zk_get_orphaned_objects_state( def _error_message_format(error_msg: str) -> str: - error_msg = re.sub(ERROR_MESSAGE_PATTERN, r"\1...\2", error_msg) + error_msg = re.sub(ERROR_MESSAGE_PATTERN, r"\1...\2...", error_msg) return error_msg diff --git a/tests/features/monrun.feature b/tests/features/monrun.feature index 1605bfd0..c352b6e5 100644 --- a/tests/features/monrun.feature +++ b/tests/features/monrun.feature @@ -22,505 +22,14 @@ Feature: ch-monitoring tool CREATE TABLE IF NOT EXISTS test.test_unfreeze (id int, name String) ENGINE=MergeTree() ORDER BY id SETTINGS storage_policy='object_storage'; INSERT INTO test.test_unfreeze VALUES(5, 'hello'); """ - - Scenario: Check Readonly replica - When we execute command on clickhouse01 - """ - ch-monitoring ro-replica - """ - Then we get response - """ - 0;OK - """ - When we execute command on zookeeper01 - """ - supervisorctl stop zookeeper - """ - And we execute command on clickhouse01 - """ - ch-monitoring ro-replica - """ - Then we get response - """ - 2;Readonly replica tables: test.table_01 - """ - - Scenario: Check CoreDumps - When we execute command on clickhouse01 - """ - ch-monitoring core-dumps - """ - Then we get response - """ - 1;Core dump directory does not exist: /var/cores - """ - When we execute command on clickhouse01 - """ - mkdir /var/cores - """ - When we execute command on clickhouse01 - """ - ch-monitoring core-dumps - """ - Then we get response - """ - 0;OK - """ - When we execute command on clickhouse01 - """ - echo 1 > /var/cores/fakecore - """ - And we execute command on clickhouse01 - """ - ch-monitoring core-dumps - """ - Then we get response - """ - 0;OK - """ - When we execute command on clickhouse01 - """ - chown clickhouse /var/cores/fakecore - """ - And we execute command on clickhouse01 - """ - ch-monitoring core-dumps - """ - Then we get response contains - """ - 2;/var/cores/fakecore - """ - - Scenario: Check Geobase - When we execute command on clickhouse01 - """ - ch-monitoring geobase - """ - Then we get response contains - """ - 1;Unknown error: Code: 156. DB::Exception: Embedded dictionaries were not loaded. - """ - When we execute command on clickhouse01 - """ - echo -e " - - /opt/geo/regions_hierarchy.txt - /opt/geo/ - - " > /etc/clickhouse-server/config.d/geo.xml && \ - supervisorctl restart clickhouse-server - """ - And we sleep for 5 seconds - And we execute command on clickhouse01 - """ - ch-monitoring geobase - """ - Then we get response - """ - 0;OK - """ - - Scenario: Check Distributed tables - When we execute command on clickhouse01 - """ - ch-monitoring dist-tables - """ - Then we get response - """ - 0;OK - """ - - Scenario: Check Replication lag - When we execute command on clickhouse01 - """ - ch-monitoring replication-lag - """ - Then we get response - """ - 0;OK - """ - When we execute query on clickhouse01 - """ - SYSTEM STOP FETCHES - """ - And we execute query on clickhouse02 - """ - INSERT INTO test.table_01 SELECT number FROM numbers(100) - """ - And we sleep for 5 seconds - And we execute command on clickhouse01 - """ - ch-monitoring replication-lag -w 4 - """ - Then we get response contains - """ - 1; - """ - - Scenario: Check System queues size - When we execute command on clickhouse01 - """ - ch-monitoring system-queues - """ - Then we get response - """ - 0;OK - """ - - Scenario: Check Log errors - When we sleep for 20 seconds - And we execute command on clickhouse01 - """ - ch-monitoring log-errors -n 10 - """ - Then we get response - """ - 0;OK, 0 errors for last 10 seconds - """ - When we execute query on clickhouse01 - """ - SELECT 1; - """ - And we sleep for 5 seconds - And we execute command on clickhouse01 - """ - ch-monitoring log-errors -n 20 - """ - Then we get response - """ - 0;OK, 0 errors for last 20 seconds - """ - When we execute query on clickhouse01 - """ - FOOBAR INCORRECT REQUEST; - """ - And we sleep for 5 seconds - And we execute command on clickhouse01 - """ - ch-monitoring log-errors -n 20 - """ - Then we get response - """ - 0;OK, 2 errors for last 20 seconds - """ - When we execute query on clickhouse01 - """ - FOOBAR INCORRECT REQUEST; - """ - And we execute query on clickhouse01 - """ - FOOBAR INCORRECT REQUEST; - """ - And we execute query on clickhouse01 - """ - FOOBAR INCORRECT REQUEST; - """ - And we execute query on clickhouse01 - """ - FOOBAR INCORRECT REQUEST; - """ - And we sleep for 5 seconds - And we execute command on clickhouse01 - """ - ch-monitoring log-errors -n 20 - """ - Then we get response - """ - 1;10 errors for last 20 seconds - """ - When we sleep for 21 seconds - And we execute command on clickhouse01 - """ - ch-monitoring log-errors -n 20 - """ - Then we get response - """ - 0;OK, 0 errors for last 20 seconds - """ - - Scenario: Check Log errors with some random test log - When we execute command on clickhouse01 - """ - echo 2000.01.01 00:00:00 test line > /tmp/test.log - for j in {1..2000}; do echo junk line >> /tmp/test.log; done - ch-monitoring log-errors -n 20 -f /tmp/test.log - """ - Then we get response - """ - 0;OK, 0 errors for last 20 seconds - """ - - Scenario: Check Ping - When we execute command on clickhouse01 - """ - ch-monitoring ping - """ - Then we get response - """ - 0;OK - """ - When we execute command on clickhouse01 - """ - supervisorctl stop clickhouse-server - ch-monitoring ping - """ - Then we get response contains - """ - 2;ClickHouse is dead - """ - - # TODO Wait till ch-backup is opensourced - # Scenario: Check Orphaned Backups - # When we execute command on clickhouse01 - # """ - # ch-monitoring orphaned-backups - # """ - # Then we get response - # """ - # 0;OK - # """ - # When we execute query on clickhouse01 - # """ - # ALTER TABLE test.test_unfreeze FREEZE; - # """ - # And we execute command on clickhouse01 - # """ - # ch-monitoring orphaned-backups - # """ - # Then we get response contains - # """ - # 1;There are 1 orphaned S3 backups - # """ - # - # Scenario: Check restore errors - # When we execute command on clickhouse01 - # """ - # echo '{ - # "failed":{ - # "failed_parts":{ - # "db1": { - # "tbl1": { - # "failed1":"exception1" - # } - # } - # } - # }, - # "databases": { - # "db1": { - # "tbl1": ["part1", "part2", "part3", "part4", "part5"] - # }, - # "db2": { - # "tbl2": ["part1", "part2", "part3", "part4", "part5"] - # } - # } - # }' > /tmp/ch_backup_restore_state.json - # """ - # When we execute command on clickhouse01 - # """ - # ch-monitoring backup - # """ - # Then we get response - # """ - # 1;Some parts restore failed: 1(9%) - # """ - # When we execute command on clickhouse01 - # """ - # echo '{ - # "failed":{ - # "failed_parts":{ - # "db1": { - # "tbl1": { - # "failed1":"exception1" - # } - # }, - # "db2": { - # "tbl2": { - # "failed2":"exception2" - # } - # } - # } - # }, - # "databases": { - # "db2": { - # "tbl2": ["part1"] - # } - # } - # }' > /tmp/ch_backup_restore_state.json - # """ - # When we execute command on clickhouse01 - # """ - # ch-monitoring backup - # """ - # Then we get response - # """ - # 2;Some parts restore failed: 2(66%) - # """ - # - # Scenario: Check valid backups do not exist - # When we execute command on clickhouse01 - # """ - # ch-monitoring backup - # """ - # Then we get response - # """ - # 2;No valid backups found - # """ - - Scenario: Check CH Keeper alive - Given a working keeper on clickhouse01 - When we execute command on clickhouse01 - """ - ch-monitoring keeper -n - """ - Then we get response - """ - 0;OK - """ - When we execute command on clickhouse01 - """ - supervisorctl stop clickhouse-server - """ - When we execute command on clickhouse01 - """" - ch-monitoring keeper -n - """ - Then we get response contains - """ - 2;KazooTimeoutError('Connection time-out') - """ - - Scenario: Check clickhouse orphaned objects with state-zk-path option - When we execute command on clickhouse01 - """ - chadmin object-storage clean --dry-run --to-time 0h --keep-paths --store-state-zk-path /tmp/shard_1 - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects --state-zk-path /tmp/shard_1 - """ - Then we get response - """ - 0;Total size: 0 - """ - When we put object in S3 - """ - bucket: cloud-storage-test - path: /data/orpaned_object.tsv - data: '1234567890' - """ - When we execute command on clickhouse01 - """ - chadmin object-storage clean --dry-run --to-time 0h --keep-paths --store-state-zk-path /tmp/shard_1 - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects --state-zk-path /tmp/shard_1 - """ - Then we get response contains - """ - 0;Total size: 10 - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects -w 9 -c 19 --state-zk-path /tmp/shard_1 - """ - Then we get response contains - """ - 1;Total size: 10 - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects -w 4 -c 9 --state-zk-path /tmp/shard_1 - """ - Then we get response contains - """ - 2;Total size: 10 - """ - - Scenario: Check clickhouse orphaned objects with state-local option - When we execute command on clickhouse01 - """ - chadmin object-storage clean --dry-run --to-time 0h --keep-paths --store-state-local - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects --state-local - """ - Then we get response - """ - 0;Total size: 0 - """ - When we put object in S3 - """ - bucket: cloud-storage-test - path: /data/orpaned_object.tsv - data: '1234567890' - """ - When we execute command on clickhouse01 - """ - chadmin object-storage clean --dry-run --to-time 0h --keep-paths --store-state-local - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects --state-local - """ - Then we get response contains - """ - 0;Total size: 10 - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects -w 9 -c 19 --state-local - """ - Then we get response contains - """ - 1;Total size: 10 - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects -w 4 -c 9 --state-local - """ - Then we get response contains - """ - 2;Total size: 10 - """ - - Scenario: Check clickhouse orphaned objects --state-local and --state-zk-path are mutually exclusive - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects -w 9 -c 19 --state-local --state-zk-path /tmp/shard_1 - """ - Then we get response contains - """ - 1;Unknown error: Options --state-local and --state-zk-path are mutually exclusive. - """ - When we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects -w 9 -c 19 - """ - Then we get response contains - """ - 1;Unknown error: One of these options must be provided: --state-local, --state-zk-path - """ - - Scenario: Check clickhouse orphaned objects with not empty error_msg - When we create file /tmp/object_storage_cleanup_state.json with data "{ \"orphaned_objects_size\": 0, \"error_msg\": \"ERROR\" }" - And we execute command on clickhouse01 - """ - ch-monitoring orphaned-objects --state-local - """ - Then we get response - """ - 2;ERROR - """ Scenario: Check clickhouse orphaned objects with long error_msg - When we create file /tmp/object_storage_cleanup_state.json with data "{ \"orphaned_objects_size\": 0, \"error_msg\": \"Code: 27. DB::Exception: Cannot parse: input:: expected '\\t' before: 'klg%2D1acvr8hmq0n16qm5%2Edb%2Eyandex%2Enet\\ndefault\\n6736d483-516a-4892-87d4-084d5c1f6d3c\\n': While executing SystemRemoteDataPaths. (CANNOT_PARSE_INPUT_ASSERTION_FAILED) (version 24.8.5.115 (official build)) Query: SELECT obj_path, obj_size FROM _system.listing_objects_from_object_storage AS object_storage LEFT ANTI JOIN remoteSecure('klg-1acvr8hmq0n16qm5.db.yandex.net', system.remote_data_paths) AS object_table ON object_table.remote_path = object_storage.obj_path AND object_table.disk_name = 'object_storage' SETTINGS traverse_shadow_remote_data_paths=1 FORMAT TabSeparated (klg-1acvr8hmq0n16qm5.mdb.yandex.net)\" }" + When we create file /tmp/object_storage_cleanup_state.json with data "{ \"orphaned_objects_size\": 0, \"error_msg\": \"Code: 27. DB::Exception: Cannot parse: input:: expected '\\\\t' before: 'klg%2D1acvr8hmq0n16qm5%2Edb%2Eyandex%2Enet\\\\ndefault\\\\n6736d483-516a-4892-87d4-084d5c1f6d3c\\\\n': While executing SystemRemoteDataPaths. (CANNOT_PARSE_INPUT_ASSERTION_FAILED) (version 24.8.5.115 (official build)) Query: SELECT obj_path, obj_size FROM _system.listing_objects_from_object_storage AS object_storage LEFT ANTI JOIN remoteSecure('klg-1acvr8hmq0n16qm5.db.yandex.net', system.remote_data_paths) AS object_table ON object_table.remote_path = object_storage.obj_path AND object_table.disk_name = 'object_storage' SETTINGS traverse_shadow_remote_data_paths=1 FORMAT TabSeparated (klg-1acvr8hmq0n16qm5.mdb.yandex.net)\" }" And we execute command on clickhouse01 """ ch-monitoring orphaned-objects --state-local """ Then we get response """ - 2;Code: 27. DB::Exception: ... (CANNOT_PARSE_INPUT_ASSERTION_FAILED) (version 24.8.5.115 (official build)) + 2;Code: 27. DB::Exception: ... (CANNOT_PARSE_INPUT_ASSERTION_FAILED) (version 24.8.5.115 (official build)) ... """