From b7df296937e54baf0d675d608d34af3fa330d1bc Mon Sep 17 00:00:00 2001 From: David Boucher Date: Fri, 22 Nov 2024 10:56:14 +0100 Subject: [PATCH] Mon 153321 mariadb connection with errno0 24.10 (#1884) * fix(broker/sql): two issues in the mysql object * A possible segfault fixed. * An issue on errors raised by mariadb that can have errno=0 now. * enh(tests): new tests on database connection * enh(tests): new test that can lead to a segfault with the mysql object * fix(cmake): missing dependency on pb_neb_lib REFS: MON-153321 --- broker/CMakeLists.txt | 2 +- broker/core/sql/src/mysql_connection.cc | 27 ++- broker/core/sql/src/mysql_multi_insert.cc | 6 +- .../services-and-bulk-stmt.robot | 156 +++++++++++++++- tests/broker-engine/services-increased.robot | 4 +- tests/resources/Broker.py | 174 +++++++++++++++++- tests/resources/resources.resource | 18 +- 7 files changed, 355 insertions(+), 32 deletions(-) diff --git a/broker/CMakeLists.txt b/broker/CMakeLists.txt index ad2373471fe..a17b6afbf17 100644 --- a/broker/CMakeLists.txt +++ b/broker/CMakeLists.txt @@ -468,7 +468,7 @@ target_link_libraries( # Standalone binary. add_executable(cbd ${SRC_DIR}/main.cc) -add_dependencies(cbd multiplexing centreon_common) +add_dependencies(cbd multiplexing centreon_common pb_neb_lib) # Flags needed to include all symbols in binary. 
target_link_libraries( diff --git a/broker/core/sql/src/mysql_connection.cc b/broker/core/sql/src/mysql_connection.cc index 5c6d2548bba..13f5fd1be7c 100644 --- a/broker/core/sql/src/mysql_connection.cc +++ b/broker/core/sql/src/mysql_connection.cc @@ -16,6 +16,7 @@ * For more information : contact@centreon.com */ #include +#include #include "com/centreon/broker/config/applier/init.hh" #include "com/centreon/broker/misc/misc.hh" @@ -460,18 +461,26 @@ void mysql_connection::_statement(mysql_task* t) { "mysql_connection {:p}: execute statement {:x} attempt {}: {}", static_cast(this), task->statement_id, attempts, query); if (mysql_stmt_execute(stmt)) { - std::string err_msg( - fmt::format("{} errno={} {}", mysql_error::msg[task->error_code], - ::mysql_errno(_conn), ::mysql_stmt_error(stmt))); - SPDLOG_LOGGER_ERROR(_logger, - "connection fail to execute statement {:p}: {}", - static_cast(this), err_msg); - if (_server_error(::mysql_stmt_errno(stmt))) { + int32_t err_code = ::mysql_stmt_errno(stmt); + std::string err_msg(fmt::format("{} errno={} {}", + mysql_error::msg[task->error_code], + err_code, ::mysql_stmt_error(stmt))); + if (err_code == 0) { + SPDLOG_LOGGER_ERROR(_logger, + "mysql_connection: errno=0, so we simulate a " + "server error CR_SERVER_LOST"); + err_code = CR_SERVER_LOST; + } else { + SPDLOG_LOGGER_ERROR(_logger, + "connection fail to execute statement {:p}: {}", + static_cast(this), err_msg); + } + if (_server_error(err_code)) { set_error_message(err_msg); break; } - if (mysql_stmt_errno(stmt) != 1213 && - mysql_stmt_errno(stmt) != 1205) // Dead Lock error + if (err_code != ER_LOCK_DEADLOCK && + err_code != ER_LOCK_WAIT_TIMEOUT) // Dead Lock error attempts = MAX_ATTEMPTS; if (mysql_commit(_conn)) { diff --git a/broker/core/sql/src/mysql_multi_insert.cc b/broker/core/sql/src/mysql_multi_insert.cc index cafc020e386..7d375cb82cd 100644 --- a/broker/core/sql/src/mysql_multi_insert.cc +++ b/broker/core/sql/src/mysql_multi_insert.cc @@ -132,7 +132,11 @@ 
void bulk_or_multi::execute(mysql& connexion, my_error::code ec, int thread_id) { if (_bulk_stmt) { - if (!_bulk_bind->empty()) { + /* If the database connection is lost, we can have this issue */ + if (!_bulk_bind) { + _bulk_bind = _bulk_stmt->create_bind(); + _bulk_bind->reserve(_bulk_row); + } else if (!_bulk_bind->empty()) { _bulk_stmt->set_bind(std::move(_bulk_bind)); connexion.run_statement(*_bulk_stmt, ec, thread_id); _bulk_bind = _bulk_stmt->create_bind(); diff --git a/tests/broker-engine/services-and-bulk-stmt.robot b/tests/broker-engine/services-and-bulk-stmt.robot index c6d070e8b05..4bb3255a524 100644 --- a/tests/broker-engine/services-and-bulk-stmt.robot +++ b/tests/broker-engine/services-and-bulk-stmt.robot @@ -29,7 +29,7 @@ EBBPS1 ${start} Get Current Date ${start_broker} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine Ctn Wait For Engine To Be Ready ${start} FOR ${i} IN RANGE ${1000} @@ -52,6 +52,7 @@ EBBPS1 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database FOR ${i} IN RANGE ${1000} Ctn Process Service Check Result host_1 service_${i+1} 2 warning${i} @@ -89,6 +90,7 @@ EBBPS1 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database EBBPS2 [Documentation] 1000 service check results are sent to the poller. The test is done with the unified_sql stream, no service status is lost, we find the 1000 results in the database: table services. 
@@ -109,7 +111,7 @@ EBBPS2 ${start} Get Current Date ${start_broker} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine ${content} Create List INITIAL SERVICE STATE: host_1;service_1000; ${result} Ctn Find In Log With Timeout ${engineLog0} ${start} ${content} 30 Should Be True @@ -135,6 +137,7 @@ EBBPS2 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database FOR ${i} IN RANGE ${1000} Ctn Process Service Check Result host_1 service_${i+1} 2 critical${i} @@ -171,6 +174,7 @@ EBBPS2 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database EBMSSM [Documentation] 1000 services are configured with 100 metrics each. The rrd output is removed from the broker configuration. GetSqlManagerStats is called to measure writes into data_bin. @@ -191,7 +195,7 @@ EBMSSM Ctn Clear Retention ${start} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine Ctn Broker Set Sql Manager Stats 51001 5 5 # Let's wait for the external command check start @@ -217,6 +221,7 @@ EBMSSM Sleep 1s END Should Be True ${output[0][0]} >= 100000 + Disconnect From Database EBPS2 [Documentation] 1000 services are configured with 20 metrics each. The rrd output is removed from the broker configuration to avoid to write too many rrd files. While metrics are written in bulk, the database is stopped. This must not crash broker. 
@@ -240,7 +245,7 @@ EBPS2 ${start} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine # Let's wait for the external command check start ${content} Create List check_for_external_commands() ${result} Ctn Find In Log With Timeout ${engineLog0} ${start} ${content} 60 @@ -294,7 +299,7 @@ RLCode ${start} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine ${content} Create List check_for_external_commands() ${result} Ctn Find In Log With Timeout ${engineLog0} ${start} ${content} 60 @@ -364,7 +369,7 @@ metric_mapping ${start} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine ${content} Create List check_for_external_commands() ${result} Ctn Find In Log With Timeout ${engineLog0} ${start} ${content} 60 @@ -404,7 +409,7 @@ Services_and_bulks_${id} ${start} Get Current Date Ctn Start Broker - Ctn Start engine + Ctn Start Engine Ctn Broker Set Sql Manager Stats 51001 5 5 # Let's wait for the external command check start @@ -435,6 +440,143 @@ Services_and_bulks_${id} ... 1 1020 ... 2 150 +EBMSSMDBD + [Documentation] 1000 services are configured with 100 metrics each. + ... The rrd output is removed from the broker configuration. + ... While metrics are written in the database, we stop the database and then restart it. + ... Broker must recover its connection to the database and continue to write metrics. + [Tags] broker engine unified_sql MON-152743 + Ctn Clear Metrics + Ctn Config Engine ${1} ${1} ${1000} + # We want all the services to be passive to avoid parasite checks during our test. 
+ Ctn Set Services Passive ${0} service_.* + Ctn Config Broker central + Ctn Config Broker rrd + Ctn Config Broker module ${1} + Ctn Config BBDO3 1 + Ctn Broker Config Log central core error + Ctn Broker Config Log central tcp error + Ctn Broker Config Log central sql debug + Ctn Config Broker Sql Output central unified_sql + Ctn Config Broker Remove Rrd Output central + Ctn Clear Retention + ${start} Get Current Date + Ctn Start Broker + Ctn Start Engine + + Ctn Wait For Engine To Be Ready ${start} 1 + + ${start} Ctn Get Round Current Date + # Let's wait for one "INSERT INTO data_bin" to appear in stats. + Log To Console Many service checks with 100 metrics each are processed. + FOR ${i} IN RANGE ${1000} + Ctn Process Service Check Result With Metrics host_1 service_${i+1} 1 warning${i} 100 + END + + Log To Console We wait for at least one metric to be written in the database. + # Let's wait for all force checks to be in the storage database. + Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort} + FOR ${i} IN RANGE ${500} + ${output} Query + ... SELECT COUNT(s.last_check) FROM metrics m LEFT JOIN index_data i ON m.index_id = i.id LEFT JOIN services s ON s.host_id = i.host_id AND s.service_id = i.service_id WHERE metric_name LIKE "metric_%%" AND s.last_check >= ${start} + IF ${output[0][0]} >= 1 BREAK + Sleep 1s + END + Disconnect From Database + + Log To Console Let's start some database manipulation... + ${start} Get Current Date + + FOR ${i} IN RANGE ${3} + Ctn Stop Mysql + Sleep 10s + Ctn Start Mysql + ${content} Create List could not insert data in data_bin + ${result} Ctn Find In Log With Timeout ${centralLog} ${start} ${content} 10 + Log To Console ${result} + END + +EBMSSMPART + [Documentation] 1000 services are configured with 100 metrics each. + ... The rrd output is removed from the broker configuration. + ... The data_bin table is configured with two partitions p1 and p2 such + ... 
that p1 contains old data and p2 contains current data. + ... While metrics are written in the database, we remove the p2 partition. + ... Once the p2 partition is recreated, broker must recover its connection + ... to the database and continue to write metrics. + ... To check that last point, we force a last service check and we check + ... that its metrics are written in the database. + [Tags] broker engine unified_sql MON-152743 + Ctn Clear Metrics + Ctn Config Engine ${1} ${1} ${1000} + # We want all the services to be passive to avoid parasite checks during our test. + Ctn Set Services Passive ${0} service_.* + Ctn Config Broker central + Ctn Config Broker rrd + Ctn Config Broker module ${1} + Ctn Config BBDO3 1 + Ctn Broker Config Log central core error + Ctn Broker Config Log central tcp error + Ctn Broker Config Log central sql trace + Ctn Config Broker Sql Output central unified_sql + Ctn Config Broker Remove Rrd Output central + Ctn Clear Retention + + Ctn Prepare Partitions For Data Bin + ${start} Get Current Date + Ctn Start Broker + Ctn Start Engine + + Ctn Wait For Engine To Be Ready ${start} 1 + + ${start} Ctn Get Round Current Date + # Let's wait for one "INSERT INTO data_bin" to appear in stats. + Log To Console Many service checks with 100 metrics each are processed. + FOR ${i} IN RANGE ${1000} + Ctn Process Service Check Result With Metrics host_1 service_${i+1} 1 warning${i} 100 + END + + Log To Console We wait for at least one metric to be written in the database. + # Let's wait for all force checks to be in the storage database. + Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort} + FOR ${i} IN RANGE ${500} + ${output} Query + ... 
SELECT COUNT(s.last_check) FROM metrics m LEFT JOIN index_data i ON m.index_id = i.id LEFT JOIN services s ON s.host_id = i.host_id AND s.service_id = i.service_id WHERE metric_name LIKE "metric_%%" AND s.last_check >= ${start} + IF ${output[0][0]} >= 1 BREAK + Sleep 1s + END + Disconnect From Database + + Log To Console Let's start some database manipulation... + Ctn Remove P2 From Data Bin + ${start} Get Current Date + + ${content} Create List errno= + FOR ${i} IN RANGE ${6} + ${result} Ctn Find In Log With Timeout ${centralLog} ${start} ${content} 10 + IF ${result} BREAK + END + + Log To Console Let's recreate the p2 partition... + Ctn Add P2 To Data Bin + + ${start} Ctn Get Round Current Date + Ctn Process Service Check Result With Metrics host_1 service_1 0 Last Output OK 100 + + Log To Console Let's wait for the last service check to be in the database... + Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort} + FOR ${i} IN RANGE ${120} + ${output} Query SELECT count(*) FROM data_bin WHERE ctime >= ${start} - 10 + Log To Console ${output} + IF ${output[0][0]} >= 100 BREAK + Sleep 1s + END + Log To Console ${output} + Should Be True ${output[0][0]} >= 100 + Disconnect From Database + + Ctn Init Data Bin Without Partition + *** Keywords *** Ctn Test Clean diff --git a/tests/broker-engine/services-increased.robot b/tests/broker-engine/services-increased.robot index d64909f265f..b6c2ad86e52 100644 --- a/tests/broker-engine/services-increased.robot +++ b/tests/broker-engine/services-increased.robot @@ -42,7 +42,7 @@ EBNSVC1 ${result} Ctn Check Number Of Resources Monitored By Poller Is ${3} ${nb_res} 30 Should Be True ${result} Poller 3 should monitor ${nb_srv} services and 16 hosts. END - Ctn Stop engine + Ctn Stop Engine Ctn Kindly Stop Broker Service_increased_huge_check_interval @@ -154,4 +154,4 @@ Service_increased_huge_check_interval ... 
rra[0].pdp_per_row must be equal to 5400 for metric ${m} END - [Teardown] Run Keywords Ctn Stop engine AND Ctn Kindly Stop Broker + [Teardown] Run Keywords Ctn Stop Engine AND Ctn Kindly Stop Broker diff --git a/tests/resources/Broker.py b/tests/resources/Broker.py index e23c0fd98a7..61acb9c12f2 100755 --- a/tests/resources/Broker.py +++ b/tests/resources/Broker.py @@ -1690,7 +1690,7 @@ def ctn_get_service_index(host_id: int, service_id: int, timeout: int = 60): my_id = [r['id'] for r in result] if len(my_id) > 0: logger.console( - f"Index data {id} found for service {host_id}:{service_id}") + f"Index data {id} found for service {host_id}:{service_id}") return my_id[0] time.sleep(2) logger.console(f"no index data found for service {host_id}:{service_id}") @@ -1780,7 +1780,6 @@ def ctn_compare_metrics_of_service(service_id: int, metrics: list, timeout: int return False - def ctn_get_not_existing_metrics(count: int): """ Return a list of metrics that does not exist. @@ -2082,14 +2081,16 @@ def ctn_get_indexes_to_rebuild(count: int, nb_day=180): dt = now.replace(hour=0, minute=0, second=0, microsecond=0) start = dt - datetime.timedelta(days=nb_day) start = int(start.timestamp()) - logger.console(f">>>>>>>>>> start = {datetime.datetime.fromtimestamp(start)}") + logger.console( + f">>>>>>>>>> start = {datetime.datetime.fromtimestamp(start)}") value = int(r['metric_id']) // 2 status_value = index_id % 3 cursor.execute("DELETE FROM data_bin WHERE id_metric={} AND ctime >= {}".format( r['metric_id'], start)) # We set the value to a constant on 180 days now = int(now.timestamp()) - logger.console(f">>>>>>>>>> end = {datetime.datetime.fromtimestamp(now)}") + logger.console( + f">>>>>>>>>> end = {datetime.datetime.fromtimestamp(now)}") for i in range(start, now, 60 * 5): if i == start: logger.console( @@ -2974,3 +2975,168 @@ def ctn_get_broker_log_info(port, log, timeout=TIMEOUT): except: logger.console("gRPC server not ready") return str(res) + + +def aes_encrypt(port, 
app_secret, salt, content, timeout: int = 30): + """ + Send a gRPC command to aes encrypt a content + + Args: + port (int): the port to the gRPC server. + app_secret (str): The APP_SECRET base64 encoded. + salt (str): Salt base64 encoded. + content (str): The content to encrypt. + + Returns: + The encrypted result string or an error message. + """ + limit = time.time() + timeout + encoded = "" + while time.time() < limit: + time.sleep(1) + with grpc.insecure_channel(f"127.0.0.1:{port}") as channel: + stub = broker_pb2_grpc.BrokerStub(channel) + te = broker_pb2.AesMessage() + te.app_secret = app_secret + te.salt = salt + te.content = content + try: + encoded = stub.Aes256Encrypt(te) + break + except grpc.RpcError as rpc_error: + return rpc_error.details() + except: + logger.console("gRPC server not ready") + + return encoded.str_arg + + +def aes_decrypt(port, app_secret, salt, content, timeout: int = 30): + """ + Send a gRPC command to aes decrypt a content + + Args: + port (int): the port to the gRPC server. + app_secret (str): The APP_SECRET base64 encoded. + salt (str): Salt base64 encoded. + content (str): The content to decrypt. + + Returns: + The decrypted result string or an error message. + """ + limit = time.time() + timeout + encoded = "" + while time.time() < limit: + time.sleep(1) + with grpc.insecure_channel(f"127.0.0.1:{port}") as channel: + stub = broker_pb2_grpc.BrokerStub(channel) + te = broker_pb2.AesMessage() + te.app_secret = app_secret + te.salt = salt + te.content = content + try: + encoded = stub.Aes256Decrypt(te) + break + except grpc.RpcError as rpc_error: + return rpc_error.details() + except: + logger.console("gRPC server not ready") + + return encoded.str_arg + + +def ctn_prepare_partitions_for_data_bin(): + """ + Create two partitions for the data_bin table. + The first one named p1 contains data with ctime older than now - 60. + The second one named p2 contains data with ctime older than now + 3600. 
+ connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + now = int(time.time()) + before = now - 60 + after = now + 3600 + with connection: + with connection.cursor() as cursor: + cursor.execute("DROP TABLE IF EXISTS data_bin") + sql = f"""CREATE TABLE `data_bin` ( + `id_metric` int(11) DEFAULT NULL, + `ctime` int(11) DEFAULT NULL, + `value` float DEFAULT NULL, + `status` enum('0','1','2','3','4') DEFAULT NULL, + KEY `index_metric` (`id_metric`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 + PARTITION BY RANGE (`ctime`) +(PARTITION `p1` VALUES LESS THAN ({before}) ENGINE = InnoDB, + PARTITION `p2` VALUES LESS THAN ({after}) ENGINE = InnoDB)""" + cursor.execute(sql) + connection.commit() + + +def ctn_remove_p2_from_data_bin(): + """ + Remove the partition p2 from the data_bin table. + """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + with connection: + with connection.cursor() as cursor: + cursor.execute("ALTER TABLE data_bin DROP PARTITION p2") + connection.commit() + + +def ctn_add_p2_to_data_bin(): + """ + Add the partition p2 to the data_bin table. + """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + after = int(time.time()) + 3600 + with connection: + with connection.cursor() as cursor: + cursor.execute( + f"ALTER TABLE data_bin ADD PARTITION (PARTITION p2 VALUES LESS THAN ({after}))") + connection.commit() + + +def ctn_init_data_bin_without_partition(): + """ + Recreate the data_bin table without partition. 
+ """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + now = int(time.time()) + before = now - 60 + after = now + 3600 + with connection: + with connection.cursor() as cursor: + cursor.execute("DROP TABLE IF EXISTS data_bin") + sql = f"""CREATE TABLE `data_bin` ( + `id_metric` int(11) DEFAULT NULL, + `ctime` int(11) DEFAULT NULL, + `value` float DEFAULT NULL, + `status` enum('0','1','2','3','4') DEFAULT NULL, + KEY `index_metric` (`id_metric`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1""" + cursor.execute(sql) + connection.commit() diff --git a/tests/resources/resources.resource b/tests/resources/resources.resource index d1143f9da95..bcf0b5de4b6 100644 --- a/tests/resources/resources.resource +++ b/tests/resources/resources.resource @@ -369,13 +369,15 @@ Ctn Dump Ba On Error Ctn Process Service Result Hard [Arguments] ${host} ${svc} ${state} ${output} - Repeat Keyword - ... 3 times - ... Ctn Process Service Check Result - ... ${host} - ... ${svc} - ... ${state} - ... ${output} + FOR ${idx} IN RANGE 3 + Ctn Process Service Check Result + ... ${host} + ... ${svc} + ... ${state} + ... ${output} + Sleep 1s + END + Ctn Wait For Engine To Be Ready [Arguments] ${start} ${nbEngine}=1 @@ -385,7 +387,7 @@ Ctn Wait For Engine To Be Ready ${result} Ctn Find In Log With Timeout ... ${ENGINE_LOG}/config${i}/centengine.log ... ${start} ${content} 60 - ... verbose=False + ... verbose=False Should Be True ... ${result} ... A message telling check_for_external_commands() should be available in config${i}/centengine.log.