From e96fc32bb72b9c56c59b6d98f7a63fb12b39db48 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Nov 2024 11:27:27 +0100 Subject: [PATCH 1/2] Fix race when removing a node --- include/libnuraft/raft_server.hxx | 2 +- src/handle_client_request.cxx | 2 +- src/raft_server.cxx | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/libnuraft/raft_server.hxx b/include/libnuraft/raft_server.hxx index a21341d0..bc70a176 100644 --- a/include/libnuraft/raft_server.hxx +++ b/include/libnuraft/raft_server.hxx @@ -1459,7 +1459,7 @@ protected: /** * Lock of handling client request and role change. */ - std::mutex cli_lock_; + std::recursive_mutex cli_lock_; /** * Condition variable to invoke BG commit thread. diff --git a/src/handle_client_request.cxx b/src/handle_client_request.cxx index a71bf974..e2164e7c 100644 --- a/src/handle_client_request.cxx +++ b/src/handle_client_request.cxx @@ -51,7 +51,7 @@ ptr raft_server::handle_cli_req_prelock(req_msg& req, case raft_params::dual_mutex: default: { // TODO: Use RW lock here. - auto_lock(cli_lock_); + recur_lock(cli_lock_); resp = handle_cli_req(req, ext_params, timestamp_us); break; } diff --git a/src/raft_server.cxx b/src/raft_server.cxx index 15dacb4b..7d8efb9d 100644 --- a/src/raft_server.cxx +++ b/src/raft_server.cxx @@ -1025,7 +1025,7 @@ void raft_server::become_leader() { } ptr params = ctx_->get_params(); - { auto_lock(cli_lock_); + { recur_lock(cli_lock_); role_ = srv_role::leader; leader_ = id_; srv_to_join_.reset(); @@ -1389,7 +1389,7 @@ bool raft_server::request_leadership() { void raft_server::become_follower() { // stop hb for all peers p_in("[BECOME FOLLOWER] term %" PRIu64 "", state_->get_term()); - { std::lock_guard ll(cli_lock_); + { std::lock_guard ll(cli_lock_); for (peer_itor it = peers_.begin(); it != peers_.end(); ++it) { it->second->enable_hb(false); } @@ -1446,7 +1446,7 @@ bool raft_server::update_term(ulong term) { // // To avoid this issue, we acquire `cli_lock_`, // and change `role_` first before setting the term. - std::lock_guard ll(cli_lock_); + std::lock_guard ll(cli_lock_); role_ = srv_role::follower; state_->set_term(term); } @@ -1764,6 +1764,7 @@ ulong raft_server::store_log_entry(ptr& entry, ulong index) { } if ( role_ == srv_role::leader ) { + recur_lock(cli_lock_); // Need to progress precommit index for config. try_update_precommit_index(log_index); } From bc1baa46dd9dbe68fdbf1ef270f0ad896fba29ac Mon Sep 17 00:00:00 2001 From: Jung-Sang Ahn Date: Tue, 3 Dec 2024 12:56:31 -0800 Subject: [PATCH 2/2] Add comment --- src/raft_server.cxx | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/raft_server.cxx b/src/raft_server.cxx index d72a5e1a..ddf70501 100644 --- a/src/raft_server.cxx +++ b/src/raft_server.cxx @@ -1773,7 +1773,36 @@ ulong raft_server::store_log_entry(ptr& entry, ulong index) { } if ( role_ == srv_role::leader ) { + // WARNING: + // Configuration changes, such as adding or removing a member, + // can run concurrently with normal log appending operations. This + // concurrency can lead to an inversion issue between precommit and + // commit orders, particularly when there is only one member in the + // cluster. + // + // Let's consider the following scenario: T1 is the thread handling + // log appending, T2 is the thread processing configuration changes, + // and T3 is the commit thread. + // The initial precommit and commit index is 10. + // + // [T1] Acquires `cli_lock_` and enters `handle_cli_req()`. + // [T1] Appends a log at index 11 by calling `store_log_entry()`. + // [T2] Appends a log at index 12 by calling `store_log_entry()`. + // [T2] Calls `try_update_precommit_index()`, + // updating the precommit index to 12. + // [T2] Calls `request_append_entries()` and `commit()`, + // updating the commit index to 12. + // [T3] Calls `state_machine::commit()` for logs 11 and 12. + // [T1] Calls `state_machine::pre_commit()` for log 11. + // => order inversion happens here. + // + // To prevent this inversion, T2 should acquire the same `cli_lock_` + // before calling `try_update_precommit_index()`. This ensures that T2 + // cannot update the precommit index between T1's `store_log_entry()` + // and `state_machine::pre_commit()` calls, maintaining the correct + // order of operations. recur_lock(cli_lock_); + // Need to progress precommit index for config. try_update_precommit_index(log_index); }