diff --git a/include/libnuraft/raft_server.hxx b/include/libnuraft/raft_server.hxx index 3a1da833..65c34af8 100644 --- a/include/libnuraft/raft_server.hxx +++ b/include/libnuraft/raft_server.hxx @@ -1460,7 +1460,7 @@ protected: /** * Lock of handling client request and role change. */ - std::mutex cli_lock_; + std::recursive_mutex cli_lock_; /** * Condition variable to invoke BG commit thread. diff --git a/src/handle_client_request.cxx b/src/handle_client_request.cxx index a71bf974..e2164e7c 100644 --- a/src/handle_client_request.cxx +++ b/src/handle_client_request.cxx @@ -51,7 +51,7 @@ ptr raft_server::handle_cli_req_prelock(req_msg& req, case raft_params::dual_mutex: default: { // TODO: Use RW lock here. - auto_lock(cli_lock_); + recur_lock(cli_lock_); resp = handle_cli_req(req, ext_params, timestamp_us); break; } diff --git a/src/raft_server.cxx b/src/raft_server.cxx index ac6c6909..ddf70501 100644 --- a/src/raft_server.cxx +++ b/src/raft_server.cxx @@ -1030,7 +1030,7 @@ void raft_server::become_leader() { } ptr params = ctx_->get_params(); - { auto_lock(cli_lock_); + { recur_lock(cli_lock_); role_ = srv_role::leader; leader_ = id_; srv_to_join_.reset(); @@ -1397,7 +1397,7 @@ bool raft_server::request_leadership() { void raft_server::become_follower() { // stop hb for all peers p_in("[BECOME FOLLOWER] term %" PRIu64 "", state_->get_term()); - { std::lock_guard ll(cli_lock_); + { std::lock_guard ll(cli_lock_); for (peer_itor it = peers_.begin(); it != peers_.end(); ++it) { it->second->enable_hb(false); it->second->reset_stream(); @@ -1455,7 +1455,7 @@ bool raft_server::update_term(ulong term) { // // To avoid this issue, we acquire `cli_lock_`, // and change `role_` first before setting the term. - std::lock_guard ll(cli_lock_); + std::lock_guard ll(cli_lock_); role_ = srv_role::follower; state_->set_term(term); } @@ -1773,6 +1773,36 @@ ulong raft_server::store_log_entry(ptr& entry, ulong index) { } if ( role_ == srv_role::leader ) { + // WARNING: + // Configuration changes, such as adding or removing a member, + // can run concurrently with normal log appending operations. This + // concurrency can lead to an inversion issue between precommit and + // commit orders, particularly when there is only one member in the + // cluster. + // + // Let's consider the following scenario: T1 is the thread handling + // log appending, T2 is the thread processing configuration changes, + // and T3 is the commit thread. + // The initial precommit and commit index is 10. + // + // [T1] Acquires `cli_lock_` and enters `handle_cli_req()`. + // [T1] Appends a log at index 11 by calling `store_log_entry()`. + // [T2] Appends a log at index 12 by calling `store_log_entry()`. + // [T2] Calls `try_update_precommit_index()`, + // updating the precommit index to 12. + // [T2] Calls `request_append_entries()` and `commit()`, + // updating the commit index to 12. + // [T3] Calls `state_machine::commit()` for logs 11 and 12. + // [T1] Calls `state_machine::pre_commit()` for log 11. + // => order inversion happens here. + // + // To prevent this inversion, T2 should acquire the same `cli_lock_` + // before calling `try_update_precommit_index()`. This ensures that T2 + // cannot update the precommit index between T1's `store_log_entry()` + // and `state_machine::pre_commit()` calls, maintaining the correct + // order of operations. + recur_lock(cli_lock_); + // Need to progress precommit index for config. try_update_precommit_index(log_index); }