Skip to content

Commit

Permalink
Merge pull request #924 from drmingdrmer/44-fix-920
Browse files Browse the repository at this point in the history
Refactor: fix handling of non-voter leader
  • Loading branch information
drmingdrmer authored Nov 7, 2023
2 parents cf4ffe7 + f1819c2 commit 503fa49
Show file tree
Hide file tree
Showing 14 changed files with 220 additions and 39 deletions.
30 changes: 20 additions & 10 deletions openraft/src/docs/data/vote.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ The only difference between these two modes is the definition of `LeaderId`, and
See: [`leader-id`].


## Vote defines the server state
## Vote and Membership define the server state

In the default mode, the `Vote` defines the server state(leader, candidate, follower or learner).
A server state has a unique corresponding `vote`, thus `vote` can be used to identify different server
Expand All @@ -55,19 +55,29 @@ new membership log is replicated to a follower or learner.

E.g.:

- A vote `(term=1, node_id=2, committed=false)` is in a candidate state for
node-2.
- Node-2 with vote `(term=1, node_id=2, committed=true)`:
- is a leader if it is **present** in config, either a voter or non-voter.
- is a learner if it is **absent** in config.

- A vote `(term=1, node_id=2, committed=true)` is in a leader state for
node-2.
- Node-2 with vote `(term=1, node_id=2, committed=false)`:
- is a candidate if it is **present** in config, either a voter or non-voter.
- is a learner if it is **absent** in config.

- A vote `(term=1, node_id=2, committed=false|true)` is in a follower/learner
state for node-3.
- Node-3 with vote `(term=1, node_id=99, committed=false|true)`:
- is a follower if it is a **voter** in config,
- is a learner if it is a **non-voter** or **absent** in config.

For node-2:

| vote \ membership | Voter | Non-voter | Absent |
|---------------------------------------|-----------|-----------|---------|
| (term=1, node_id=2, committed=true) | leader | leader | learner |
| (term=1, node_id=2, committed=false) | candidate | candidate | learner |
| (term=1, node_id=99, committed=true) | follower | learner | learner |
| (term=1, node_id=99, committed=false) | follower | learner | learner |

- A vote `(term=1, node_id=1, committed=false|true)` is in another different
follower/learner state for node-3.


[`Vote`]: `crate::vote::Vote`
[`single-term-leader`]: `crate::docs::feature_flags`
[`leader-id`]: `crate::docs::data::leader_id`
[`leader-id`]: `crate::docs::data::leader_id`
6 changes: 1 addition & 5 deletions openraft/src/engine/engine_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ where C: RaftTypeConfig
}
}

// TODO: test it
#[tracing::instrument(level = "debug", skip_all)]
pub(crate) fn startup(&mut self) {
// Allows starting up as a leader.
Expand Down Expand Up @@ -517,10 +516,7 @@ where C: RaftTypeConfig

#[allow(clippy::collapsible_if)]
if em.log_id().as_ref() <= self.state.committed() {
if !em.is_voter(&self.config.id) && self.state.is_leading(&self.config.id) {
tracing::debug!("leader {} is stepping down", self.config.id);
self.vote_handler().become_following();
}
self.vote_handler().update_internal_server_state();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ fn m1_2() -> Membership<u64, ()> {
}

fn m23() -> Membership<u64, ()> {
Membership::<u64, ()>::new(vec![btreeset! {2,3}], None)
Membership::<u64, ()>::new(vec![btreeset! {2,3}], btreeset! {1,2,3})
}

fn eng() -> Engine<UTConfig> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ fn m01() -> Membership<u64, ()> {
}

fn m23() -> Membership<u64, ()> {
Membership::<u64, ()>::new(vec![btreeset! {2,3}], None)
Membership::<u64, ()>::new(vec![btreeset! {2,3}], btreeset! {1,2,3})
}

fn eng() -> Engine<UTConfig> {
Expand Down
2 changes: 1 addition & 1 deletion openraft/src/engine/handler/vote_handler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ where C: RaftTypeConfig

/// Enter leading or following state by checking `vote`.
pub(crate) fn update_internal_server_state(&mut self) {
if self.state.vote_ref().leader_id().voted_for() == Some(self.config.id) {
if self.state.is_leading(&self.config.id) {
self.become_leading();
} else {
self.become_following();
Expand Down
23 changes: 23 additions & 0 deletions openraft/src/engine/tests/startup_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ use crate::ServerState;
use crate::TokioInstant;
use crate::Vote;

fn m_empty() -> Membership<u64, ()> {
Membership::<u64, ()>::new(vec![btreeset! {}], None)
}

fn m23() -> Membership<u64, ()> {
Membership::<u64, ()>::new(vec![btreeset! {2,3}], None)
}
Expand Down Expand Up @@ -63,6 +67,25 @@ fn test_startup_as_leader() -> anyhow::Result<()> {
Ok(())
}

/// When starting up, a leader that is not a voter should not panic.
#[test]
fn test_startup_as_leader_not_voter_issue_920() -> anyhow::Result<()> {
let mut eng = eng();
// self.id==2 is a voter:
eng.state
.membership_state
.set_effective(Arc::new(EffectiveMembership::new(Some(log_id(2, 1, 3)), m_empty())));
// Committed vote makes it a leader at startup.
eng.state.vote = UTime::new(TokioInstant::now(), Vote::new_committed(1, 2));

eng.startup();

assert_eq!(ServerState::Learner, eng.state.server_state);
assert_eq!(eng.output.take_commands(), vec![]);

Ok(())
}

#[test]
fn test_startup_candidate_becomes_follower() -> anyhow::Result<()> {
let mut eng = eng();
Expand Down
5 changes: 5 additions & 0 deletions openraft/src/membership/membership.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@ where
N: Node,
NID: NodeId,
{
/// Return true if the given node id is an either voter or learner.
pub(crate) fn contains(&self, node_id: &NID) -> bool {
self.nodes.contains_key(node_id)
}

/// Check if the given `NodeId` exists and is a voter.
pub(crate) fn is_voter(&self, node_id: &NID) -> bool {
for c in self.configs.iter() {
Expand Down
3 changes: 1 addition & 2 deletions openraft/src/raft/message/client_write.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::fmt::Debug;

use crate::AppDataResponse;
use crate::LogId;
use crate::Membership;
use crate::MessageSummary;
Expand All @@ -10,7 +9,7 @@ use crate::RaftTypeConfig;
#[cfg_attr(
feature = "serde",
derive(serde::Deserialize, serde::Serialize),
serde(bound = "C::R: AppDataResponse")
serde(bound = "C::R: crate::AppDataResponse")
)]
pub struct ClientWriteResponse<C: RaftTypeConfig> {
/// The id of the log that is applied.
Expand Down
6 changes: 6 additions & 0 deletions openraft/src/raft_state/membership_state/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ where
Self { committed, effective }
}

/// Return true if the given node id is an either voter or learner.
pub(crate) fn contains(&self, id: &NID) -> bool {
self.effective.membership().contains(id)
}

/// Check if the given `NodeId` exists and is a voter.
pub(crate) fn is_voter(&self, id: &NID) -> bool {
self.effective.membership().is_voter(id)
}
Expand Down
45 changes: 34 additions & 11 deletions openraft/src/raft_state/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,24 +319,34 @@ where
}

/// Determine the current server state by state.
///
/// See [Determine Server State][] for more details about determining the server state.
///
/// [Determine Server State]: crate::docs::data::vote#vote-and-membership-define-the-server-state
#[tracing::instrument(level = "debug", skip_all)]
pub(crate) fn calc_server_state(&self, id: &NID) -> ServerState {
tracing::debug!(
is_member = display(self.is_voter(id)),
contains = display(self.membership_state.contains(id)),
is_voter = display(self.is_voter(id)),
is_leader = display(self.is_leader(id)),
is_leading = display(self.is_leading(id)),
"states"
);
if self.is_voter(id) {
if self.is_leader(id) {
ServerState::Leader
} else if self.is_leading(id) {
ServerState::Candidate
} else {

// Openraft does not require Leader/Candidate to be a voter, i.e., a learner node could
// also be possible to be a leader. Although currently it is not supported.
// Allowing this will simplify leader step down: The leader just run as long as it wants to,
#[allow(clippy::collapsible_else_if)]
if self.is_leader(id) {
ServerState::Leader
} else if self.is_leading(id) {
ServerState::Candidate
} else {
if self.is_voter(id) {
ServerState::Follower
} else {
ServerState::Learner
}
} else {
ServerState::Learner
}
}

Expand All @@ -346,12 +356,25 @@ where

/// The node is candidate(leadership is not granted by a quorum) or leader(leadership is granted
/// by a quorum)
///
/// Note that in Openraft Leader does not have to be a voter. See [Determine Server State][] for
/// more details about determining the server state.
///
/// [Determine Server State]: crate::docs::data::vote#vote-and-membership-define-the-server-state
pub(crate) fn is_leading(&self, id: &NID) -> bool {
self.vote.leader_id().voted_for().as_ref() == Some(id)
self.membership_state.contains(id) && self.vote.leader_id().voted_for().as_ref() == Some(id)
}

/// The node is leader
///
/// Leadership is granted by a quorum and the vote is committed.
///
/// Note that in Openraft Leader does not have to be a voter. See [Determine Server State][] for
/// more details about determining the server state.
///
/// [Determine Server State]: crate::docs::data::vote#vote-and-membership-define-the-server-state
pub(crate) fn is_leader(&self, id: &NID) -> bool {
self.vote.leader_id().voted_for().as_ref() == Some(id) && self.vote.is_committed()
self.is_leading(id) && self.vote.is_committed()
}

pub(crate) fn assign_log_ids<'a, Ent: RaftEntry<NID, N> + 'a>(
Expand Down
1 change: 1 addition & 0 deletions tests/tests/life_cycle/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ mod t50_follower_restart_does_not_interrupt;
mod t50_single_follower_restart;
mod t50_single_leader_restart_re_apply_logs;
mod t90_issue_607_single_restart;
mod t90_issue_920_non_voter_leader_restart;
41 changes: 41 additions & 0 deletions tests/tests/life_cycle/t90_issue_920_non_voter_leader_restart.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use std::sync::Arc;
use std::time::Duration;

use openraft::storage::RaftLogStorage;
use openraft::Config;
use openraft::ServerState;
use openraft::Vote;

use crate::fixtures::init_default_ut_tracing;
use crate::fixtures::RaftRouter;

/// Special case: A leader that is not a member(neither a voter or non-voter) should be started too,
/// as a learner.
#[async_entry::test(worker_threads = 8, init = "init_default_ut_tracing()", tracing_span = "debug")]
async fn issue_920_non_member_leader_restart() -> anyhow::Result<()> {
let config = Arc::new(
Config {
enable_heartbeat: false,
..Default::default()
}
.validate()?,
);

let mut router = RaftRouter::new(config.clone());

let (mut log_store, sm) = router.new_store();
// Set committed vote that believes node 0 is the leader.
log_store.save_vote(&Vote::new_committed(1, 0)).await?;
router.new_raft_node_with_sto(0, log_store, sm).await;

router
.wait(&0, timeout())
.state(ServerState::Learner, "node 0 becomes learner when startup")
.await?;

Ok(())
}

fn timeout() -> Option<Duration> {
Some(Duration::from_millis(1000))
}
27 changes: 21 additions & 6 deletions tests/tests/membership/t11_add_learner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,27 @@ async fn add_learner_non_blocking() -> Result<()> {
let raft = router.get_raft_handle(&0)?;
raft.add_learner(1, (), false).await?;

sleep(Duration::from_millis(500)).await;

let metrics = router.get_raft_handle(&0)?.metrics().borrow().clone();
let repl = metrics.replication.as_ref().unwrap();
let n1_repl = repl.get(&1);
assert_eq!(Some(&None), n1_repl, "no replication state to the learner is reported");
let n = 6;
for i in 0..=n {
if i == n {
unreachable!("no replication status is reported to metrics!");
}

let metrics = router.get_raft_handle(&0)?.metrics().borrow().clone();
let repl = metrics.replication.as_ref().unwrap();

// The result is Some(&None) when there is no success replication is made,
// and is None if no replication attempt is made(no success or failure is reported to metrics).
let n1_repl = repl.get(&1);
if n1_repl.is_none() {
tracing::info!("--- no replication attempt is made, sleep and retry: {}-th attempt", i);

sleep(Duration::from_millis(500)).await;
continue;
}
assert_eq!(Some(&None), n1_repl, "no replication state to the learner is reported");
break;
}
}

Ok(())
Expand Down
Loading

0 comments on commit 503fa49

Please sign in to comment.