From 9100aba0f8947711887ff2d1f5c68bfad2e738e8 Mon Sep 17 00:00:00 2001 From: Wen <113942165+wen-coding@users.noreply.github.com> Date: Mon, 7 Oct 2024 20:00:33 -0700 Subject: [PATCH] wen_restart: Add wen_restart_coordinator argument. (#2975) * wen_restart: Add wen_restart_coordinator argument. * rename LEADER_INDEX with COORDINATOR_INDEX --- core/src/validator.rs | 3 +++ local-cluster/src/validator_configs.rs | 1 + multinode-demo/bootstrap-validator.sh | 3 +++ multinode-demo/validator.sh | 3 +++ net/net.sh | 2 +- net/remote/remote-node.sh | 3 ++- validator/src/cli.rs | 16 ++++++++++++++++ validator/src/main.rs | 1 + wen-restart/src/wen_restart.rs | 13 ++++++++++++- 9 files changed, 42 insertions(+), 3 deletions(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index 3ea9593a5efbb1..c05a3fa8474357 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -282,6 +282,7 @@ pub struct ValidatorConfig { pub generator_config: Option, pub use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup, pub wen_restart_proto_path: Option, + pub wen_restart_coordinator: Option, pub unified_scheduler_handler_threads: Option, pub ip_echo_server_threads: NonZeroUsize, pub replay_forks_threads: NonZeroUsize, @@ -355,6 +356,7 @@ impl Default for ValidatorConfig { generator_config: None, use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup::default(), wen_restart_proto_path: None, + wen_restart_coordinator: None, unified_scheduler_handler_threads: None, ip_echo_server_threads: NonZeroUsize::new(1).expect("1 is non-zero"), replay_forks_threads: NonZeroUsize::new(1).expect("1 is non-zero"), @@ -1417,6 +1419,7 @@ impl Validator { info!("Waiting for wen_restart phase one to finish"); wait_for_wen_restart(WenRestartConfig { wen_restart_path: config.wen_restart_proto_path.clone().unwrap(), + wen_restart_coordinator: config.wen_restart_coordinator.unwrap(), last_vote, blockstore: blockstore.clone(), cluster_info: cluster_info.clone(), diff --git 
a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index bbcd1067851805..786d2e39e57aa4 100644 --- a/local-cluster/src/validator_configs.rs +++ b/local-cluster/src/validator_configs.rs @@ -68,6 +68,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { generator_config: config.generator_config.clone(), use_snapshot_archives_at_startup: config.use_snapshot_archives_at_startup, wen_restart_proto_path: config.wen_restart_proto_path.clone(), + wen_restart_coordinator: config.wen_restart_coordinator, unified_scheduler_handler_threads: config.unified_scheduler_handler_threads, ip_echo_server_threads: config.ip_echo_server_threads, replay_forks_threads: config.replay_forks_threads, diff --git a/multinode-demo/bootstrap-validator.sh b/multinode-demo/bootstrap-validator.sh index 471756254cb5db..d21ee1aaa8b73f 100755 --- a/multinode-demo/bootstrap-validator.sh +++ b/multinode-demo/bootstrap-validator.sh @@ -115,6 +115,9 @@ while [[ -n $1 ]]; do elif [[ $1 == --wen-restart ]]; then args+=("$1" "$2") shift 2 + elif [[ $1 == --wen-restart-coordinator ]]; then + args+=("$1" "$2") + shift 2 else echo "Unknown argument: $1" $program --help diff --git a/multinode-demo/validator.sh b/multinode-demo/validator.sh index d4e081c8893858..c97812c6cbb910 100755 --- a/multinode-demo/validator.sh +++ b/multinode-demo/validator.sh @@ -185,6 +185,9 @@ while [[ -n $1 ]]; do elif [[ $1 == --wen-restart ]]; then args+=("$1" "$2") shift 2 + elif [[ $1 == --wen-restart-coordinator ]]; then + args+=("$1" "$2") + shift 2 elif [[ $1 = -h ]]; then usage "$@" else diff --git a/net/net.sh b/net/net.sh index 94fa429ace5086..3ef7430ebd54d6 100755 --- a/net/net.sh +++ b/net/net.sh @@ -146,7 +146,7 @@ Operate a configured testnet -i [ip address] - IP Address of the node to start or stop startnode specific option: - --wen-restart [proto_file] - Use given proto file (create if non-exist) and apply wen_restat + --wen-restart [coordinator_pubkey] - Use given 
coordinator pubkey and apply wen_restart startclients-specific options: $CLIENT_OPTIONS diff --git a/net/remote/remote-node.sh b/net/remote/remote-node.sh index fe3f6a1d38dbca..edd21ba73145b4 100755 --- a/net/remote/remote-node.sh +++ b/net/remote/remote-node.sh @@ -436,7 +436,8 @@ EOF fi if [[ -n "$maybeWenRestart" ]]; then - args+=(--wen-restart "$maybeWenRestart") + args+=(--wen-restart wen_restart.proto3) + args+=(--wen-restart-coordinator "$maybeWenRestart") fi cat >> ~/solana/on-reboot <(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .takes_value(true) .required(false) .conflicts_with("wait_for_supermajority") + .requires("wen_restart_coordinator") .help( "Only used during coordinated cluster restarts.\ \n\n\ + Need to also specify the coordinator's pubkey in --wen-restart-coordinator.\ + \n\n\ When specified, the validator will enter Wen Restart mode which \ pauses normal activity. Validators in this mode will gossip their last \ vote to reach consensus on a safe restart slot and repair all blocks \ @@ -1610,6 +1613,19 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { further debugging and watch the discord channel for instructions.", ), ) + .arg( + Arg::with_name("wen_restart_coordinator") + .long("wen-restart-coordinator") + .hidden(hidden_unless_forced()) + .value_name("PUBKEY") + .takes_value(true) + .required(false) + .requires("wen_restart") + .help( + "Specifies the pubkey of the coordinator used in wen restart. 
\ + May get stuck if the coordinator used is different from others.", + ), + ) .args(&thread_args(&default_args.thread_args)) .args(&get_deprecated_arguments()) .after_help("The default subcommand is run") diff --git a/validator/src/main.rs b/validator/src/main.rs index c01f6a1c2c2507..74ad2d0926eae2 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -1540,6 +1540,7 @@ pub fn main() { delay_leader_block_for_pending_fork: matches .is_present("delay_leader_block_for_pending_fork"), wen_restart_proto_path: value_t!(matches, "wen_restart", PathBuf).ok(), + wen_restart_coordinator: value_t!(matches, "wen_restart_coordinator", Pubkey).ok(), ..ValidatorConfig::default() }; diff --git a/wen-restart/src/wen_restart.rs b/wen-restart/src/wen_restart.rs index 924debb2adf226..c2e8cf5191b1c2 100644 --- a/wen-restart/src/wen_restart.rs +++ b/wen-restart/src/wen_restart.rs @@ -44,7 +44,7 @@ use { purge_all_bank_snapshots, }, }, - solana_sdk::{shred_version::compute_shred_version, timing::timestamp}, + solana_sdk::{pubkey::Pubkey, shred_version::compute_shred_version, timing::timestamp}, solana_timings::ExecuteTimings, solana_vote_program::vote_state::VoteTransaction, std::{ @@ -874,6 +874,7 @@ pub(crate) fn aggregate_restart_heaviest_fork( #[derive(Clone)] pub struct WenRestartConfig { pub wen_restart_path: PathBuf, + pub wen_restart_coordinator: Pubkey, pub last_vote: VoteTransaction, pub blockstore: Arc, pub cluster_info: Arc, @@ -1333,6 +1334,7 @@ mod tests { const TICKS_PER_SLOT: u64 = 2; const TOTAL_VALIDATOR_COUNT: u16 = 20; const MY_INDEX: usize = TOTAL_VALIDATOR_COUNT as usize - 1; + const COORDINATOR_INDEX: usize = 0; const WAIT_FOR_THREAD_TIMEOUT: u64 = 10_000; const WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = 80; const NON_CONFORMING_VALIDATOR_PERCENT: u64 = 5; @@ -1404,6 +1406,7 @@ mod tests { pub bank_forks: Arc>, pub last_voted_fork_slots: Vec, pub wen_restart_proto_path: PathBuf, + pub wen_restart_coordinator: Pubkey, pub last_blockhash: Hash, pub 
genesis_config_hash: Hash, } @@ -1439,6 +1442,9 @@ mod tests { .node_keypair .insecure_clone(), ); + let wen_restart_coordinator = validator_voting_keypairs[COORDINATOR_INDEX] + .node_keypair + .pubkey(); let cluster_info = Arc::new(ClusterInfo::new( { let mut contact_info = @@ -1500,6 +1506,7 @@ mod tests { bank_forks, last_voted_fork_slots, wen_restart_proto_path, + wen_restart_coordinator, last_blockhash, genesis_config_hash: genesis_config.hash(), } @@ -1556,6 +1563,7 @@ mod tests { let last_vote_slot: Slot = test_state.last_voted_fork_slots[0]; let wen_restart_config = WenRestartConfig { wen_restart_path: test_state.wen_restart_proto_path.clone(), + wen_restart_coordinator: test_state.wen_restart_coordinator, last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)), blockstore: test_state.blockstore.clone(), cluster_info: test_state.cluster_info.clone(), @@ -1623,6 +1631,7 @@ mod tests { let exit = Arc::new(AtomicBool::new(false)); let wen_restart_config = WenRestartConfig { wen_restart_path: test_state.wen_restart_proto_path.clone(), + wen_restart_coordinator: test_state.wen_restart_coordinator, last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)), blockstore: test_state.blockstore.clone(), cluster_info: test_state.cluster_info.clone(), @@ -1984,6 +1993,7 @@ mod tests { assert_eq!( wait_for_wen_restart(WenRestartConfig { wen_restart_path: test_state.wen_restart_proto_path, + wen_restart_coordinator: test_state.wen_restart_coordinator, last_vote: VoteTransaction::from(Vote::new( vec![new_root_slot], last_vote_bankhash @@ -3375,6 +3385,7 @@ mod tests { let last_vote_bankhash = Hash::new_unique(); let config = WenRestartConfig { wen_restart_path: test_state.wen_restart_proto_path.clone(), + wen_restart_coordinator: test_state.wen_restart_coordinator, last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)), blockstore: test_state.blockstore.clone(), cluster_info: 
test_state.cluster_info.clone(),