Skip to content

Commit

Permalink
wen_restart: Add wen_restart_coordinator argument. (anza-xyz#2975)
Browse files Browse the repository at this point in the history
* wen_restart: Add wen_restart_coordinator argument.

* rename LEADER_INDEX with COORDINATOR_INDEX
  • Loading branch information
wen-coding authored and ray-kast committed Nov 27, 2024
1 parent 818fba1 commit 9100aba
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 3 deletions.
3 changes: 3 additions & 0 deletions core/src/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ pub struct ValidatorConfig {
pub generator_config: Option<GeneratorConfig>,
pub use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup,
pub wen_restart_proto_path: Option<PathBuf>,
pub wen_restart_coordinator: Option<Pubkey>,
pub unified_scheduler_handler_threads: Option<usize>,
pub ip_echo_server_threads: NonZeroUsize,
pub replay_forks_threads: NonZeroUsize,
Expand Down Expand Up @@ -355,6 +356,7 @@ impl Default for ValidatorConfig {
generator_config: None,
use_snapshot_archives_at_startup: UseSnapshotArchivesAtStartup::default(),
wen_restart_proto_path: None,
wen_restart_coordinator: None,
unified_scheduler_handler_threads: None,
ip_echo_server_threads: NonZeroUsize::new(1).expect("1 is non-zero"),
replay_forks_threads: NonZeroUsize::new(1).expect("1 is non-zero"),
Expand Down Expand Up @@ -1417,6 +1419,7 @@ impl Validator {
info!("Waiting for wen_restart phase one to finish");
wait_for_wen_restart(WenRestartConfig {
wen_restart_path: config.wen_restart_proto_path.clone().unwrap(),
wen_restart_coordinator: config.wen_restart_coordinator.unwrap(),
last_vote,
blockstore: blockstore.clone(),
cluster_info: cluster_info.clone(),
Expand Down
1 change: 1 addition & 0 deletions local-cluster/src/validator_configs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
generator_config: config.generator_config.clone(),
use_snapshot_archives_at_startup: config.use_snapshot_archives_at_startup,
wen_restart_proto_path: config.wen_restart_proto_path.clone(),
wen_restart_coordinator: config.wen_restart_coordinator,
unified_scheduler_handler_threads: config.unified_scheduler_handler_threads,
ip_echo_server_threads: config.ip_echo_server_threads,
replay_forks_threads: config.replay_forks_threads,
Expand Down
3 changes: 3 additions & 0 deletions multinode-demo/bootstrap-validator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ while [[ -n $1 ]]; do
elif [[ $1 == --wen-restart ]]; then
args+=("$1" "$2")
shift 2
elif [[ $1 == --wen-restart-coordinator ]]; then
args+=("$1" "$2")
shift 2
else
echo "Unknown argument: $1"
$program --help
Expand Down
3 changes: 3 additions & 0 deletions multinode-demo/validator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ while [[ -n $1 ]]; do
elif [[ $1 == --wen-restart ]]; then
args+=("$1" "$2")
shift 2
elif [[ $1 == --wen-restart-coordinator ]]; then
args+=("$1" "$2")
shift 2
elif [[ $1 = -h ]]; then
usage "$@"
else
Expand Down
2 changes: 1 addition & 1 deletion net/net.sh
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ Operate a configured testnet
-i [ip address] - IP Address of the node to start or stop
startnode specific option:
--wen-restart [proto_file] - Use given proto file (created if it does not exist) and apply wen_restart
--wen-restart [coordinator_pubkey] - Use given coordinator pubkey and apply wen_restart
startclients-specific options:
$CLIENT_OPTIONS
Expand Down
3 changes: 2 additions & 1 deletion net/remote/remote-node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,8 @@ EOF
fi

if [[ -n "$maybeWenRestart" ]]; then
args+=(--wen-restart "$maybeWenRestart")
args+=(--wen-restart wen_restart.proto3)
args+=(--wen-restart-coordinator "$maybeWenRestart")
fi

cat >> ~/solana/on-reboot <<EOF
Expand Down
16 changes: 16 additions & 0 deletions validator/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1588,9 +1588,12 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
.takes_value(true)
.required(false)
.conflicts_with("wait_for_supermajority")
.requires("wen_restart_coordinator")
.help(
"Only used during coordinated cluster restarts.\
\n\n\
Need to also specify the coordinator's pubkey in --wen-restart-coordinator.\
\n\n\
When specified, the validator will enter Wen Restart mode which \
pauses normal activity. Validators in this mode will gossip their last \
vote to reach consensus on a safe restart slot and repair all blocks \
Expand All @@ -1610,6 +1613,19 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
further debugging and watch the discord channel for instructions.",
),
)
// `--wen-restart-coordinator <PUBKEY>`: optional on its own, but declared with
// `requires("wen_restart")`, so it is rejected unless `--wen-restart` is also given.
// The help text uses "coordinator" to match the flag name.
.arg(
    Arg::with_name("wen_restart_coordinator")
        .long("wen-restart-coordinator")
        .hidden(hidden_unless_forced())
        .value_name("PUBKEY")
        .takes_value(true)
        .required(false)
        .requires("wen_restart")
        .help(
            "Specifies the pubkey of the coordinator used in wen restart. \
             May get stuck if the coordinator used is different from others.",
        ),
)
.args(&thread_args(&default_args.thread_args))
.args(&get_deprecated_arguments())
.after_help("The default subcommand is run")
Expand Down
1 change: 1 addition & 0 deletions validator/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,7 @@ pub fn main() {
delay_leader_block_for_pending_fork: matches
.is_present("delay_leader_block_for_pending_fork"),
wen_restart_proto_path: value_t!(matches, "wen_restart", PathBuf).ok(),
wen_restart_coordinator: value_t!(matches, "wen_restart_coordinator", Pubkey).ok(),
..ValidatorConfig::default()
};

Expand Down
13 changes: 12 additions & 1 deletion wen-restart/src/wen_restart.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ use {
purge_all_bank_snapshots,
},
},
solana_sdk::{shred_version::compute_shred_version, timing::timestamp},
solana_sdk::{pubkey::Pubkey, shred_version::compute_shred_version, timing::timestamp},
solana_timings::ExecuteTimings,
solana_vote_program::vote_state::VoteTransaction,
std::{
Expand Down Expand Up @@ -874,6 +874,7 @@ pub(crate) fn aggregate_restart_heaviest_fork(
#[derive(Clone)]
pub struct WenRestartConfig {
pub wen_restart_path: PathBuf,
pub wen_restart_coordinator: Pubkey,
pub last_vote: VoteTransaction,
pub blockstore: Arc<Blockstore>,
pub cluster_info: Arc<ClusterInfo>,
Expand Down Expand Up @@ -1333,6 +1334,7 @@ mod tests {
const TICKS_PER_SLOT: u64 = 2;
const TOTAL_VALIDATOR_COUNT: u16 = 20;
const MY_INDEX: usize = TOTAL_VALIDATOR_COUNT as usize - 1;
const COORDINATOR_INDEX: usize = 0;
const WAIT_FOR_THREAD_TIMEOUT: u64 = 10_000;
const WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = 80;
const NON_CONFORMING_VALIDATOR_PERCENT: u64 = 5;
Expand Down Expand Up @@ -1404,6 +1406,7 @@ mod tests {
pub bank_forks: Arc<RwLock<BankForks>>,
pub last_voted_fork_slots: Vec<Slot>,
pub wen_restart_proto_path: PathBuf,
pub wen_restart_coordinator: Pubkey,
pub last_blockhash: Hash,
pub genesis_config_hash: Hash,
}
Expand Down Expand Up @@ -1439,6 +1442,9 @@ mod tests {
.node_keypair
.insecure_clone(),
);
let wen_restart_coordinator = validator_voting_keypairs[COORDINATOR_INDEX]
.node_keypair
.pubkey();
let cluster_info = Arc::new(ClusterInfo::new(
{
let mut contact_info =
Expand Down Expand Up @@ -1500,6 +1506,7 @@ mod tests {
bank_forks,
last_voted_fork_slots,
wen_restart_proto_path,
wen_restart_coordinator,
last_blockhash,
genesis_config_hash: genesis_config.hash(),
}
Expand Down Expand Up @@ -1556,6 +1563,7 @@ mod tests {
let last_vote_slot: Slot = test_state.last_voted_fork_slots[0];
let wen_restart_config = WenRestartConfig {
wen_restart_path: test_state.wen_restart_proto_path.clone(),
wen_restart_coordinator: test_state.wen_restart_coordinator,
last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)),
blockstore: test_state.blockstore.clone(),
cluster_info: test_state.cluster_info.clone(),
Expand Down Expand Up @@ -1623,6 +1631,7 @@ mod tests {
let exit = Arc::new(AtomicBool::new(false));
let wen_restart_config = WenRestartConfig {
wen_restart_path: test_state.wen_restart_proto_path.clone(),
wen_restart_coordinator: test_state.wen_restart_coordinator,
last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)),
blockstore: test_state.blockstore.clone(),
cluster_info: test_state.cluster_info.clone(),
Expand Down Expand Up @@ -1984,6 +1993,7 @@ mod tests {
assert_eq!(
wait_for_wen_restart(WenRestartConfig {
wen_restart_path: test_state.wen_restart_proto_path,
wen_restart_coordinator: test_state.wen_restart_coordinator,
last_vote: VoteTransaction::from(Vote::new(
vec![new_root_slot],
last_vote_bankhash
Expand Down Expand Up @@ -3375,6 +3385,7 @@ mod tests {
let last_vote_bankhash = Hash::new_unique();
let config = WenRestartConfig {
wen_restart_path: test_state.wen_restart_proto_path.clone(),
wen_restart_coordinator: test_state.wen_restart_coordinator,
last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)),
blockstore: test_state.blockstore.clone(),
cluster_info: test_state.cluster_info.clone(),
Expand Down

0 comments on commit 9100aba

Please sign in to comment.