Skip to content

Commit

Permalink
pre stake and skip warmup (#3509)
Browse files Browse the repository at this point in the history
  • Loading branch information
bw-solana authored Nov 7, 2024
1 parent 00432e3 commit 54906e7
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 16 deletions.
32 changes: 18 additions & 14 deletions local-cluster/src/cluster_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use {
solana_ledger::blockstore::Blockstore,
solana_rpc_client::rpc_client::RpcClient,
solana_sdk::{
clock::{self, Slot, NUM_CONSECUTIVE_LEADER_SLOTS},
clock::{self, Slot},
commitment_config::CommitmentConfig,
epoch_schedule::MINIMUM_SLOTS_PER_EPOCH,
exit::Exit,
Expand Down Expand Up @@ -237,6 +237,8 @@ pub fn kill_entry_and_spend_and_verify_rest(
socket_addr_space: SocketAddrSpace,
) {
info!("kill_entry_and_spend_and_verify_rest...");

// Ensure all nodes have spun up and are funded.
let cluster_nodes = discover_cluster(
&entry_point_info.gossip().unwrap(),
nodes,
Expand All @@ -245,33 +247,29 @@ pub fn kill_entry_and_spend_and_verify_rest(
.unwrap();
assert!(cluster_nodes.len() >= nodes);
let client = new_tpu_quic_client(entry_point_info, connection_cache.clone()).unwrap();

// sleep long enough to make sure we are in epoch 3
let first_two_epoch_slots = MINIMUM_SLOTS_PER_EPOCH * (3 + 1);

for ingress_node in &cluster_nodes {
client
.rpc_client()
.poll_get_balance_with_commitment(ingress_node.pubkey(), CommitmentConfig::processed())
.unwrap_or_else(|err| panic!("Node {} has no balance: {}", ingress_node.pubkey(), err));
}

info!("sleeping for 2 leader fortnights");
sleep(Duration::from_millis(slot_millis * first_two_epoch_slots));
info!("done sleeping for first 2 warmup epochs");
// Kill the entry point node and wait for it to die.
info!("killing entry point: {}", entry_point_info.pubkey());
entry_point_validator_exit.write().unwrap().exit();
info!("sleeping for some time");
sleep(Duration::from_millis(
slot_millis * NUM_CONSECUTIVE_LEADER_SLOTS,
));
info!("done sleeping for 2 fortnights");
info!("sleeping for some time to let entry point exit and partitions to resolve...");
sleep(Duration::from_millis(slot_millis * MINIMUM_SLOTS_PER_EPOCH));
info!("done sleeping");

// Ensure all other nodes are still alive and able to ingest and confirm
// transactions.
for ingress_node in &cluster_nodes {
if ingress_node.pubkey() == entry_point_info.pubkey() {
info!("ingress_node.id == entry_point_info.id, continuing...");
continue;
}

// Ensure the current ingress node is still funded.
let client = new_tpu_quic_client(ingress_node, connection_cache.clone()).unwrap();
let balance = client
.rpc_client()
Expand All @@ -284,12 +282,16 @@ pub fn kill_entry_and_spend_and_verify_rest(

let mut result = Ok(());
let mut retries = 0;

// Retry sending a transaction to the current ingress node until it is
// observed by the entire cluster or we exhaust all retries.
loop {
retries += 1;
if retries > 5 {
result.unwrap();
}

// Send a simple transfer transaction to the current ingress node.
let random_keypair = Keypair::new();
let (blockhash, _) = client
.rpc_client()
Expand All @@ -301,7 +303,6 @@ pub fn kill_entry_and_spend_and_verify_rest(
1,
blockhash,
);

let confs = VOTE_THRESHOLD_DEPTH + 1;
let sig = {
let sig = LocalCluster::send_transaction_with_retries(
Expand All @@ -320,6 +321,9 @@ pub fn kill_entry_and_spend_and_verify_rest(
Ok(sig) => sig,
}
};

// Ensure all non-entry point nodes are able to confirm the
// transaction.
info!("poll_all_nodes_for_signature()");
match poll_all_nodes_for_signature(
entry_point_info,
Expand Down
23 changes: 21 additions & 2 deletions local-cluster/tests/local_cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4384,18 +4384,37 @@ fn test_cluster_partition_1_1_1() {
)
}

// Cluster needs a supermajority to remain, so the minimum size for this test is 4
#[test]
#[serial]
fn test_leader_failure_4() {
solana_logger::setup_with_default(RUST_LOG_FILTER);
error!("test_leader_failure_4");
// The cluster must retain a supermajority even after taking 1 node offline,
// so the minimum number of nodes for this test is 4.
let num_nodes = 4;
let validator_config = ValidatorConfig::default_for_test();
// Embed vote and stake account in genesis to avoid waiting for stake
// activation and race conditions around accepting gossip votes, repairing
// blocks, etc. before we advance through too many epochs.
let validator_keys: Option<Vec<(Arc<Keypair>, bool)>> = Some(
(0..num_nodes)
.map(|_| (Arc::new(Keypair::new()), true))
.collect(),
);
// Skip the warmup slots because their short epochs can cause problems when
// bringing multiple fresh validators online that are pre-staked in genesis.
// While those validators are still starting up, their leader slots are
// skipped and the cluster can experience partitioning. Because the short
// epochs have the same number of slots per epoch as the total tower height,
// any skipped slots can lead to not rooting, not generating the leader
// schedule in time, and stalling the cluster.
let skip_warmup_slots = true;
let mut config = ClusterConfig {
cluster_lamports: DEFAULT_CLUSTER_LAMPORTS,
node_stakes: vec![DEFAULT_NODE_STAKE; 4],
node_stakes: vec![DEFAULT_NODE_STAKE; num_nodes],
validator_configs: make_identical_validator_configs(&validator_config, num_nodes),
validator_keys,
skip_warmup_slots,
..ClusterConfig::default()
};
let local = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);
Expand Down

0 comments on commit 54906e7

Please sign in to comment.