pre stake and skip warmup #3509

Merged
32 changes: 18 additions & 14 deletions local-cluster/src/cluster_tests.rs
@@ -22,7 +22,7 @@ use {
solana_ledger::blockstore::Blockstore,
solana_rpc_client::rpc_client::RpcClient,
solana_sdk::{
clock::{self, Slot, NUM_CONSECUTIVE_LEADER_SLOTS},
clock::{self, Slot},
commitment_config::CommitmentConfig,
epoch_schedule::MINIMUM_SLOTS_PER_EPOCH,
exit::Exit,
@@ -237,6 +237,8 @@ pub fn kill_entry_and_spend_and_verify_rest(
socket_addr_space: SocketAddrSpace,
) {
info!("kill_entry_and_spend_and_verify_rest...");

// Ensure all nodes have spun up and are funded.
let cluster_nodes = discover_cluster(
&entry_point_info.gossip().unwrap(),
nodes,
@@ -245,33 +247,29 @@
.unwrap();
assert!(cluster_nodes.len() >= nodes);
let client = new_tpu_quic_client(entry_point_info, connection_cache.clone()).unwrap();

// sleep long enough to make sure we are in epoch 3
let first_two_epoch_slots = MINIMUM_SLOTS_PER_EPOCH * (3 + 1);

for ingress_node in &cluster_nodes {
client
.rpc_client()
.poll_get_balance_with_commitment(ingress_node.pubkey(), CommitmentConfig::processed())
.unwrap_or_else(|err| panic!("Node {} has no balance: {}", ingress_node.pubkey(), err));
}

info!("sleeping for 2 leader fortnights");
sleep(Duration::from_millis(slot_millis * first_two_epoch_slots));
info!("done sleeping for first 2 warmup epochs");
// Kill the entry point node and wait for it to die.
info!("killing entry point: {}", entry_point_info.pubkey());
entry_point_validator_exit.write().unwrap().exit();
info!("sleeping for some time");
sleep(Duration::from_millis(
slot_millis * NUM_CONSECUTIVE_LEADER_SLOTS,
));
info!("done sleeping for 2 fortnights");
info!("sleeping for some time to let entry point exit and partitions to resolve...");
sleep(Duration::from_millis(slot_millis * MINIMUM_SLOTS_PER_EPOCH));
info!("done sleeping");

// Ensure all other nodes are still alive and able to ingest and confirm
// transactions.
for ingress_node in &cluster_nodes {
if ingress_node.pubkey() == entry_point_info.pubkey() {
info!("ingress_node.id == entry_point_info.id, continuing...");
continue;
}

// Ensure the current ingress node is still funded.
let client = new_tpu_quic_client(ingress_node, connection_cache.clone()).unwrap();
let balance = client
.rpc_client()
@@ -284,12 +282,16 @@ pub fn kill_entry_and_spend_and_verify_rest(

let mut result = Ok(());
let mut retries = 0;

// Retry sending a transaction to the current ingress node until it is
// observed by the entire cluster or we exhaust all retries.
loop {
retries += 1;
if retries > 5 {
result.unwrap();
}

// Send a simple transfer transaction to the current ingress node.
let random_keypair = Keypair::new();
let (blockhash, _) = client
.rpc_client()
@@ -301,7 +303,6 @@
1,
blockhash,
);

let confs = VOTE_THRESHOLD_DEPTH + 1;
let sig = {
let sig = LocalCluster::send_transaction_with_retries(
@@ -320,6 +321,9 @@
Ok(sig) => sig,
}
};

// Ensure all non-entry point nodes are able to confirm the
// transaction.
info!("poll_all_nodes_for_signature()");
match poll_all_nodes_for_signature(
entry_point_info,
23 changes: 21 additions & 2 deletions local-cluster/tests/local_cluster.rs
@@ -4384,18 +4384,37 @@ fn test_cluster_partition_1_1_1() {
)
}

// Cluster needs a supermajority to remain, so the minimum size for this test is 4
#[test]
#[serial]
fn test_leader_failure_4() {
solana_logger::setup_with_default(RUST_LOG_FILTER);
error!("test_leader_failure_4");
// Cluster needs a supermajority to remain even after taking 1 node offline,
// so the minimum number of nodes for this test is 4.
let num_nodes = 4;
let validator_config = ValidatorConfig::default_for_test();
// Embed vote and stake accounts in genesis to avoid waiting for stake
// activation, and to avoid race conditions around accepting gossip votes,
// repairing blocks, etc. before we advance through too many epochs.
let validator_keys: Option<Vec<(Arc<Keypair>, bool)>> = Some(
(0..num_nodes)
.map(|_| (Arc::new(Keypair::new()), true))
.collect(),
);
// Skip the warmup slots because these short epochs can cause problems when
// bringing multiple fresh validators online that are pre-staked in genesis.
// The problems arise because we skip their leader slots while they're still
// starting up, experience partitioning, and can fail to generate leader
// schedules in time. Because the short epochs have the same number of slots
// per epoch as the total tower height, any skipped slots can lead to not
// rooting, not generating a leader schedule, and stalling the cluster.
let skip_warmup_slots = true;
let mut config = ClusterConfig {
cluster_lamports: DEFAULT_CLUSTER_LAMPORTS,
node_stakes: vec![DEFAULT_NODE_STAKE; 4],
node_stakes: vec![DEFAULT_NODE_STAKE; num_nodes],
validator_configs: make_identical_validator_configs(&validator_config, num_nodes),
validator_keys,
skip_warmup_slots,
..ClusterConfig::default()
};
let local = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified);
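
For context on the `skip_warmup_slots` flag used above: with warmup enabled, the epochs right after genesis are short and double in length each epoch until they reach the configured slots per epoch; with warmup disabled, every epoch is full length from the start, which is what lets this test avoid the short-epoch issues described in the comment. The sketch below is illustrative only and not part of this PR; it assumes the solana_sdk `EpochSchedule::custom` and `get_slots_in_epoch` APIs and simply prints the per-epoch slot counts for both settings.

use solana_sdk::epoch_schedule::{EpochSchedule, MINIMUM_SLOTS_PER_EPOCH};

fn main() {
    let slots_per_epoch = 8 * MINIMUM_SLOTS_PER_EPOCH;

    // warmup = true: epochs start at MINIMUM_SLOTS_PER_EPOCH and double in
    // length each epoch until they reach slots_per_epoch.
    let with_warmup = EpochSchedule::custom(slots_per_epoch, slots_per_epoch, true);
    // warmup = false: every epoch is slots_per_epoch long, starting at genesis.
    let without_warmup = EpochSchedule::custom(slots_per_epoch, slots_per_epoch, false);

    for epoch in 0..5 {
        println!(
            "epoch {}: with warmup = {} slots, without warmup = {} slots",
            epoch,
            with_warmup.get_slots_in_epoch(epoch),
            without_warmup.get_slots_in_epoch(epoch),
        );
    }
}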