From d16852862acbc7a142594140f689f27063157831 Mon Sep 17 00:00:00 2001 From: dmakarov Date: Tue, 15 Oct 2024 14:35:04 -0400 Subject: [PATCH] Aggressively shrink ancient storages when shrink isn't too busy. (#2946) * Tweak ancient packing algorithm * Minor change * Feedback * Remove redundancy * Correction * Revert correction * Loop * Add test * Fix clippy * Comments * Comment * Comments * Pop ancients * Revert * Checks * Move reverse * Typo * Popped * Sort * Format * Revert sort, back to reverse * Fix comment --- accounts-db/src/accounts_db.rs | 125 ++++++++++++++++++++++++- accounts-db/src/ancient_append_vecs.rs | 19 +++- 2 files changed, 141 insertions(+), 3 deletions(-) diff --git a/accounts-db/src/accounts_db.rs b/accounts-db/src/accounts_db.rs index 29582ab7e1ff7f..e2353b99fce81d 100644 --- a/accounts-db/src/accounts_db.rs +++ b/accounts-db/src/accounts_db.rs @@ -141,6 +141,11 @@ const MAX_ITEMS_PER_CHUNK: Slot = 2_500; // This allows us to split up accounts index accesses across multiple threads. const SHRINK_COLLECT_CHUNK_SIZE: usize = 50; +/// The number of shrink candidate slots that is small enough so that +/// additional storages from ancient slots can be added to the +/// candidates for shrinking. +const SHRINK_INSERT_ANCIENT_THRESHOLD: usize = 10; + #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] pub enum CreateAncientStorage { /// ancient storages are created by appending @@ -1501,6 +1506,14 @@ pub struct AccountsDb { /// Flag to indicate if the experimental accounts lattice hash is enabled. /// (For R&D only; a feature-gate also exists to turn this on and make it a part of consensus.) pub is_experimental_accumulator_hash_enabled: AtomicBool, + + /// These are the ancient storages that could be valuable to + /// shrink, sorted by amount of dead bytes. The elements + /// are popped from the end of the vector, hence the sorting is + /// expected to be from the smallest dead bytes to the largest. + /// Members are Slot and capacity. If capacity is smaller, then + /// that means the storage was already shrunk. + pub(crate) best_ancient_slots_to_shrink: RwLock>, } /// results from 'split_storages_ancient' @@ -1860,6 +1873,7 @@ impl AccountsDb { is_experimental_accumulator_hash_enabled: default_accounts_db_config .enable_experimental_accumulator_hash .into(), + best_ancient_slots_to_shrink: RwLock::default(), } } @@ -4401,8 +4415,12 @@ impl AccountsDb { let shrink_candidates_slots = std::mem::take(&mut *self.shrink_candidate_slots.lock().unwrap()); + self.shrink_stats + .initial_candidates_count + .store(shrink_candidates_slots.len() as u64, Ordering::Relaxed); + let candidates_count = shrink_candidates_slots.len(); - let ((shrink_slots, shrink_slots_next_batch), select_time_us) = measure_us!({ + let ((mut shrink_slots, shrink_slots_next_batch), select_time_us) = measure_us!({ if let AccountShrinkThreshold::TotalSpace { shrink_ratio } = self.shrink_ratio { let (shrink_slots, shrink_slots_next_batch) = self.select_candidates_by_total_usage(&shrink_candidates_slots, shrink_ratio); @@ -4423,6 +4441,30 @@ impl AccountsDb { } }); + // If there are too few slots to shrink, add an ancient slot + // for shrinking. The best ancient slots to shrink are + // assumed to be in reverse order. + if shrink_slots.len() < SHRINK_INSERT_ANCIENT_THRESHOLD { + let mut ancients = self.best_ancient_slots_to_shrink.write().unwrap(); + while let Some((slot, capacity)) = ancients.pop() { + if let Some(store) = self.storage.get_slot_storage_entry(slot) { + if !shrink_slots.contains(&slot) + && capacity == store.capacity() + && Self::is_candidate_for_shrink(self, &store) + { + let ancient_bytes_added_to_shrink = store.alive_bytes() as u64; + shrink_slots.insert(slot, store); + self.shrink_stats + .ancient_bytes_added_to_shrink + .fetch_add(ancient_bytes_added_to_shrink, Ordering::Relaxed); + self.shrink_stats + .ancient_slots_added_to_shrink + .fetch_add(1, Ordering::Relaxed); + break; + } + } + } + } if shrink_slots.is_empty() && shrink_slots_next_batch .as_ref() @@ -9252,7 +9294,9 @@ pub mod tests { accounts_hash::MERKLE_FANOUT, accounts_index::{tests::*, AccountSecondaryIndexesIncludeExclude}, ancient_append_vecs, - append_vec::{test_utils::TempFile, AppendVec, AppendVecStoredAccountMeta}, + append_vec::{ + aligned_stored_size, test_utils::TempFile, AppendVec, AppendVecStoredAccountMeta, + }, storable_accounts::AccountForStorage, }, assert_matches::assert_matches, @@ -12154,6 +12198,83 @@ pub mod tests { ); } + /// This test creates an ancient storage with three alive accounts + /// of various sizes. It then simulates killing one of the + /// accounts in a more recent (non-ancient) slot by overwriting + /// the account that has the smallest data size. The dead account + /// is expected to be deleted from its ancient storage in the + /// process of shrinking candidate slots. The capacity of the + /// storage after shrinking is expected to be the sum of alive + /// bytes of the two remaining alive ancient accounts. + #[test] + fn test_shrink_candidate_slots_with_dead_ancient_account() { + solana_logger::setup(); + let epoch_schedule = EpochSchedule::default(); + let num_ancient_slots = 3; + // Prepare 3 append vecs to combine [medium, big, small] + let account_data_sizes = vec![1000, 2000, 150]; + let (db, starting_ancient_slot) = + create_db_with_storages_and_index_with_customized_account_size_per_slot( + true, + num_ancient_slots, + account_data_sizes, + ); + db.add_root(starting_ancient_slot); + let slots_to_combine: Vec = + (starting_ancient_slot..starting_ancient_slot + num_ancient_slots as Slot).collect(); + db.combine_ancient_slots(slots_to_combine, CAN_RANDOMLY_SHRINK_FALSE); + let storage = db.get_storage_for_slot(starting_ancient_slot).unwrap(); + let ancient_accounts = db.get_unique_accounts_from_storage(&storage); + // Check that three accounts are indeed present in the combined storage. + assert_eq!(ancient_accounts.stored_accounts.len(), 3); + // Find an ancient account with smallest data length. + // This will be a dead account, overwritten in the current slot. + let modified_account_pubkey = ancient_accounts + .stored_accounts + .iter() + .min_by(|a, b| a.data_len.cmp(&b.data_len)) + .unwrap() + .pubkey; + let modified_account_owner = *AccountSharedData::default().owner(); + let modified_account = AccountSharedData::new(223, 0, &modified_account_owner); + let ancient_append_vec_offset = db.ancient_append_vec_offset.unwrap().abs(); + let current_slot = epoch_schedule.slots_per_epoch + ancient_append_vec_offset as u64 + 1; + // Simulate killing of the ancient account by overwriting it in the current slot. + db.store_for_tests( + current_slot, + &[(&modified_account_pubkey, &modified_account)], + ); + db.calculate_accounts_delta_hash(current_slot); + db.add_root_and_flush_write_cache(current_slot); + // This should remove the dead ancient account from the index. + db.clean_accounts_for_tests(); + db.shrink_ancient_slots(&epoch_schedule); + let storage = db.get_storage_for_slot(starting_ancient_slot).unwrap(); + let created_accounts = db.get_unique_accounts_from_storage(&storage); + // The dead account should still be in the ancient storage, + // because the storage wouldn't be shrunk with normal alive to + // capacity ratio. + assert_eq!(created_accounts.stored_accounts.len(), 3); + db.shrink_candidate_slots(&epoch_schedule); + let storage = db.get_storage_for_slot(starting_ancient_slot).unwrap(); + let created_accounts = db.get_unique_accounts_from_storage(&storage); + // At this point the dead ancient account should be removed + // and storage capacity shrunk to the sum of alive bytes of + // accounts it holds. This is the data lengths of the + // accounts plus the length of their metadata. + assert_eq!( + created_accounts.capacity as usize, + aligned_stored_size(1000) + aligned_stored_size(2000) + ); + // The above check works only when the AppendVec storage is + // used. More generally the pubkey of the smallest account + // shouldn't be present in the shrunk storage, which is + // validated by the following scan of the storage accounts. + storage.accounts.scan_pubkeys(|pubkey| { + assert_ne!(pubkey, &modified_account_pubkey); + }); + } + #[test] fn test_select_candidates_by_total_usage_no_candidates() { // no input candidates -- none should be selected diff --git a/accounts-db/src/ancient_append_vecs.rs b/accounts-db/src/ancient_append_vecs.rs index 9c788c8e668917..5d645c9560cc39 100644 --- a/accounts-db/src/ancient_append_vecs.rs +++ b/accounts-db/src/ancient_append_vecs.rs @@ -79,6 +79,9 @@ struct AncientSlotInfos { total_alive_bytes_shrink: Saturating, /// total alive bytes across all slots total_alive_bytes: Saturating, + /// slots that have dead accounts and thus the corresponding slot + /// storages can be shrunk + best_slots_to_shrink: Vec<(Slot, u64)>, } impl AncientSlotInfos { @@ -177,8 +180,13 @@ impl AncientSlotInfos { * tuning.percent_of_alive_shrunk_data / 100, ); + // At this point self.shrink_indexes have been sorted by the + // largest amount of dead bytes first in the corresponding + // storages. + self.best_slots_to_shrink = Vec::with_capacity(self.shrink_indexes.len()); for info_index in &self.shrink_indexes { let info = &mut self.all_infos[*info_index]; + self.best_slots_to_shrink.push((info.slot, info.capacity)); if bytes_to_shrink_due_to_ratio.0 >= threshold_bytes { // we exceeded the amount to shrink due to alive ratio, so don't shrink this one just due to 'should_shrink' // It MAY be shrunk based on total capacity still. @@ -188,6 +196,10 @@ impl AncientSlotInfos { bytes_to_shrink_due_to_ratio += info.alive_bytes; } } + // Reverse the vector so that the elements with the largest + // dead bytes are popped first when used to extend the + // shrinking candidates. + self.best_slots_to_shrink.reverse(); } /// after this function, only slots that were chosen to shrink are marked with @@ -396,7 +408,12 @@ impl AccountsDb { self.shrink_ancient_stats .slots_considered .fetch_add(sorted_slots.len() as u64, Ordering::Relaxed); - let ancient_slot_infos = self.collect_sort_filter_ancient_slots(sorted_slots, &tuning); + let mut ancient_slot_infos = self.collect_sort_filter_ancient_slots(sorted_slots, &tuning); + + std::mem::swap( + &mut *self.best_ancient_slots_to_shrink.write().unwrap(), + &mut ancient_slot_infos.best_slots_to_shrink, + ); if ancient_slot_infos.all_infos.is_empty() { return; // nothing to do