From 2107adcf358c0710cc7da3df2e10289b5cdb9e2c Mon Sep 17 00:00:00 2001
From: Alessandro Decina <alessandro.d@gmail.com>
Date: Thu, 20 Jun 2024 08:57:11 +0700
Subject: [PATCH] bpf_loader: use an explicit thread-local pool for stack and
 heap memory (#1370)

* Rename ComputeBudget::max_invoke_stack_height to max_instruction_stack_depth

The new name is consistent with the existing
ComputeBudget::max_instruction_trace_length.

Also expose compute_budget:MAX_INSTRUCTION_DEPTH.

* bpf_loader: use an explicit thread-local pool for stack and heap memory

Use a fixed thread-local pool to hold stack and heap memory. This
mitigates the long standing issue of jemalloc causing TLB shootdowns to
serve such frequent large allocations.

Because we need 1 stack and 1 heap region per instruction, and the
current max instruction nesting is hardcoded to 5, the pre-allocated
size is (MAX_STACK + MAX_HEAP) * 5 * NUM_THREADS. With the current
limits that's about 2.5MB per thread. Note that this is memory that
would eventually get allocated anyway, we're just pre-allocating it now.

* programs/sbf: add test for stack/heap zeroing

Add TEST_STACK_HEAP_ZEROED which tests that stack and heap regions are
zeroed across reuse from the memory pool.
---
 compute-budget/src/compute_budget.rs          |  22 ++-
 .../src/compute_budget_processor.rs           |  14 +-
 ledger-tool/src/program.rs                    |   2 +-
 program-runtime/src/invoke_context.rs         |   4 +-
 program-runtime/src/lib.rs                    |   1 +
 program-runtime/src/mem_pool.rs               | 146 ++++++++++++++++++
 programs/bpf_loader/src/lib.rs                |  51 +++---
 programs/sbf/benches/bpf_loader.rs            |   4 +-
 programs/sbf/rust/invoke/src/lib.rs           |  51 ++++++
 programs/sbf/rust/invoke_dep/src/lib.rs       |   1 +
 programs/sbf/tests/programs.rs                | 137 +++++++++++-----
 .../bank/builtins/core_bpf_migration/mod.rs   |   2 +-
 svm/src/account_loader.rs                     |   2 +-
 svm/src/transaction_processor.rs              |   2 +-
 svm/tests/conformance.rs                      |   2 +-
 15 files changed, 358 insertions(+), 83 deletions(-)
 create mode 100644 program-runtime/src/mem_pool.rs

diff --git a/compute-budget/src/compute_budget.rs b/compute-budget/src/compute_budget.rs
index 6b91affca4dd1c..e24f4b9f18a4ac 100644
--- a/compute-budget/src/compute_budget.rs
+++ b/compute-budget/src/compute_budget.rs
@@ -13,6 +13,16 @@ impl ::solana_frozen_abi::abi_example::AbiExample for ComputeBudget {
     }
 }
 
+/// Max instruction stack depth. This is the maximum nesting of instructions that can happen during
+/// a transaction.
+pub const MAX_INSTRUCTION_STACK_DEPTH: usize = 5;
+
+/// Max call depth. This is the maximum nesting of SBF to SBF call that can happen within a program.
+pub const MAX_CALL_DEPTH: usize = 64;
+
+/// The size of one SBF stack frame.
+pub const STACK_FRAME_SIZE: usize = 4096;
+
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct ComputeBudget {
     /// Number of compute units that a transaction or individual instruction is
@@ -26,11 +36,11 @@ pub struct ComputeBudget {
     /// Number of compute units consumed by an invoke call (not including the cost incurred by
     /// the called program)
     pub invoke_units: u64,
-    /// Maximum program instruction invocation stack height. Invocation stack
-    /// height starts at 1 for transaction instructions and the stack height is
+    /// Maximum program instruction invocation stack depth. Invocation stack
+    /// depth starts at 1 for transaction instructions and the stack depth is
     /// incremented each time a program invokes an instruction and decremented
     /// when a program returns.
-    pub max_invoke_stack_height: usize,
+    pub max_instruction_stack_depth: usize,
     /// Maximum cross-program invocation and instructions per transaction
     pub max_instruction_trace_length: usize,
     /// Base number of compute units consumed to call SHA256
@@ -133,13 +143,13 @@ impl ComputeBudget {
             log_64_units: 100,
             create_program_address_units: 1500,
             invoke_units: 1000,
-            max_invoke_stack_height: 5,
+            max_instruction_stack_depth: MAX_INSTRUCTION_STACK_DEPTH,
             max_instruction_trace_length: 64,
             sha256_base_cost: 85,
             sha256_byte_cost: 1,
             sha256_max_slices: 20_000,
-            max_call_depth: 64,
-            stack_frame_size: 4_096,
+            max_call_depth: MAX_CALL_DEPTH,
+            stack_frame_size: STACK_FRAME_SIZE,
             log_pubkey_units: 100,
             max_cpi_instruction_size: 1280, // IPv6 Min MTU size
             cpi_bytes_per_unit: 250,        // ~50MB at 200,000 units
diff --git a/compute-budget/src/compute_budget_processor.rs b/compute-budget/src/compute_budget_processor.rs
index dc568f82f169bb..af60bbe22cf254 100644
--- a/compute-budget/src/compute_budget_processor.rs
+++ b/compute-budget/src/compute_budget_processor.rs
@@ -3,7 +3,7 @@ use {
     solana_sdk::{
         borsh1::try_from_slice_unchecked,
         compute_budget::{self, ComputeBudgetInstruction},
-        entrypoint::HEAP_LENGTH as MIN_HEAP_FRAME_BYTES,
+        entrypoint::HEAP_LENGTH,
         fee::FeeBudgetLimits,
         instruction::{CompiledInstruction, InstructionError},
         pubkey::Pubkey,
@@ -11,12 +11,13 @@ use {
     },
 };
 
-const MAX_HEAP_FRAME_BYTES: u32 = 256 * 1024;
 /// Roughly 0.5us/page, where page is 32K; given roughly 15CU/us, the
 /// default heap page cost = 0.5 * 15 ~= 8CU/page
 pub const DEFAULT_HEAP_COST: u64 = 8;
 pub const DEFAULT_INSTRUCTION_COMPUTE_UNIT_LIMIT: u32 = 200_000;
 pub const MAX_COMPUTE_UNIT_LIMIT: u32 = 1_400_000;
+pub const MAX_HEAP_FRAME_BYTES: u32 = 256 * 1024;
+pub const MIN_HEAP_FRAME_BYTES: u32 = HEAP_LENGTH as u32;
 
 /// The total accounts data a transaction can load is limited to 64MiB to not break
 /// anyone in Mainnet-beta today. It can be set by set_loaded_accounts_data_size_limit instruction
@@ -33,7 +34,7 @@ pub struct ComputeBudgetLimits {
 impl Default for ComputeBudgetLimits {
     fn default() -> Self {
         ComputeBudgetLimits {
-            updated_heap_bytes: u32::try_from(MIN_HEAP_FRAME_BYTES).unwrap(),
+            updated_heap_bytes: MIN_HEAP_FRAME_BYTES,
             compute_unit_limit: MAX_COMPUTE_UNIT_LIMIT,
             compute_unit_price: 0,
             loaded_accounts_bytes: MAX_LOADED_ACCOUNTS_DATA_SIZE_BYTES,
@@ -122,7 +123,7 @@ pub fn process_compute_budget_instructions<'a>(
 
     // sanitize limits
     let updated_heap_bytes = requested_heap_size
-        .unwrap_or(u32::try_from(MIN_HEAP_FRAME_BYTES).unwrap()) // loader's default heap_size
+        .unwrap_or(MIN_HEAP_FRAME_BYTES) // loader's default heap_size
         .min(MAX_HEAP_FRAME_BYTES);
 
     let compute_unit_limit = updated_compute_unit_limit
@@ -147,8 +148,7 @@ pub fn process_compute_budget_instructions<'a>(
 }
 
 fn sanitize_requested_heap_size(bytes: u32) -> bool {
-    (u32::try_from(MIN_HEAP_FRAME_BYTES).unwrap()..=MAX_HEAP_FRAME_BYTES).contains(&bytes)
-        && bytes % 1024 == 0
+    (MIN_HEAP_FRAME_BYTES..=MAX_HEAP_FRAME_BYTES).contains(&bytes) && bytes % 1024 == 0
 }
 
 #[cfg(test)]
@@ -377,7 +377,7 @@ mod tests {
         test!(
             &[
                 Instruction::new_with_bincode(Pubkey::new_unique(), &0_u8, vec![]),
-                ComputeBudgetInstruction::request_heap_frame(MIN_HEAP_FRAME_BYTES as u32),
+                ComputeBudgetInstruction::request_heap_frame(MIN_HEAP_FRAME_BYTES),
                 ComputeBudgetInstruction::request_heap_frame(MAX_HEAP_FRAME_BYTES),
             ],
             Err(TransactionError::DuplicateInstruction(2))
diff --git a/ledger-tool/src/program.rs b/ledger-tool/src/program.rs
index 012def5fef1209..463d017b17dbed 100644
--- a/ledger-tool/src/program.rs
+++ b/ledger-tool/src/program.rs
@@ -557,7 +557,7 @@ pub fn program(ledger_path: &Path, matches: &ArgMatches<'_>) {
         account_lengths,
         &mut invoke_context,
     );
-    let mut vm = vm.unwrap();
+    let (mut vm, _, _) = vm.unwrap();
     let start_time = Instant::now();
     if matches.value_of("mode").unwrap() == "debugger" {
         vm.debug_port = Some(matches.value_of("port").unwrap().parse::<u16>().unwrap());
diff --git a/program-runtime/src/invoke_context.rs b/program-runtime/src/invoke_context.rs
index 393098b61b154e..05404aa51c2c98 100644
--- a/program-runtime/src/invoke_context.rs
+++ b/program-runtime/src/invoke_context.rs
@@ -693,7 +693,7 @@ macro_rules! with_mock_invoke_context {
         let mut $transaction_context = TransactionContext::new(
             $transaction_accounts,
             Rent::default(),
-            compute_budget.max_invoke_stack_height,
+            compute_budget.max_instruction_stack_depth,
             compute_budget.max_instruction_trace_length,
         );
         let mut sysvar_cache = SysvarCache::default();
@@ -940,7 +940,7 @@ mod tests {
     #[test]
     fn test_instruction_stack_height() {
         let one_more_than_max_depth = ComputeBudget::default()
-            .max_invoke_stack_height
+            .max_instruction_stack_depth
             .saturating_add(1);
         let mut invoke_stack = vec![];
         let mut transaction_accounts = vec![];
diff --git a/program-runtime/src/lib.rs b/program-runtime/src/lib.rs
index ef2832522db95b..3627569bc568ee 100644
--- a/program-runtime/src/lib.rs
+++ b/program-runtime/src/lib.rs
@@ -12,6 +12,7 @@ pub use solana_rbpf;
 pub mod invoke_context;
 pub mod loaded_programs;
 pub mod log_collector;
+pub mod mem_pool;
 pub mod stable_log;
 pub mod sysvar_cache;
 pub mod timings;
diff --git a/program-runtime/src/mem_pool.rs b/program-runtime/src/mem_pool.rs
new file mode 100644
index 00000000000000..398de5bfa64c57
--- /dev/null
+++ b/program-runtime/src/mem_pool.rs
@@ -0,0 +1,146 @@
+use {
+    solana_compute_budget::{
+        compute_budget::{MAX_CALL_DEPTH, MAX_INSTRUCTION_STACK_DEPTH, STACK_FRAME_SIZE},
+        compute_budget_processor::{MAX_HEAP_FRAME_BYTES, MIN_HEAP_FRAME_BYTES},
+    },
+    solana_rbpf::{aligned_memory::AlignedMemory, ebpf::HOST_ALIGN},
+    std::array,
+};
+
+trait Reset {
+    fn reset(&mut self);
+}
+
+struct Pool<T: Reset, const SIZE: usize> {
+    items: [Option<T>; SIZE],
+    next_empty: usize,
+}
+
+impl<T: Reset, const SIZE: usize> Pool<T, SIZE> {
+    fn new(items: [T; SIZE]) -> Self {
+        Self {
+            items: items.map(|i| Some(i)),
+            next_empty: SIZE,
+        }
+    }
+
+    fn len(&self) -> usize {
+        SIZE
+    }
+
+    fn get(&mut self) -> Option<T> {
+        if self.next_empty == 0 {
+            return None;
+        }
+        self.next_empty = self.next_empty.saturating_sub(1);
+        self.items
+            .get_mut(self.next_empty)
+            .and_then(|item| item.take())
+    }
+
+    fn put(&mut self, mut value: T) -> bool {
+        self.items
+            .get_mut(self.next_empty)
+            .map(|item| {
+                value.reset();
+                item.replace(value);
+                self.next_empty = self.next_empty.saturating_add(1);
+                true
+            })
+            .unwrap_or(false)
+    }
+}
+
+impl Reset for AlignedMemory<{ HOST_ALIGN }> {
+    fn reset(&mut self) {
+        self.as_slice_mut().fill(0)
+    }
+}
+
+pub struct VmMemoryPool {
+    stack: Pool<AlignedMemory<{ HOST_ALIGN }>, MAX_INSTRUCTION_STACK_DEPTH>,
+    heap: Pool<AlignedMemory<{ HOST_ALIGN }>, MAX_INSTRUCTION_STACK_DEPTH>,
+}
+
+impl VmMemoryPool {
+    pub fn new() -> Self {
+        Self {
+            stack: Pool::new(array::from_fn(|_| {
+                AlignedMemory::zero_filled(STACK_FRAME_SIZE * MAX_CALL_DEPTH)
+            })),
+            heap: Pool::new(array::from_fn(|_| {
+                AlignedMemory::zero_filled(MAX_HEAP_FRAME_BYTES as usize)
+            })),
+        }
+    }
+
+    pub fn stack_len(&self) -> usize {
+        self.stack.len()
+    }
+
+    pub fn heap_len(&self) -> usize {
+        self.heap.len()
+    }
+
+    pub fn get_stack(&mut self, size: usize) -> AlignedMemory<{ HOST_ALIGN }> {
+        debug_assert!(size == STACK_FRAME_SIZE * MAX_CALL_DEPTH);
+        self.stack
+            .get()
+            .unwrap_or_else(|| AlignedMemory::zero_filled(size))
+    }
+
+    pub fn put_stack(&mut self, stack: AlignedMemory<{ HOST_ALIGN }>) -> bool {
+        self.stack.put(stack)
+    }
+
+    pub fn get_heap(&mut self, heap_size: u32) -> AlignedMemory<{ HOST_ALIGN }> {
+        debug_assert!((MIN_HEAP_FRAME_BYTES..=MAX_HEAP_FRAME_BYTES).contains(&heap_size));
+        self.heap
+            .get()
+            .unwrap_or_else(|| AlignedMemory::zero_filled(MAX_HEAP_FRAME_BYTES as usize))
+    }
+
+    pub fn put_heap(&mut self, heap: AlignedMemory<{ HOST_ALIGN }>) -> bool {
+        let heap_size = heap.len();
+        debug_assert!(
+            heap_size >= MIN_HEAP_FRAME_BYTES as usize
+                && heap_size <= MAX_HEAP_FRAME_BYTES as usize
+        );
+        self.heap.put(heap)
+    }
+}
+
+impl Default for VmMemoryPool {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[derive(Debug, Eq, PartialEq)]
+    struct Item(u8, u8);
+    impl Reset for Item {
+        fn reset(&mut self) {
+            self.1 = 0;
+        }
+    }
+
+    #[test]
+    fn test_pool() {
+        let mut pool = Pool::<Item, 2>::new([Item(0, 1), Item(1, 1)]);
+        assert_eq!(pool.get(), Some(Item(1, 1)));
+        assert_eq!(pool.get(), Some(Item(0, 1)));
+        assert_eq!(pool.get(), None);
+        pool.put(Item(1, 1));
+        assert_eq!(pool.get(), Some(Item(1, 0)));
+        pool.put(Item(2, 2));
+        pool.put(Item(3, 3));
+        assert!(!pool.put(Item(4, 4)));
+        assert_eq!(pool.get(), Some(Item(3, 0)));
+        assert_eq!(pool.get(), Some(Item(2, 0)));
+        assert_eq!(pool.get(), None);
+    }
+}
diff --git a/programs/bpf_loader/src/lib.rs b/programs/bpf_loader/src/lib.rs
index 016fcf7ecd30d1..2c2b16245c88ce 100644
--- a/programs/bpf_loader/src/lib.rs
+++ b/programs/bpf_loader/src/lib.rs
@@ -5,6 +5,7 @@ pub mod serialization;
 pub mod syscalls;
 
 use {
+    solana_compute_budget::compute_budget::MAX_INSTRUCTION_STACK_DEPTH,
     solana_measure::measure::Measure,
     solana_program_runtime::{
         ic_logger_msg, ic_msg,
@@ -14,13 +15,13 @@ use {
             DELAY_VISIBILITY_SLOT_OFFSET,
         },
         log_collector::LogCollector,
+        mem_pool::VmMemoryPool,
         stable_log,
         sysvar_cache::get_sysvar_with_account_check,
     },
     solana_rbpf::{
-        aligned_memory::AlignedMemory,
         declare_builtin_function,
-        ebpf::{self, HOST_ALIGN, MM_HEAP_START},
+        ebpf::{self, MM_HEAP_START},
         elf::Executable,
         error::{EbpfError, ProgramResult},
         memory_region::{AccessType, MemoryCowCallback, MemoryMapping, MemoryRegion},
@@ -55,6 +56,10 @@ pub const DEFAULT_LOADER_COMPUTE_UNITS: u64 = 570;
 pub const DEPRECATED_LOADER_COMPUTE_UNITS: u64 = 1_140;
 pub const UPGRADEABLE_LOADER_COMPUTE_UNITS: u64 = 2_370;
 
+thread_local! {
+    pub static MEMORY_POOL: RefCell<VmMemoryPool> = RefCell::new(VmMemoryPool::new());
+}
+
 #[allow(clippy::too_many_arguments)]
 pub fn load_program_from_bytes(
     log_collector: Option<Rc<RefCell<LogCollector>>>,
@@ -240,8 +245,8 @@ pub fn create_vm<'a, 'b>(
     regions: Vec<MemoryRegion>,
     accounts_metadata: Vec<SerializedAccountMetadata>,
     invoke_context: &'a mut InvokeContext<'b>,
-    stack: &mut AlignedMemory<HOST_ALIGN>,
-    heap: &mut AlignedMemory<HOST_ALIGN>,
+    stack: &mut [u8],
+    heap: &mut [u8],
 ) -> Result<EbpfVm<'a, InvokeContext<'b>>, Box<dyn std::error::Error>> {
     let stack_size = stack.len();
     let heap_size = heap.len();
@@ -295,24 +300,23 @@ macro_rules! create_vm {
             heap_size,
             invoke_context.get_compute_budget().heap_cost,
         ));
-        let mut allocations = None;
         let $vm = heap_cost_result.and_then(|_| {
-            let mut stack = solana_rbpf::aligned_memory::AlignedMemory::<
-                { solana_rbpf::ebpf::HOST_ALIGN },
-            >::zero_filled(stack_size);
-            let mut heap = solana_rbpf::aligned_memory::AlignedMemory::<
-                { solana_rbpf::ebpf::HOST_ALIGN },
-            >::zero_filled(usize::try_from(heap_size).unwrap());
+            let (mut stack, mut heap) = $crate::MEMORY_POOL
+                .with_borrow_mut(|pool| (pool.get_stack(stack_size), pool.get_heap(heap_size)));
             let vm = $crate::create_vm(
                 $program,
                 $regions,
                 $accounts_metadata,
                 $invoke_context,
-                &mut stack,
-                &mut heap,
+                stack
+                    .as_slice_mut()
+                    .get_mut(..stack_size)
+                    .expect("invalid stack size"),
+                heap.as_slice_mut()
+                    .get_mut(..heap_size as usize)
+                    .expect("invalid heap size"),
             );
-            allocations = Some((stack, heap));
-            vm
+            vm.map(|vm| (vm, stack, heap))
         });
     };
 }
@@ -339,13 +343,14 @@ macro_rules! mock_create_vm {
             $accounts_metadata,
             $invoke_context,
         );
+        let $vm = $vm.map(|(vm, _, _)| vm);
     };
 }
 
 fn create_memory_mapping<'a, 'b, C: ContextObject>(
     executable: &'a Executable<C>,
-    stack: &'b mut AlignedMemory<{ HOST_ALIGN }>,
-    heap: &'b mut AlignedMemory<{ HOST_ALIGN }>,
+    stack: &'b mut [u8],
+    heap: &'b mut [u8],
     additional_regions: Vec<MemoryRegion>,
     cow_cb: Option<MemoryCowCallback>,
 ) -> Result<MemoryMapping<'a>, Box<dyn std::error::Error>> {
@@ -354,7 +359,7 @@ fn create_memory_mapping<'a, 'b, C: ContextObject>(
     let regions: Vec<MemoryRegion> = vec![
         executable.get_ro_region(),
         MemoryRegion::new_writable_gapped(
-            stack.as_slice_mut(),
+            stack,
             ebpf::MM_STACK_START,
             if !sbpf_version.dynamic_stack_frames() && config.enable_stack_frame_gaps {
                 config.stack_frame_size as u64
@@ -362,7 +367,7 @@ fn create_memory_mapping<'a, 'b, C: ContextObject>(
                 0
             },
         ),
-        MemoryRegion::new_writable(heap.as_slice_mut(), MM_HEAP_START),
+        MemoryRegion::new_writable(heap, MM_HEAP_START),
     ]
     .into_iter()
     .chain(additional_regions)
@@ -1387,7 +1392,7 @@ fn execute<'a, 'b: 'a>(
     let execution_result = {
         let compute_meter_prev = invoke_context.get_remaining();
         create_vm!(vm, executable, regions, accounts_metadata, invoke_context);
-        let mut vm = match vm {
+        let (mut vm, stack, heap) = match vm {
             Ok(info) => info,
             Err(e) => {
                 ic_logger_msg!(log_collector, "Failed to create SBF VM: {}", e);
@@ -1398,6 +1403,12 @@ fn execute<'a, 'b: 'a>(
 
         vm.context_object_pointer.execute_time = Some(Measure::start("execute"));
         let (compute_units_consumed, result) = vm.execute_program(executable, !use_jit);
+        MEMORY_POOL.with_borrow_mut(|memory_pool| {
+            memory_pool.put_stack(stack);
+            memory_pool.put_heap(heap);
+            debug_assert!(memory_pool.stack_len() <= MAX_INSTRUCTION_STACK_DEPTH);
+            debug_assert!(memory_pool.heap_len() <= MAX_INSTRUCTION_STACK_DEPTH);
+        });
         drop(vm);
         if let Some(execute_time) = invoke_context.execute_time.as_mut() {
             execute_time.stop();
diff --git a/programs/sbf/benches/bpf_loader.rs b/programs/sbf/benches/bpf_loader.rs
index 489c5224f24ee1..ab5e950faab874 100644
--- a/programs/sbf/benches/bpf_loader.rs
+++ b/programs/sbf/benches/bpf_loader.rs
@@ -139,7 +139,7 @@ fn bench_program_alu(bencher: &mut Bencher) {
         vec![],
         &mut invoke_context,
     );
-    let mut vm = vm.unwrap();
+    let (mut vm, _, _) = vm.unwrap();
 
     println!("Interpreted:");
     vm.context_object_pointer
@@ -314,7 +314,7 @@ fn bench_instruction_count_tuner(_bencher: &mut Bencher) {
         account_lengths,
         &mut invoke_context,
     );
-    let mut vm = vm.unwrap();
+    let (mut vm, _, _) = vm.unwrap();
 
     let mut measure = Measure::start("tune");
     let (instructions, _result) = vm.execute_program(&executable, true);
diff --git a/programs/sbf/rust/invoke/src/lib.rs b/programs/sbf/rust/invoke/src/lib.rs
index 82461c2bc5595c..1f1eb97abf8281 100644
--- a/programs/sbf/rust/invoke/src/lib.rs
+++ b/programs/sbf/rust/invoke/src/lib.rs
@@ -1393,6 +1393,57 @@ fn process_instruction<'a>(
                 account.data.borrow_mut()[write_offset] ^= 0xe5;
             }
         }
+        TEST_STACK_HEAP_ZEROED => {
+            msg!("TEST_STACK_HEAP_ZEROED");
+            const MM_STACK_START: u64 = 0x200000000;
+            const MM_HEAP_START: u64 = 0x300000000;
+            const ZEROS: [u8; 256 * 1024] = [0; 256 * 1024];
+            const STACK_FRAME_SIZE: usize = 4096;
+            const MAX_CALL_DEPTH: usize = 64;
+
+            // Check that the heap is always zeroed.
+            //
+            // At this point the code up to here will have allocated some values on the heap. The
+            // bump allocator writes the current heap pointer to the start of the memory region. We
+            // read it to find the slice of unallocated memory and check that it's zeroed. We then
+            // fill this memory with a sentinel value, and in the next nested invocation check that
+            // it's been zeroed as expected.
+            let heap_len = usize::from_le_bytes(instruction_data[1..9].try_into().unwrap());
+            let heap = unsafe { slice::from_raw_parts_mut(MM_HEAP_START as *mut u8, heap_len) };
+            let pos = usize::from_le_bytes(heap[0..8].try_into().unwrap())
+                .saturating_sub(MM_HEAP_START as usize);
+            assert!(heap[8..pos] == ZEROS[8..pos], "heap not zeroed");
+            heap[8..pos].fill(42);
+
+            // Check that the stack is zeroed too.
+            //
+            // We don't know in which frame we are now, so we skip a few (10) frames at the start
+            // which might have been used by the current call stack. We check that the memory for
+            // the 10..MAX_CALL_DEPTH frames is zeroed. Then we write a sentinel value, and in the
+            // next nested invocation check that it's been zeroed.
+            let stack =
+                unsafe { slice::from_raw_parts_mut(MM_STACK_START as *mut u8, 0x100000000) };
+            for i in 10..MAX_CALL_DEPTH {
+                let stack = &mut stack[i * STACK_FRAME_SIZE..][..STACK_FRAME_SIZE];
+                assert!(stack == &ZEROS[..STACK_FRAME_SIZE], "stack not zeroed");
+                stack.fill(42);
+            }
+
+            // Recurse to check that the stack and heap are zeroed.
+            //
+            // We recurse until we go over max CPI depth and error out. Stack and heap allocations
+            // are reused across CPI, by going over max depth we ensure that it's impossible to get
+            // non-zeroed regions through execution.
+            invoke(
+                &create_instruction(
+                    *program_id,
+                    &[(program_id, false, false)],
+                    instruction_data.to_vec(),
+                ),
+                accounts,
+            )
+            .unwrap();
+        }
         _ => panic!("unexpected program data"),
     }
 
diff --git a/programs/sbf/rust/invoke_dep/src/lib.rs b/programs/sbf/rust/invoke_dep/src/lib.rs
index f33515f602bd8f..066e900b7f9d2e 100644
--- a/programs/sbf/rust/invoke_dep/src/lib.rs
+++ b/programs/sbf/rust/invoke_dep/src/lib.rs
@@ -40,6 +40,7 @@ pub const TEST_CPI_INVALID_DATA_POINTER: u8 = 37;
 pub const TEST_CPI_CHANGE_ACCOUNT_DATA_MEMORY_ALLOCATION: u8 = 38;
 pub const TEST_WRITE_ACCOUNT: u8 = 39;
 pub const TEST_CALLEE_ACCOUNT_UPDATES: u8 = 40;
+pub const TEST_STACK_HEAP_ZEROED: u8 = 41;
 
 pub const MINT_INDEX: usize = 0;
 pub const ARGUMENT_INDEX: usize = 1;
diff --git a/programs/sbf/tests/programs.rs b/programs/sbf/tests/programs.rs
index 9fdcee23aa4749..74fb71f39ecb7b 100644
--- a/programs/sbf/tests/programs.rs
+++ b/programs/sbf/tests/programs.rs
@@ -18,10 +18,15 @@ use {
         compute_budget_processor::process_compute_budget_instructions,
     },
     solana_ledger::token_balances::collect_token_balances,
-    solana_program_runtime::timings::ExecuteTimings,
+    solana_program_runtime::{invoke_context::mock_process_instruction, timings::ExecuteTimings},
     solana_rbpf::vm::ContextObject,
     solana_runtime::{
-        bank::TransactionBalancesSet,
+        bank::{Bank, TransactionBalancesSet},
+        bank_client::BankClient,
+        genesis_utils::{
+            bootstrap_validator_stake_lamports, create_genesis_config,
+            create_genesis_config_with_leader_ex, GenesisConfigInfo,
+        },
         loader_utils::{
             create_program, load_program_from_file, load_upgradeable_buffer,
             load_upgradeable_program, load_upgradeable_program_and_advance_slot,
@@ -32,60 +37,44 @@ use {
     solana_sbf_rust_realloc_dep::*,
     solana_sbf_rust_realloc_invoke_dep::*,
     solana_sdk::{
-        account::{ReadableAccount, WritableAccount},
+        account::{AccountSharedData, ReadableAccount, WritableAccount},
         account_utils::StateMut,
-        bpf_loader_upgradeable,
-        clock::MAX_PROCESSING_AGE,
+        bpf_loader, bpf_loader_deprecated, bpf_loader_upgradeable,
+        client::SyncClient,
+        clock::{UnixTimestamp, MAX_PROCESSING_AGE},
         compute_budget::ComputeBudgetInstruction,
         entrypoint::MAX_PERMITTED_DATA_INCREASE,
         feature_set::{self, FeatureSet},
         fee::FeeStructure,
-        message::{v0::LoadedAddresses, SanitizedMessage},
-        signature::keypair_from_seed,
+        fee_calculator::FeeRateGovernor,
+        genesis_config::ClusterType,
+        hash::Hash,
+        instruction::{AccountMeta, Instruction, InstructionError},
+        message::{v0::LoadedAddresses, Message, SanitizedMessage},
+        pubkey::Pubkey,
+        rent::Rent,
+        reserved_account_keys::ReservedAccountKeys,
+        signature::{keypair_from_seed, Keypair, Signer},
         stake,
         system_instruction::{self, MAX_PERMITTED_DATA_LENGTH},
+        system_program,
         sysvar::{self, clock},
-        transaction::VersionedTransaction,
+        transaction::{SanitizedTransaction, Transaction, TransactionError, VersionedTransaction},
     },
-    solana_svm::transaction_processor::ExecutionRecordingConfig,
-    solana_svm::transaction_results::{
-        InnerInstruction, TransactionExecutionDetails, TransactionExecutionResult,
-        TransactionResults,
+    solana_svm::{
+        transaction_processor::ExecutionRecordingConfig,
+        transaction_results::{
+            InnerInstruction, TransactionExecutionDetails, TransactionExecutionResult,
+            TransactionResults,
+        },
     },
     solana_transaction_status::{
         map_inner_instructions, ConfirmedTransactionWithStatusMeta, TransactionStatusMeta,
         TransactionWithStatusMeta, VersionedTransactionWithStatusMeta,
     },
-    std::collections::HashMap,
-};
-use {
-    solana_program_runtime::invoke_context::mock_process_instruction,
-    solana_runtime::{
-        bank::Bank,
-        bank_client::BankClient,
-        genesis_utils::{
-            bootstrap_validator_stake_lamports, create_genesis_config,
-            create_genesis_config_with_leader_ex, GenesisConfigInfo,
-        },
-    },
-    solana_sdk::{
-        account::AccountSharedData,
-        bpf_loader, bpf_loader_deprecated,
-        client::SyncClient,
-        clock::UnixTimestamp,
-        fee_calculator::FeeRateGovernor,
-        genesis_config::ClusterType,
-        hash::Hash,
-        instruction::{AccountMeta, Instruction, InstructionError},
-        message::Message,
-        pubkey::Pubkey,
-        rent::Rent,
-        reserved_account_keys::ReservedAccountKeys,
-        signature::{Keypair, Signer},
-        system_program,
-        transaction::{SanitizedTransaction, Transaction, TransactionError},
+    std::{
+        assert_eq, cell::RefCell, collections::HashMap, str::FromStr, sync::Arc, time::Duration,
     },
-    std::{assert_eq, cell::RefCell, str::FromStr, sync::Arc, time::Duration},
 };
 
 #[cfg(feature = "sbf_rust")]
@@ -4678,3 +4667,69 @@ fn test_update_callee_account() {
         });
     }
 }
+
+#[test]
+fn test_stack_heap_zeroed() {
+    solana_logger::setup();
+
+    let GenesisConfigInfo {
+        genesis_config,
+        mint_keypair,
+        ..
+    } = create_genesis_config(100_123_456_789);
+
+    let bank = Bank::new_for_tests(&genesis_config);
+
+    let (bank, bank_forks) = bank.wrap_with_bank_forks_for_tests();
+    let mut bank_client = BankClient::new_shared(bank);
+    let authority_keypair = Keypair::new();
+
+    let (bank, invoke_program_id) = load_upgradeable_program_and_advance_slot(
+        &mut bank_client,
+        bank_forks.as_ref(),
+        &mint_keypair,
+        &authority_keypair,
+        "solana_sbf_rust_invoke",
+    );
+
+    let account_keypair = Keypair::new();
+    let mint_pubkey = mint_keypair.pubkey();
+    let account_metas = vec![
+        AccountMeta::new(mint_pubkey, true),
+        AccountMeta::new(account_keypair.pubkey(), false),
+        AccountMeta::new_readonly(invoke_program_id, false),
+    ];
+
+    // Check multiple heap sizes. It's generally a good idea, and also it's needed to ensure that
+    // pooled heap and stack values are reused - and therefore zeroed - across executions.
+    for heap_len in [32usize * 1024, 64 * 1024, 128 * 1024, 256 * 1024] {
+        // TEST_STACK_HEAP_ZEROED will recursively check that stack and heap are zeroed until it
+        // reaches max CPI invoke depth. We make it fail at max depth so we're sure that there's no
+        // legit way to access non-zeroed stack and heap regions.
+        let mut instruction_data = vec![TEST_STACK_HEAP_ZEROED];
+        instruction_data.extend_from_slice(&heap_len.to_le_bytes());
+
+        let instruction = Instruction::new_with_bytes(
+            invoke_program_id,
+            &instruction_data,
+            account_metas.clone(),
+        );
+
+        let message = Message::new(
+            &[
+                ComputeBudgetInstruction::set_compute_unit_limit(1_400_000),
+                ComputeBudgetInstruction::request_heap_frame(heap_len as u32),
+                instruction,
+            ],
+            Some(&mint_pubkey),
+        );
+        let tx = Transaction::new(&[&mint_keypair], message.clone(), bank.last_blockhash());
+        let (result, _, logs) = process_transaction_and_record_inner(&bank, tx);
+        assert!(result.is_err(), "{result:?}");
+        assert!(
+            logs.iter()
+                .any(|log| log.contains("Cross-program invocation call depth too deep")),
+            "{logs:?}"
+        );
+    }
+}
diff --git a/runtime/src/bank/builtins/core_bpf_migration/mod.rs b/runtime/src/bank/builtins/core_bpf_migration/mod.rs
index 54f1a8eabdfdd0..871b173e5fa772 100644
--- a/runtime/src/bank/builtins/core_bpf_migration/mod.rs
+++ b/runtime/src/bank/builtins/core_bpf_migration/mod.rs
@@ -190,7 +190,7 @@ impl Bank {
             let mut dummy_transaction_context = TransactionContext::new(
                 vec![],
                 self.rent_collector.rent.clone(),
-                compute_budget.max_invoke_stack_height,
+                compute_budget.max_instruction_stack_depth,
                 compute_budget.max_instruction_trace_length,
             );
 
diff --git a/svm/src/account_loader.rs b/svm/src/account_loader.rs
index 8eb2f9fa3e81f3..84599f1d5afa88 100644
--- a/svm/src/account_loader.rs
+++ b/svm/src/account_loader.rs
@@ -1733,7 +1733,7 @@ mod tests {
         let transaction_context = TransactionContext::new(
             loaded_txs[0].as_ref().unwrap().accounts.clone(),
             Rent::default(),
-            compute_budget.max_invoke_stack_height,
+            compute_budget.max_instruction_stack_depth,
             compute_budget.max_instruction_trace_length,
         );
 
diff --git a/svm/src/transaction_processor.rs b/svm/src/transaction_processor.rs
index 10fe96632dafd3..166cda5bc903e6 100644
--- a/svm/src/transaction_processor.rs
+++ b/svm/src/transaction_processor.rs
@@ -745,7 +745,7 @@ impl<FG: ForkGraph> TransactionBatchProcessor<FG> {
         let mut transaction_context = TransactionContext::new(
             transaction_accounts,
             rent.clone(),
-            compute_budget.max_invoke_stack_height,
+            compute_budget.max_instruction_stack_depth,
             compute_budget.max_instruction_trace_length,
         );
         #[cfg(debug_assertions)]
diff --git a/svm/tests/conformance.rs b/svm/tests/conformance.rs
index ebd0578bf7a7af..c3b96e383bb746 100644
--- a/svm/tests/conformance.rs
+++ b/svm/tests/conformance.rs
@@ -413,7 +413,7 @@ fn execute_fixture_as_instr(
     let mut transaction_context = TransactionContext::new(
         transaction_accounts,
         rent,
-        compute_budget.max_invoke_stack_height,
+        compute_budget.max_instruction_stack_depth,
         compute_budget.max_instruction_trace_length,
     );