From 436f5dd690f6c37635c89b982c7931be03ae137f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 14 Dec 2024 11:39:37 -0800 Subject: [PATCH] gimlet-seq: Record why the power state changed Currently, when the Gimlet CPU sequencer changes the system's power state, no information about *why* the power state changed is recorded by the SP firmware. A system may power off or reboot for a variety of reasons: it may be requested by the host OS over IPCC, by the control plane over the management network, or triggered by the thermal task due to an overheat condition. This makes debugging an unexpected reboot or power off difficult, as the SP ringbuffers and other diagnostics do not indicate why an unexpected power state change occurred. See #1950 for a motivating example. This commit resolves this as described in #1950 by adding a new field to the `SetState` variant in the `drv-gimlet-seq-server` ringbuffer, so that the reason a power state change occurred can be recorded. Clients of the `cpu_seq` IPC API must now provide a `StateChangeReason` when calling `Sequencer.set_state`, along with the desired power state, and the sequencer task will record the provided reason in its ringbuffer. This way, we can distinguish between the various reasons a power state change may have occurred when debugging such issues. The `StateChangeReason` enum also generates counters, so that the total number of power state changes can be tracked. Fixes #1950 --- drv/cpu-seq-api/src/lib.rs | 17 ++++++++++++++ drv/gimlet-seq-server/src/main.rs | 23 ++++++++++++++----- idl/cpu-seq.idol | 6 ++++- .../src/mgs_compute_sled.rs | 5 +++- task/host-sp-comms/src/main.rs | 12 ++++++++-- task/thermal/src/bsp/gimlet_bcdef.rs | 5 ++-- 6 files changed, 56 insertions(+), 12 deletions(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index 8ab006e09..d95a8562b 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -9,6 +9,7 @@ use counters::Count; use derive_idol_err::IdolError; use userlib::{sys_send, FromPrimitive}; +use zerocopy::AsBytes; // Re-export PowerState for client convenience. pub use drv_cpu_power_state::PowerState; @@ -31,6 +32,22 @@ pub enum SeqError { ServerRestarted, } +#[derive(Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, AsBytes, Count)] +#[repr(u8)] +pub enum StateChangeReason { + /// TThe system has just received power, so the sequencer has booted the + /// host CPU. + InitialPowerOn = 1, + /// A power state change was requested by the control plane. + ControlPlane, + /// The host OS requested that the system power off without rebooting. + HostPowerOff, + /// The host OS requested that the system reboot. + HostReboot, + /// The system powered off because a component has overheated. + Overheat, +} + // On Gimlet, we have two banks of up to 8 DIMMs apiece. Export the "two banks" // bit of knowledge here so it can be used by gimlet-seq-server, spd, and // packrat, all of which want to know at compile-time how many banks there are. diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index 0c3776e96..cca31aaf4 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -17,7 +17,7 @@ use userlib::{ sys_set_timer, task_slot, units, RecvMessage, TaskId, UnwrapLite, }; -use drv_cpu_seq_api::{PowerState, SeqError}; +use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason}; use drv_hf_api as hf_api; use drv_i2c_api as i2c; use drv_ice40_spi_program as ice40; @@ -90,7 +90,12 @@ enum Trace { RailsOn, UartEnabled, A0(u16), - SetState(PowerState, PowerState, u64), + SetState( + PowerState, + PowerState, + #[count(children)] StateChangeReason, + u64, + ), UpdateState(#[count(children)] PowerState), ClockConfigWrite, ClockConfigSuccess, @@ -482,7 +487,10 @@ impl ServerImpl { // Power on, unless suppressed by the `stay-in-a2` feature if !cfg!(feature = "stay-in-a2") { - _ = server.set_state_internal(PowerState::A0); + _ = server.set_state_internal( + PowerState::A0, + StateChangeReason::InitialPowerOn, + ); } // @@ -666,11 +674,12 @@ impl ServerImpl { fn set_state_internal( &mut self, state: PowerState, + reason: StateChangeReason, ) -> Result<(), SeqError> { let sys = sys_api::Sys::from(SYS.get_task_id()); let now = sys_get_timer().now; - ringbuf_entry!(Trace::SetState(self.state, state, now)); + ringbuf_entry!(Trace::SetState(self.state, state, reason, now)); ringbuf_entry_v3p3_sys_a0_vout(); @@ -1032,8 +1041,10 @@ impl idl::InOrderSequencerImpl for ServerImpl { &mut self, _: &RecvMessage, state: PowerState, + reason: StateChangeReason, ) -> Result<(), RequestError> { - self.set_state_internal(state).map_err(RequestError::from) + self.set_state_internal(state, reason) + .map_err(RequestError::from) } fn send_hardware_nmi( @@ -1403,7 +1414,7 @@ cfg_if::cfg_if! { } mod idl { - use super::SeqError; + use super::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } diff --git a/idl/cpu-seq.idol b/idl/cpu-seq.idol index f345f36ed..ba1909788 100644 --- a/idl/cpu-seq.idol +++ b/idl/cpu-seq.idol @@ -18,7 +18,11 @@ Interface( "state": ( type: "drv_cpu_power_state::PowerState", recv: FromPrimitive("u8"), - ) + ), + "reason": ( + type: "StateChangeReason", + recv: FromPrimitive("u8"), + ), }, reply: Result( ok: "()", diff --git a/task/control-plane-agent/src/mgs_compute_sled.rs b/task/control-plane-agent/src/mgs_compute_sled.rs index d191cefcd..18d43f838 100644 --- a/task/control-plane-agent/src/mgs_compute_sled.rs +++ b/task/control-plane-agent/src/mgs_compute_sled.rs @@ -728,7 +728,10 @@ impl SpHandler for MgsHandler { }; self.sequencer - .set_state(power_state) + .set_state( + power_state, + drv_cpu_seq_api::StateChangeReason::ControlPlane, + ) .map_err(|e| SpError::PowerStateError(e as u32)) } diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index f0d39438c..78ac633d7 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -374,11 +374,16 @@ impl ServerImpl { // basically only ever succeed in our initial set_state() request, so I // don't know how we'd test it fn power_off_host(&mut self, reboot: bool) { + let reason = if reboot { + drv_cpu_seq_api::StateChangeReason::HostReboot + } else { + drv_cpu_seq_api::StateChangeReason::HostPowerOff + }; loop { // Attempt to move to A2; given we only call this function in // response to a host request, we expect we're currently in A0 and // this should work. - let err = match self.sequencer.set_state(PowerState::A2) { + let err = match self.sequencer.set_state(PowerState::A2, reason) { Ok(()) => { ringbuf_entry!(Trace::SetState { now: sys_get_timer().now, @@ -1420,7 +1425,10 @@ fn handle_reboot_waiting_in_a2_timer( now: sys_get_timer().now, state: PowerState::A0, }); - _ = sequencer.set_state(PowerState::A0); + _ = sequencer.set_state( + PowerState::A0, + drv_cpu_seq_api::StateChangeReason::HostReboot, + ); *reboot_state = None; } } diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index 220a2fe2d..92ba23341 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -12,7 +12,7 @@ use crate::{ i2c_config::{devices, sensors}, }; pub use drv_cpu_seq_api::SeqError; -use drv_cpu_seq_api::{PowerState, Sequencer}; +use drv_cpu_seq_api::{PowerState, Sequencer, StateChangeReason}; use task_sensor_api::SensorId; use task_thermal_api::ThermalProperties; use userlib::{task_slot, units::Celsius, TaskId, UnwrapLite}; @@ -109,7 +109,8 @@ impl Bsp { } pub fn power_down(&self) -> Result<(), SeqError> { - self.seq.set_state(PowerState::A2) + self.seq + .set_state(PowerState::A2, StateChangeReason::Overheat) } pub fn power_mode(&self) -> PowerBitmask {