Skip to content

Commit

Permalink
gimlet-seq: Record why the power state changed
Browse files Browse the repository at this point in the history
Currently, when the Gimlet CPU sequencer changes the system's power
state, no information about *why* the power state changed is recorded by
the SP firmware. A system may power off or reboot for a variety of
reasons: it may be requested by the host OS over IPCC, by the control
plane over the management network, or triggered by the thermal task due
to an overheat condition. This makes debugging an unexpected reboot or
power off difficult, as the SP ringbuffers and other diagnostics do not
indicate why an unexpected power state change occurred. See #1950 for a
motivating example.

This commit resolves this as described in #1950 by adding a new field to
the `SetState` variant in the `drv-gimlet-seq-server` ringbuffer, so
that the reason a power state change occurred can be recorded. Clients
of the `cpu_seq` IPC API must now provide a `StateChangeReason` when
calling `Sequencer.set_state`, along with the desired power state, and
the sequencer task will record the provided reason in its ringbuffer.
This way, we can distinguish between the various reasons a power state
change may have occurred when debugging such issues.

The `StateChangeReason` enum also generates counters, so that the total
number of power state changes can be tracked.

Fixes #1950
  • Loading branch information
hawkw committed Dec 14, 2024
1 parent 2c8ac9b commit 436f5dd
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 12 deletions.
17 changes: 17 additions & 0 deletions drv/cpu-seq-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
use counters::Count;
use derive_idol_err::IdolError;
use userlib::{sys_send, FromPrimitive};
use zerocopy::AsBytes;

// Re-export PowerState for client convenience.
pub use drv_cpu_power_state::PowerState;
Expand All @@ -31,6 +32,22 @@ pub enum SeqError {
ServerRestarted,
}

#[derive(Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, AsBytes, Count)]
#[repr(u8)]
pub enum StateChangeReason {
/// TThe system has just received power, so the sequencer has booted the
/// host CPU.
InitialPowerOn = 1,
/// A power state change was requested by the control plane.
ControlPlane,
/// The host OS requested that the system power off without rebooting.
HostPowerOff,
/// The host OS requested that the system reboot.
HostReboot,
/// The system powered off because a component has overheated.
Overheat,
}

// On Gimlet, we have two banks of up to 8 DIMMs apiece. Export the "two banks"
// bit of knowledge here so it can be used by gimlet-seq-server, spd, and
// packrat, all of which want to know at compile-time how many banks there are.
Expand Down
23 changes: 17 additions & 6 deletions drv/gimlet-seq-server/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use userlib::{
sys_set_timer, task_slot, units, RecvMessage, TaskId, UnwrapLite,
};

use drv_cpu_seq_api::{PowerState, SeqError};
use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason};
use drv_hf_api as hf_api;
use drv_i2c_api as i2c;
use drv_ice40_spi_program as ice40;
Expand Down Expand Up @@ -90,7 +90,12 @@ enum Trace {
RailsOn,
UartEnabled,
A0(u16),
SetState(PowerState, PowerState, u64),
SetState(
PowerState,
PowerState,
#[count(children)] StateChangeReason,
u64,
),
UpdateState(#[count(children)] PowerState),
ClockConfigWrite,
ClockConfigSuccess,
Expand Down Expand Up @@ -482,7 +487,10 @@ impl<S: SpiServer + Clone> ServerImpl<S> {

// Power on, unless suppressed by the `stay-in-a2` feature
if !cfg!(feature = "stay-in-a2") {
_ = server.set_state_internal(PowerState::A0);
_ = server.set_state_internal(
PowerState::A0,
StateChangeReason::InitialPowerOn,
);
}

//
Expand Down Expand Up @@ -666,11 +674,12 @@ impl<S: SpiServer> ServerImpl<S> {
fn set_state_internal(
&mut self,
state: PowerState,
reason: StateChangeReason,
) -> Result<(), SeqError> {
let sys = sys_api::Sys::from(SYS.get_task_id());

let now = sys_get_timer().now;
ringbuf_entry!(Trace::SetState(self.state, state, now));
ringbuf_entry!(Trace::SetState(self.state, state, reason, now));

ringbuf_entry_v3p3_sys_a0_vout();

Expand Down Expand Up @@ -1032,8 +1041,10 @@ impl<S: SpiServer> idl::InOrderSequencerImpl for ServerImpl<S> {
&mut self,
_: &RecvMessage,
state: PowerState,
reason: StateChangeReason,
) -> Result<(), RequestError<SeqError>> {
self.set_state_internal(state).map_err(RequestError::from)
self.set_state_internal(state, reason)
.map_err(RequestError::from)
}

fn send_hardware_nmi(
Expand Down Expand Up @@ -1403,7 +1414,7 @@ cfg_if::cfg_if! {
}

mod idl {
use super::SeqError;
use super::{SeqError, StateChangeReason};

include!(concat!(env!("OUT_DIR"), "/server_stub.rs"));
}
Expand Down
6 changes: 5 additions & 1 deletion idl/cpu-seq.idol
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ Interface(
"state": (
type: "drv_cpu_power_state::PowerState",
recv: FromPrimitive("u8"),
)
),
"reason": (
type: "StateChangeReason",
recv: FromPrimitive("u8"),
),
},
reply: Result(
ok: "()",
Expand Down
5 changes: 4 additions & 1 deletion task/control-plane-agent/src/mgs_compute_sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,10 @@ impl SpHandler for MgsHandler {
};

self.sequencer
.set_state(power_state)
.set_state(
power_state,
drv_cpu_seq_api::StateChangeReason::ControlPlane,
)
.map_err(|e| SpError::PowerStateError(e as u32))
}

Expand Down
12 changes: 10 additions & 2 deletions task/host-sp-comms/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,16 @@ impl ServerImpl {
// basically only ever succeed in our initial set_state() request, so I
// don't know how we'd test it
fn power_off_host(&mut self, reboot: bool) {
let reason = if reboot {
drv_cpu_seq_api::StateChangeReason::HostReboot
} else {
drv_cpu_seq_api::StateChangeReason::HostPowerOff
};
loop {
// Attempt to move to A2; given we only call this function in
// response to a host request, we expect we're currently in A0 and
// this should work.
let err = match self.sequencer.set_state(PowerState::A2) {
let err = match self.sequencer.set_state(PowerState::A2, reason) {
Ok(()) => {
ringbuf_entry!(Trace::SetState {
now: sys_get_timer().now,
Expand Down Expand Up @@ -1420,7 +1425,10 @@ fn handle_reboot_waiting_in_a2_timer(
now: sys_get_timer().now,
state: PowerState::A0,
});
_ = sequencer.set_state(PowerState::A0);
_ = sequencer.set_state(
PowerState::A0,
drv_cpu_seq_api::StateChangeReason::HostReboot,
);
*reboot_state = None;
}
}
Expand Down
5 changes: 3 additions & 2 deletions task/thermal/src/bsp/gimlet_bcdef.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
i2c_config::{devices, sensors},
};
pub use drv_cpu_seq_api::SeqError;
use drv_cpu_seq_api::{PowerState, Sequencer};
use drv_cpu_seq_api::{PowerState, Sequencer, StateChangeReason};
use task_sensor_api::SensorId;
use task_thermal_api::ThermalProperties;
use userlib::{task_slot, units::Celsius, TaskId, UnwrapLite};
Expand Down Expand Up @@ -109,7 +109,8 @@ impl Bsp {
}

pub fn power_down(&self) -> Result<(), SeqError> {
self.seq.set_state(PowerState::A2)
self.seq
.set_state(PowerState::A2, StateChangeReason::Overheat)
}

pub fn power_mode(&self) -> PowerBitmask {
Expand Down

0 comments on commit 436f5dd

Please sign in to comment.