Skip to content

Commit

Permalink
feat: add eventTime
Browse files Browse the repository at this point in the history
Signed-off-by: happyfx <[email protected]>
  • Loading branch information
happyfx committed Dec 6, 2024
1 parent 59038de commit f4adc95
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 28 deletions.
79 changes: 55 additions & 24 deletions collector_sel_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ var (
selEventsCountByStateDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "count_by_state"),
"Current number of log entries in the SEL by state.",
[]string{"state"},
[]string{"state", "type"},
nil,
)
selEventsCountByNameDesc = prometheus.NewDesc(
Expand All @@ -45,10 +45,28 @@ var (
[]string{"name"},
nil,
)
selEventsLog = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "time"),
"Latest timestamp of custom log entries in the SEL by event.",
[]string{"name", "type", "state", "event"},
nil,
)
)

// SELEventsCollector is the collector for SEL (ipmi-sel) event metrics.
// It is an empty struct and holds no state of its own.
type SELEventsCollector struct{}

// stateCountKey is the composite map key used when counting SEL entries:
// one counter is kept per distinct (State, Type) pair.
type stateCountKey struct {
	State, Type string
}

// eventTimeKey is the composite map key used when tracking the latest
// timestamp of SEL entries: one timestamp is kept per distinct combination
// of State, Type, Name and Event.
type eventTimeKey struct {
	State, Type, Name, Event string
}

// Name returns SELEventsCollectorName, the identifier of this collector.
func (c SELEventsCollector) Name() CollectorName {
return SELEventsCollectorName
}
Expand Down Expand Up @@ -78,7 +96,8 @@ func (c SELEventsCollector) Collect(result freeipmi.Result, ch chan<- prometheus
return 0, err
}

selEventByStateCount := map[string]float64{}
selEventByStateCount := map[stateCountKey]float64{}
seleventTime := map[eventTimeKey]float64{}
selEventByNameCount := map[string]float64{}
selEventByNameTimestamp := map[string]float64{}

Expand All @@ -89,48 +108,60 @@ func (c SELEventsCollector) Collect(result freeipmi.Result, ch chan<- prometheus
}

for _, data := range events {
var newTimestamp float64 = 0
datetime := data.Date + " " + data.Time
t, err := time.Parse(SELDateTimeFormat, datetime)
// ignore errors with invalid date or time
// NOTE: in some cases ipmi-sel can return "PostInit" in Date and Time fields
// Example:
// $ ipmi-sel --comma-separated-output --output-event-state --interpret-oem-data --output-oem-event-strings
// ID,Date,Time,Name,Type,State,Event
// 3,PostInit,PostInit,Sensor #211,Memory,Warning,Correctable memory error ; Event Data3 = 34h
if err != nil {
logger.Debug("Failed to parse time", "target", targetName(target.host), "error", err)
} else {
newTimestamp = float64(t.Unix())
}
for _, metricConfig := range selEventConfigs {
match := metricConfig.Regex.FindStringSubmatch(data.Event)
if match != nil {
var newTimestamp float64 = 0
datetime := data.Date + " " + data.Time
t, err := time.Parse(SELDateTimeFormat, datetime)
// ignore errors with invalid date or time
// NOTE: in some cases ipmi-sel can return "PostInit" in Date and Time fields
// Example:
// $ ipmi-sel --comma-separated-output --output-event-state --interpret-oem-data --output-oem-event-strings
// ID,Date,Time,Name,Type,State,Event
// 3,PostInit,PostInit,Sensor #211,Memory,Warning,Correctable memory error ; Event Data3 = 34h
if err != nil {
logger.Debug("Failed to parse time", "target", targetName(target.host), "error", err)
} else {
newTimestamp = float64(t.Unix())
}
// save latest timestamp by name metrics
if newTimestamp > selEventByNameTimestamp[metricConfig.Name] {
selEventByNameTimestamp[metricConfig.Name] = newTimestamp
}
// save count by name metrics
selEventByNameCount[metricConfig.Name]++
}
}
// save event metrics
stateeventTimeKey := eventTimeKey{State: data.State, Type: data.Type, Event: data.Event, Name: data.Name}
oldTimestamp, okLog := seleventTime[stateeventTimeKey]
if !okLog || oldTimestamp < newTimestamp {
seleventTime[stateeventTimeKey] = newTimestamp
}
// save count by state metrics
_, ok := selEventByStateCount[data.State]
stateCountKey := stateCountKey{State: data.State, Type: data.Type}
_, ok := selEventByStateCount[stateCountKey]
if !ok {
selEventByStateCount[data.State] = 0
selEventByStateCount[stateCountKey] = 0
}
selEventByStateCount[data.State]++
selEventByStateCount[stateCountKey]++
}

for state, value := range selEventByStateCount {
for stateCount, value := range selEventByStateCount {
ch <- prometheus.MustNewConstMetric(
selEventsCountByStateDesc,
prometheus.GaugeValue,
value,
state,
stateCount.State, stateCount.Type,
)
}
for eventTime, value := range seleventTime {
ch <- prometheus.MustNewConstMetric(
selEventsLog,
prometheus.GaugeValue,
value,
eventTime.Name, eventTime.Type, eventTime.State, eventTime.Event,
)
}

for name, value := range selEventByNameCount {
ch <- prometheus.MustNewConstMetric(
selEventsCountByNameDesc,
Expand Down
29 changes: 25 additions & 4 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,31 @@ last occurrence. Example:
ipmi_sel_events_count_by_name{name="my_custom_event_from_config"} 77
ipmi_sel_events_latest_timestamp{name="my_custom_event_from_config"} 1.703613275e+09

also next aggregated metrics will be exported:

ipmi_sel_events_count_by_state{state="Nominal"} 10
ipmi_sel_events_count_by_state{state="Warning"} 5
The following aggregated metrics are also exported:

ipmi_sel_events_count_by_state{type="Drive Slot",state="Critical"} 1
ipmi_sel_events_count_by_state{type="Event Logging Disabled",state="Nominal"} 1
ipmi_sel_events_count_by_state{type="Physical Security",state="Critical"} 2
ipmi_sel_events_count_by_state{type="Power Supply",state="Critical"} 18
ipmi_sel_events_count_by_state{type="Power Supply",state="Nominal"} 4
ipmi_sel_events_count_by_state{type="Power Supply",state="Warning"} 2
ipmi_sel_events_count_by_state{type="Voltage",state="Nominal"} 2
ipmi_sel_events_count_by_state{type="Voltage",state="Warning"} 4

The latest timestamp of each distinct log event is also exported:

ipmi_sel_events_time{event="Drive Fault ; OEM Event Data2 code = 01h ; OEM Event Data3 code = 03h",name="System Board Drive 0",state="Critical",type="Drive Slot"} 1.733305672e+09
ipmi_sel_events_time{event="Fully Redundant",name="System Board PS Redundancy",state="Nominal",type="Power Supply"} 1.727789882e+09
ipmi_sel_events_time{event="General Chassis Intrusion ; Intrusion while system Off",name="System Board Intrusion",state="Critical",type="Physical Security"} 1.722868317e+09
ipmi_sel_events_time{event="Log Area Reset/Cleared",name="SEL",state="Nominal",type="Event Logging Disabled"} 1.709312411e+09
ipmi_sel_events_time{event="Power Supply Failure detected",name="Power Supply 2 Status",state="Critical",type="Power Supply"} 1.731064451e+09
ipmi_sel_events_time{event="Power Supply Failure detected ; Fan Fault",name="Power Supply 2 Status",state="Critical",type="Power Supply"} 1.731064449e+09
ipmi_sel_events_time{event="Power Supply input lost (AC/DC)",name="Power Supply 2 Status",state="Critical",type="Power Supply"} 1.727789819e+09
ipmi_sel_events_time{event="Power Supply input lost (AC/DC)",name="Power Supply 2 Status",state="Warning",type="Power Supply"} 1.727789882e+09
ipmi_sel_events_time{event="Presence detected",name="Power Supply 2 Status",state="Nominal",type="Power Supply"} 1.727789864e+09
ipmi_sel_events_time{event="Redundancy Lost",name="System Board PS Redundancy",state="Critical",type="Power Supply"} 1.731064452e+09
ipmi_sel_events_time{event="State Asserted",name="System Board PS2 PG FAIL",state="Warning",type="Voltage"} 1.731064449e+09
ipmi_sel_events_time{event="State Deasserted",name="System Board PS2 PG FAIL",state="Nominal",type="Voltage"} 1.727789879e+09

## Supermicro LAN mode setting

Expand Down

0 comments on commit f4adc95

Please sign in to comment.