From 319184ce66d88b706d4ce4939b68e0c68a712b0e Mon Sep 17 00:00:00 2001 From: Piotr Dobrowolski Date: Sun, 14 Jul 2024 23:29:05 +0200 Subject: [PATCH] Rework device label, fix SATA discovery, per-device type specification Signed-off-by: Piotr Dobrowolski --- main.go | 79 +++++++++++++++++++++++++++++++----------------- readjson.go | 29 ++++++++++-------- smartctl.go | 30 ++++++------------ smartctl_test.go | 44 +++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 61 deletions(-) create mode 100644 smartctl_test.go diff --git a/main.go b/main.go index 07df636..5ddcdc1 100644 --- a/main.go +++ b/main.go @@ -35,9 +35,13 @@ import ( // Device type Device struct { - Name string `json:"name"` - Info_Name string `json:"info_name"` - Type string `json:"type"` + Name string + Type string + Label string +} + +func (d Device) String() string { + return d.Name + ";" + d.Type + " (" + d.Label + ")" } // SMARTctlManagerCollector implements the Collector interface. @@ -81,6 +85,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() { time.Sleep(*smartctlRescanInterval) level.Info(i.logger).Log("msg", "Rescanning for devices") devices := scanDevices(i.logger) + devices = buildDevicesFromFlag(devices) i.mutex.Lock() i.Devices = devices i.mutex.Unlock() @@ -97,8 +102,9 @@ var ( smartctlRescanInterval = kingpin.Flag("smartctl.rescan", "The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.", ).Default("10m").Duration() + smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool() smartctlDevices = kingpin.Flag("smartctl.device", - "The device to monitor (repeatable)", + "The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)", ).Strings() smartctlDeviceExclude = kingpin.Flag( "smartctl.device-exclude", @@ -108,8 +114,8 @@ var ( "smartctl.device-include", "Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)", ).Default("").String() - smartctlDeviceTypes = kingpin.Flag( - "smartctl.device-type", + smartctlScanDeviceTypes = kingpin.Flag( + "smartctl.scan-device-type", "Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)", ).Strings() smartctlFakeData = kingpin.Flag("smartctl.fake-data", @@ -125,15 +131,23 @@ func scanDevices(logger log.Logger) []Device { scanDevices := json.Get("devices").Array() var scanDeviceResult []Device for _, d := range scanDevices { - deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String())) - if filter.ignored(deviceName) { - level.Info(logger).Log("msg", "Ignoring device", "name", deviceName) + deviceName := d.Get("name").String() + deviceType := d.Get("type").String() + + // SATA devices are reported as SCSI during scan - fallback to auto scraping + if deviceType == "scsi" { + deviceType = "auto" + } + + deviceLabel := buildDeviceLabel(deviceName, deviceType) + if filter.ignored(deviceLabel) { + level.Info(logger).Log("msg", "Ignoring device", "name", deviceLabel) } else { - level.Info(logger).Log("msg", "Found device", "name", deviceName) + level.Info(logger).Log("msg", "Found device", "name", deviceLabel) device := Device{ - Name: d.Get("name").String(), - Info_Name: deviceName, - Type: d.Get("type").String(), + Name: deviceName, + Type: deviceType, + Label: deviceLabel, } scanDeviceResult = append(scanDeviceResult, device) } @@ -141,18 +155,21 @@ func scanDevices(logger log.Logger) []Device { return scanDeviceResult } -func filterDevices(logger log.Logger, devices []Device, filters []string) []Device { - var filtered []Device - for _, d := range devices { - for _, filter := range filters { - level.Debug(logger).Log("msg", "filterDevices", "device", d.Info_Name, "filter", filter) - if strings.Contains(d.Info_Name, filter) { - filtered = append(filtered, d) - break - } +func buildDevicesFromFlag(devices []Device) []Device { + // TODO: deduplication? + for _, device := range *smartctlDevices { + deviceName, deviceType, _ := strings.Cut(device, ";") + if deviceType == "" { + deviceType = "auto" } + + devices = append(devices, Device{ + Name: deviceName, + Type: deviceType, + Label: buildDeviceLabel(deviceName, deviceType), + }) } - return filtered + return devices } func main() { @@ -172,11 +189,19 @@ func main() { level.Info(logger).Log("msg", "Build context", "build_context", version.BuildContext()) var devices []Device - devices = scanDevices(logger) - level.Info(logger).Log("msg", "Number of devices found", "count", len(devices)) + + if len(*smartctlDevices) == 0 { + *smartctlScan = true + } + + if *smartctlScan { + devices = scanDevices(logger) + level.Info(logger).Log("msg", "Number of devices found", "count", len(devices)) + } + if len(*smartctlDevices) > 0 { level.Info(logger).Log("msg", "Devices specified", "devices", strings.Join(*smartctlDevices, ", ")) - devices = filterDevices(logger, devices, *smartctlDevices) + devices = buildDevicesFromFlag(devices) level.Info(logger).Log("msg", "Devices filtered", "count", len(devices)) } @@ -185,7 +210,7 @@ func main() { logger: logger, } - if *smartctlRescanInterval >= 1*time.Second { + if *smartctlScan && *smartctlRescanInterval >= 1*time.Second { level.Info(logger).Log("msg", "Start background scan process") level.Info(logger).Log("msg", "Rescanning for devices every", "rescanInterval", *smartctlRescanInterval) go collector.RescanForDevices() diff --git a/readjson.go b/readjson.go index 4f4571d..760abcb 100644 --- a/readjson.go +++ b/readjson.go @@ -64,21 +64,24 @@ func readFakeSMARTctl(logger log.Logger, device Device) gjson.Result { // Get json from smartctl and parse it func readSMARTctl(logger log.Logger, device Device) (gjson.Result, bool) { start := time.Now() - out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output() + var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name} + + level.Debug(logger).Log("msg", "Calling smartctl with args", "args", strings.Join(smartctlArgs, " ")) + out, err := exec.Command(*smartctlPath, smartctlArgs...).Output() if err != nil { - level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device.Info_Name) + level.Warn(logger).Log("msg", "S.M.A.R.T. output reading", "err", err, "device", device) } json := parseJSON(string(out)) rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int()) jsonOk := jsonIsOk(logger, json) - level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start)) + level.Debug(logger).Log("msg", "Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start)) return json, rcOk && jsonOk } func readSMARTctlDevices(logger log.Logger) gjson.Result { level.Debug(logger).Log("msg", "Scanning for devices") var scanArgs []string = []string{"--json", "--scan"} - for _, d := range *smartctlDeviceTypes { + for _, d := range *smartctlScanDeviceTypes { scanArgs = append(scanArgs, "--device", d) } out, err := exec.Command(*smartctlPath, scanArgs...).Output() @@ -109,7 +112,7 @@ func readData(logger log.Logger, device Device) gjson.Result { jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()}) j, found := jsonCache.Load(device) if !found { - level.Warn(logger).Log("msg", "device not found", "device", device.Info_Name) + level.Warn(logger).Log("msg", "device not found", "device", device) } return j.(JSONCache).JSON } @@ -124,30 +127,30 @@ func resultCodeIsOk(logger log.Logger, device Device, SMARTCtlResult int64) bool if SMARTCtlResult > 0 { b := SMARTCtlResult if (b & 1) != 0 { - level.Error(logger).Log("msg", "Command line did not parse", "device", device.Info_Name) + level.Error(logger).Log("msg", "Command line did not parse", "device", device) result = false } if (b & (1 << 1)) != 0 { - level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name) + level.Error(logger).Log("msg", "Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device) result = false } if (b & (1 << 2)) != 0 { - level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name) + level.Warn(logger).Log("msg", "Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device) } if (b & (1 << 3)) != 0 { - level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device.Info_Name) + level.Warn(logger).Log("msg", "SMART status check returned 'DISK FAILING'", "device", device) } if (b & (1 << 4)) != 0 { - level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device.Info_Name) + level.Warn(logger).Log("msg", "We found prefail Attributes <= threshold", "device", device) } if (b & (1 << 5)) != 0 { - level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name) + level.Warn(logger).Log("msg", "SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device) } if (b & (1 << 6)) != 0 { - level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device.Info_Name) + level.Warn(logger).Log("msg", "The device error log contains records of errors", "device", device) } if (b & (1 << 7)) != 0 { - level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name) + level.Warn(logger).Log("msg", "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device) } } return result diff --git a/smartctl.go b/smartctl.go index 2a549d9..777dde2 100644 --- a/smartctl.go +++ b/smartctl.go @@ -43,28 +43,16 @@ type SMARTctl struct { device SMARTDevice } -func extractDiskName(input string) string { - re := regexp.MustCompile(`^(?:/dev/(?P\S+)/(?P\S+)\s\[|/dev/(?:disk\/by-id\/|disk\/by-path\/|)|\[)(?:\s\[|)(?P[A-Za-z0-9_\-]+)(?:\].*|)$`) - match := re.FindStringSubmatch(input) - - if len(match) > 0 { - busNameIndex := re.SubexpIndex("bus_name") - busNumIndex := re.SubexpIndex("bus_num") - diskIndex := re.SubexpIndex("disk") - var name []string - if busNameIndex != -1 && match[busNameIndex] != "" { - name = append(name, match[busNameIndex]) - } - if busNumIndex != -1 && match[busNumIndex] != "" { - name = append(name, match[busNumIndex]) - } - if diskIndex != -1 && match[diskIndex] != "" { - name = append(name, match[diskIndex]) - } +func buildDeviceLabel(inputName string, inputType string) string { + // Strip /dev prefix and replace / with _ (/dev/bus/0 becomes bus_0, /dev/disk/by-id/abcd becomes abcd) + devReg := regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`) + deviceName := strings.ReplaceAll(devReg.ReplaceAllString(inputName, ""), "/", "_") - return strings.Join(name, "_") + if strings.Contains(inputType, ",") { + return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_") } - return "" + + return deviceName } // NewSMARTctl is smartctl constructor @@ -85,7 +73,7 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr json: json, logger: logger, device: SMARTDevice{ - device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())), + device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()), serial: strings.TrimSpace(json.Get("serial_number").String()), family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")), model: strings.TrimSpace(model_name), diff --git a/smartctl_test.go b/smartctl_test.go new file mode 100644 index 0000000..8c9836c --- /dev/null +++ b/smartctl_test.go @@ -0,0 +1,44 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "testing" +) + +func TestBuildDeviceLabel(t *testing.T) { + tests := []struct { + deviceName string + deviceType string + expectedLabel string + }{ + {"/dev/bus/0", "megaraid,1", "bus_0_megaraid_1"}, + {"/dev/sda", "auto", "sda"}, + {"/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", "auto", "ata-CT500MX500SSD1_ABCDEFGHIJ"}, + // Some cases extracted from smartctl docs. Are these the prettiest? + // Probably not. Are they unique enough. Definitely. + {"/dev/sg1", "cciss,1", "sg1_cciss_1"}, + {"/dev/bsg/sssraid0", "sssraid,0,1", "bsg_sssraid0_sssraid_0_1"}, + {"/dev/cciss/c0d0", "cciss,0", "cciss_c0d0_cciss_0"}, + {"/dev/sdb", "aacraid,1,0,4", "sdb_aacraid_1_0_4"}, + {"/dev/twl0", "3ware,1", "twl0_3ware_1"}, + } + + for _, test := range tests { + result := buildDeviceLabel(test.deviceName, test.deviceType) + if result != test.expectedLabel { + t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", test.deviceName, test.deviceType, test.expectedLabel, result) + } + } +}