diff --git a/nvme_metrics.py b/nvme_metrics.py index bb6b1a3..c7dbaec 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -74,6 +74,9 @@ "Device error log entry count", ["device"], namespace=namespace, registry=registry, ), + # FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change + # will result in the metric having a "_info" suffix automatically appended, which is arguably + # a breaking change. "nvmecli": Gauge( "nvmecli", "nvme-cli tool information", @@ -142,7 +145,11 @@ def exec_nvme_json(*args): """ Execute nvme CLI tool with specified arguments and return parsed JSON output. """ - output = exec_nvme(*args, "--output-format", "json") + # Note: nvme-cli v2.11 effectively introduced a breaking change by forcing JSON output to always + # be verbose. Older versions of nvme-cli optionally produced verbose output if the --verbose + # flag was specified. In order to avoid having to handle two different JSON schemas, always + # add the --verbose flag. + output = exec_nvme(*args, "--output-format", "json", "--verbose") return json.loads(output) @@ -157,49 +164,70 @@ def main(): device_list = exec_nvme_json("list") for device in device_list["Devices"]: - device_path = device["DevicePath"] - device_name = os.path.basename(device_path) - - metrics["device_info"].labels( - device_name, - device["ModelNumber"], - device["Firmware"], - device["SerialNumber"].strip(), - ) - - metrics["sector_size"].labels(device_name).set(device["SectorSize"]) - metrics["physical_size"].labels(device_name).set(device["PhysicalSize"]) - metrics["used_bytes"].labels(device_name).set(device["UsedBytes"]) - - smart_log = exec_nvme_json("smart-log", device_path) - - # Various counters in the NVMe specification are 128-bit, which would have to discard - # resolution if converted to a JSON number (i.e., float64_t). Instead, nvme-cli marshals - # them as strings. As such, they need to be explicitly cast to int or float when using them - # in Counter metrics. - metrics["data_units_read"].labels(device_name).inc(int(smart_log["data_units_read"])) - metrics["data_units_written"].labels(device_name).inc(int(smart_log["data_units_written"])) - metrics["host_read_commands"].labels(device_name).inc(int(smart_log["host_read_commands"])) - metrics["host_write_commands"].labels(device_name).inc( - int(smart_log["host_write_commands"]) - ) - metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100) - metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100) - metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100) - metrics["critical_warning"].labels(device_name).set(smart_log["critical_warning"]) - metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"])) - metrics["num_err_log_entries"].labels(device_name).inc( - int(smart_log["num_err_log_entries"]) - ) - metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"])) - metrics["power_on_hours"].labels(device_name).inc(int(smart_log["power_on_hours"])) - metrics["controller_busy_time"].labels(device_name).inc( - int(smart_log["controller_busy_time"]) - ) - metrics["unsafe_shutdowns"].labels(device_name).inc(int(smart_log["unsafe_shutdowns"])) - - # NVMe reports temperature in kelvins; convert it to degrees Celsius. - metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273) + for subsys in device["Subsystems"]: + for ctrl in subsys["Controllers"]: + for ns in ctrl["Namespaces"]: + device_name = ns["NameSpace"] + + # FIXME: This metric ought to be refactored into a "controller_info" metric, + # since it contains information that is not unique to the namespace. However, + # previous versions of this collector erroneously referred to namespaces, e.g. + # "nvme0n1", as devices, so preserve the former behaviour for now. + metrics["device_info"].labels( + device_name, + ctrl["ModelNumber"], + ctrl["Firmware"], + ctrl["SerialNumber"].strip(), + ) + + metrics["sector_size"].labels(device_name).set(ns["SectorSize"]) + metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"]) + metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"]) + + # FIXME: The smart-log should only need to be fetched once per controller, not + # per namespace. However, in order to preserve legacy metric labels, fetch it + # per namespace anyway. Most consumer grade SSDs will only have one namespace. + smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name)) + + # Various counters in the NVMe specification are 128-bit, which would have to + # discard resolution if converted to a JSON number (i.e., float64_t). Instead, + # nvme-cli marshals them as strings. As such, they need to be explicitly cast + # to int or float when using them in Counter metrics. + metrics["data_units_read"].labels(device_name).inc( + int(smart_log["data_units_read"]) + ) + metrics["data_units_written"].labels(device_name).inc( + int(smart_log["data_units_written"]) + ) + metrics["host_read_commands"].labels(device_name).inc( + int(smart_log["host_read_commands"]) + ) + metrics["host_write_commands"].labels(device_name).inc( + int(smart_log["host_write_commands"]) + ) + metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100) + metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100) + metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100) + metrics["critical_warning"].labels(device_name).set( + smart_log["critical_warning"]["value"] + ) + metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"])) + metrics["num_err_log_entries"].labels(device_name).inc( + int(smart_log["num_err_log_entries"]) + ) + metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"])) + metrics["power_on_hours"].labels(device_name).inc( + int(smart_log["power_on_hours"]) + ) + metrics["controller_busy_time"].labels(device_name).inc( + int(smart_log["controller_busy_time"]) + ) + metrics["unsafe_shutdowns"].labels(device_name).inc( + int(smart_log["unsafe_shutdowns"]) + ) + + # NVMe reports temperature in kelvins; convert it to degrees Celsius. + metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273) if __name__ == "__main__": diff --git a/nvme_metrics.sh b/nvme_metrics.sh deleted file mode 100755 index 9291cc3..0000000 --- a/nvme_metrics.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env bash -# -# Dependencies: nvme-cli, jq (packages) -# Based on code from -# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh -# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp -# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh -# -# Author: Henk - -set -eu - -# Ensure predictable numeric / date formats, etc. -export LC_ALL=C - -# Check if we are root -if [ "$EUID" -ne 0 ]; then - echo "${0##*/}: Please run as root!" >&2 - exit 1 -fi - -# Check if programs are installed -if ! command -v nvme >/dev/null 2>&1; then - echo "${0##*/}: nvme is not installed. Aborting." >&2 - exit 1 -fi - -output_format_awk="$( - cat <<'OUTPUTAWK' -BEGIN { v = "" } -v != $1 { - print "# HELP nvme_" $1 " SMART metric " $1; - if ($1 ~ /_total$/) - print "# TYPE nvme_" $1 " counter"; - else - print "# TYPE nvme_" $1 " gauge"; - v = $1 -} -{print "nvme_" $0} -OUTPUTAWK -)" - -format_output() { - sort | awk -F'{' "${output_format_awk}" -} - -# Get the nvme-cli version -nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" -echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output - -# Get devices -device_list="$(nvme list -o json | jq -r '.Devices | .[].DevicePath')" - -# Loop through the NVMe devices -for device in ${device_list}; do - json_check="$(nvme smart-log -o json "${device}")" - disk="${device##*/}" - - # The temperature value in JSON is in Kelvin, we want Celsius - value_temperature="$(echo "$json_check" | jq '.temperature - 273')" - echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" - - value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" - echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" - - value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')" - echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" - - value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" - echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" - - value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" - echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" - - value_media_errors="$(echo "$json_check" | jq -r '.media_errors')" - echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" - - value_num_err_log_entries="$(echo "$json_check" | jq -r '.num_err_log_entries')" - echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" - - value_power_cycles="$(echo "$json_check" | jq -r '.power_cycles')" - echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}" - - value_power_on_hours="$(echo "$json_check" | jq -r '.power_on_hours')" - echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" - - value_controller_busy_time="$(echo "$json_check" | jq -r '.controller_busy_time')" - echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" - - value_data_units_written="$(echo "$json_check" | jq -r '.data_units_written')" - echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" - - value_data_units_read="$(echo "$json_check" | jq -r '.data_units_read')" - echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" - - value_host_read_commands="$(echo "$json_check" | jq -r '.host_read_commands')" - echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" - - value_host_write_commands="$(echo "$json_check" | jq -r '.host_write_commands')" - echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" -done | format_output