Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nvme_metrics: support nvme-cli v2.11+ verbose JSON output #227

Merged
merged 2 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 72 additions & 44 deletions nvme_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@
"Device error log entry count",
["device"], namespace=namespace, registry=registry,
),
# FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change
# will result in the metric having a "_info" suffix automatically appended, which is arguably
# a breaking change.
"nvmecli": Gauge(
"nvmecli",
"nvme-cli tool information",
Expand Down Expand Up @@ -142,7 +145,11 @@ def exec_nvme_json(*args):
"""
Execute nvme CLI tool with specified arguments and return parsed JSON output.
"""
output = exec_nvme(*args, "--output-format", "json")
# Note: nvme-cli v2.11 effectively introduced a breaking change by forcing JSON output to always
# be verbose. Older versions of nvme-cli optionally produced verbose output if the --verbose
# flag was specified. In order to avoid having to handle two different JSON schemas, always
# add the --verbose flag.
output = exec_nvme(*args, "--output-format", "json", "--verbose")
return json.loads(output)


Expand All @@ -157,49 +164,70 @@ def main():
device_list = exec_nvme_json("list")

for device in device_list["Devices"]:
device_path = device["DevicePath"]
device_name = os.path.basename(device_path)

metrics["device_info"].labels(
device_name,
device["ModelNumber"],
device["Firmware"],
device["SerialNumber"].strip(),
)

metrics["sector_size"].labels(device_name).set(device["SectorSize"])
metrics["physical_size"].labels(device_name).set(device["PhysicalSize"])
metrics["used_bytes"].labels(device_name).set(device["UsedBytes"])

smart_log = exec_nvme_json("smart-log", device_path)

# Various counters in the NVMe specification are 128-bit, which would lose precision if
# converted to a JSON number (i.e., a 64-bit float). Instead, nvme-cli marshals them as
# strings. As such, they need to be explicitly cast to int or float when using them in
# Counter metrics.
metrics["data_units_read"].labels(device_name).inc(int(smart_log["data_units_read"]))
metrics["data_units_written"].labels(device_name).inc(int(smart_log["data_units_written"]))
metrics["host_read_commands"].labels(device_name).inc(int(smart_log["host_read_commands"]))
metrics["host_write_commands"].labels(device_name).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(device_name).set(smart_log["critical_warning"])
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(device_name).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(device_name).inc(int(smart_log["power_on_hours"]))
metrics["controller_busy_time"].labels(device_name).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(device_name).inc(int(smart_log["unsafe_shutdowns"]))

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
for subsys in device["Subsystems"]:
for ctrl in subsys["Controllers"]:
for ns in ctrl["Namespaces"]:
device_name = ns["NameSpace"]

# FIXME: This metric ought to be refactored into a "controller_info" metric,
# since it contains information that is not unique to the namespace. However,
# previous versions of this collector erroneously referred to namespaces, e.g.
# "nvme0n1", as devices, so preserve the former behaviour for now.
metrics["device_info"].labels(
device_name,
ctrl["ModelNumber"],
ctrl["Firmware"],
ctrl["SerialNumber"].strip(),
)

metrics["sector_size"].labels(device_name).set(ns["SectorSize"])
metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"])
metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"])

# FIXME: The smart-log should only need to be fetched once per controller, not
# per namespace. However, in order to preserve legacy metric labels, fetch it
# per namespace anyway. Most consumer-grade SSDs will only have one namespace.
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name))

# Various counters in the NVMe specification are 128-bit, which would lose
# precision if converted to a JSON number (i.e., a 64-bit float). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast
# to int or float when using them in Counter metrics.
metrics["data_units_read"].labels(device_name).inc(
int(smart_log["data_units_read"])
)
metrics["data_units_written"].labels(device_name).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(device_name).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(device_name).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(device_name).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(device_name).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(device_name).inc(
int(smart_log["power_on_hours"])
)
metrics["controller_busy_time"].labels(device_name).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(device_name).inc(
int(smart_log["unsafe_shutdowns"])
)

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)


if __name__ == "__main__":
Expand Down
101 changes: 0 additions & 101 deletions nvme_metrics.sh

This file was deleted.