diff --git a/README.md b/README.md index 035ca47..2f661dc 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Take a look at `examples` for daemonset manifests for Kubernetes. # User privilleges -User needs to access systemd dbus, typically exporter needs to see node's `/proc`, `/sys/fs/cgroup` to work. +User needs to access systemd dbus, typically exporter needs to see node's `/proc` to work. # Metrics diff --git a/go.mod b/go.mod index bc0dd60..e7aa77f 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,6 @@ require ( github.com/prometheus/client_golang v1.14.0 github.com/prometheus/common v0.42.0 github.com/prometheus/exporter-toolkit v0.10.0 - golang.org/x/sys v0.7.0 ) require ( @@ -32,6 +31,7 @@ require ( golang.org/x/net v0.9.0 // indirect golang.org/x/oauth2 v0.6.0 // indirect golang.org/x/sync v0.1.0 // indirect + golang.org/x/sys v0.7.0 // indirect golang.org/x/text v0.9.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/protobuf v1.28.1 // indirect diff --git a/systemd/cgroups.go b/systemd/cgroups.go deleted file mode 100644 index 42bdc6f..0000000 --- a/systemd/cgroups.go +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright 2022 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package systemd - -import ( - "bufio" - "bytes" - "io" - "os" - "path/filepath" - "strconv" - "strings" - - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/pkg/errors" - "golang.org/x/sys/unix" -) - -// cgUnifiedMountMode constant values describe how cgroup filesystems (aka hierarchies) are -// mounted underneath /sys/fs/cgroup. In cgroups-v1 there are many mounts, -// one per controller (cpu, blkio, etc) and one for systemd itself. In -// cgroups-v2 there is only one mount managed entirely by systemd and -// internally exposing all controller syscalls. As kernel+distros migrate towards -// cgroups-v2, systemd has a hybrid mode where it mounts v2 and uses -// that for process management but also mounts all the v1 filesystem -// hierarchies and uses them for resource accounting and control -type cgUnifiedMountMode int8 - -const ( - // unifModeUnknown indicates that we do not know if/how any - // cgroup filesystems are mounted underneath /sys/fs/cgroup - unifModeUnknown cgUnifiedMountMode = iota - // unifModeNone indicates that both systemd and the controllers - // are using v1 legacy mounts and there is no usage of the v2 - // unified hierarchy. a.k.a "legacy hierarchy" - unifModeNone cgUnifiedMountMode = iota - // unifModeSystemd indicates that systemd is using a v2 unified - // hierarchy for organizing processes into control groups, but all - // controller interaction is using v1 per-controller hierarchies. - // a.k.a. "hybrid hierarchy" - unifModeSystemd cgUnifiedMountMode = iota - // unifModeAll indicates that v2 API is in full usage and there - // are no v1 hierarchies exported. Programs (mainly container orchestrators - // such as docker,runc,etc) that rely on v1 APIs will be broken. - // a.k.a. "unified hierarchy" - unifModeAll cgUnifiedMountMode = iota -) - -// WARNING: We only read this data once at process start, systemd updates -// may require restarting systemd-exporter -var cgroupUnified cgUnifiedMountMode = unifModeUnknown - -// Values copied from https://github.com/torvalds/linux/blob/master/include/uapi/linux/magic.h -const ( - tmpFsMagic = 0x01021994 - cgroupSuperMagic = 0x27e0eb - cgroup2SuperMagic = 0x63677270 -) - -// cgUnifiedCached checks the filesystem types mounted under /sys/fs/cgroup to determine -// which systemd layout (legacy/hybrid/unified) is in use. -// We do not bother to track unified_systemd_v232 as our usage does not -// depend on reading the systemd hierarchy directly, we only focus on reading -// the controllers. If you care if /sys/fs/cgroup/systemd is v1 or v2 you need -// to track this -// WARNING: We cache this data once at process start. Systemd updates -// may require restarting systemd-exporter -func cgUnifiedCached(logger log.Logger) (cgUnifiedMountMode, error) { - if cgroupUnified != unifModeUnknown { - return cgroupUnified, nil - } - - var fs unix.Statfs_t - err := unix.Statfs("/sys/fs/cgroup/", &fs) - if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup)") - } - - switch fs.Type { - case cgroup2SuperMagic: - level.Debug(logger).Log("msg", "Found cgroup2 on /sys/fs/cgroup, full unified hierarchy") - cgroupUnified = unifModeAll - case tmpFsMagic: - err := unix.Statfs("/sys/fs/cgroup/unified", &fs) - - // Ignore err, we expect path to be missing on v232 - if err == nil && fs.Type == cgroup2SuperMagic { - level.Debug(logger).Log("msg", "Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller") - cgroupUnified = unifModeSystemd - } else { - err := unix.Statfs("/sys/fs/cgroup/systemd", &fs) - if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") - } - switch fs.Type { - case cgroup2SuperMagic: - level.Debug(logger).Log("msg", "Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") - cgroupUnified = unifModeSystemd - case cgroupSuperMagic: - level.Debug(logger).Log("msg", "Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") - cgroupUnified = unifModeNone - default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) - } - } - default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) - } - - return cgroupUnified, nil -} - -// cgGetPath returns the absolute path for a specific file in a specific controller -// in the specific cgroup denoted by the passed subpath. -// Input examples: ("cpu", "/system.slice", "cpuacct.usage_all) -func cgGetPath(controller string, subpath string, suffix string, logger log.Logger) (string, error) { - // relevant systemd source code in cgroup-util.[h|c] specifically cg_get_path - // 2. Joins controller name with base path - - unified, err := cgUnifiedCached(logger) - if err != nil { - return "", errors.Wrapf(err, "failed to determine cgroup mounting hierarchy") - } - - // TODO Ensure controller name is valid - // TODO Convert controller name into guaranteed valid directory name - dn := controller - - joined := "" - switch unified { - case unifModeNone, unifModeSystemd: - joined = filepath.Join("/sys/fs/cgroup", dn, subpath, suffix) - case unifModeAll: - joined = filepath.Join("/sys/fs/cgroup", subpath, suffix) - default: - return "", errors.Errorf("unknown cgroup mount mode (e.g. unified mode) %d", unified) - } - return joined, nil -} - -// CPUUsage stores one core's worth of CPU usage for a control group -// (aka cgroup) of tasks (e.g. both processes and threads). -// Equivalent to cpuacct.usage_percpu_user and cpuacct.usage_percpu_system -type CPUUsage struct { - CPUId uint32 - SystemNanosec uint64 - UserNanosec uint64 -} - -// CPUAcct stores CPU accounting information (e.g. cpu usage) for a control -// group (cgroup) of tasks. Equivalent to cpuacct.usage_all -type CPUAcct struct { - CPUs []CPUUsage -} - -// UsageUserNanosecs returns user (e.g. non-kernel) cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageUserNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.UserNanosec - } - return nanoseconds -} - -// UsageSystemNanosecs returns system (e.g. kernel) cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageSystemNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.SystemNanosec - } - return nanoseconds -} - -// UsageAllNanosecs returns total cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageAllNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.SystemNanosec + cpu.UserNanosec - } - return nanoseconds -} - -// ReadFileNoStat uses io.ReadAll to read contents of entire file. -// This is similar to os.ReadFile but without the call to os.Stat, because -// many files in /proc and /sys report incorrect file sizes (either 0 or 4096). -// Reads a max file size of 512kB. For files larger than this, a scanner -// should be used. -// COPIED FROM prometheus/procfs WHICH ALSO USES APACHE 2.0 -func ReadFileNoStat(filename string) ([]byte, error) { - const maxBufferSize = 1024 * 512 - - f, err := os.Open(filename) - if err != nil { - return nil, err - } - defer f.Close() - - reader := io.LimitReader(f, maxBufferSize) - return io.ReadAll(reader) -} - -// NewCPUAcct will locate and read the kernel's cpu accounting info for -// the provided systemd cgroup subpath. -func NewCPUAcct(cgSubpath string, logger log.Logger) (*CPUAcct, error) { - var cpuUsage CPUAcct - - cgPath, err := cgGetPath("cpu", cgSubpath, "cpuacct.usage_all", logger) - if err != nil { - return nil, errors.Wrapf(err, "unable to get cpu controller path") - } - - // Example cpuacct.usage_all - // cpu user system - // 0 21165924 0 - // 1 13334251 0 - b, err := ReadFileNoStat(cgPath) - if err != nil { - return nil, errors.Wrapf(err, "unable to read file %s", cgPath) - } - - scanner := bufio.NewScanner(bytes.NewReader(b)) - if ok := scanner.Scan(); !ok { - return nil, errors.Errorf("unable to scan file %s", cgPath) - } - if err := scanner.Err(); err != nil { - return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) - } - for scanner.Scan() { - if err := scanner.Err(); err != nil { - return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) - } - text := scanner.Text() - vals := strings.Split(text, " ") - if len(vals) != 3 { - return nil, errors.Errorf("unable to parse contents of file %s", cgPath) - } - cpu, err := strconv.ParseUint(vals[0], 10, 32) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as uint32 (from %s)", vals[0], cgPath) - } - user, err := strconv.ParseUint(vals[1], 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as uint64 (from %s)", vals[1], cgPath) - } - sys, err := strconv.ParseUint(vals[2], 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as an in (from %s)", vals[2], cgPath) - } - onecpu := CPUUsage{ - CPUId: uint32(cpu), - UserNanosec: user, - SystemNanosec: sys, - } - cpuUsage.CPUs = append(cpuUsage.CPUs, onecpu) - } - if len(cpuUsage.CPUs) < 1 { - return nil, errors.Errorf("no CPU/core info extracted from %s", cgPath) - } - - return &cpuUsage, nil -} diff --git a/systemd/systemd.go b/systemd/systemd.go index 6b5d813..6387a33 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -52,7 +52,6 @@ var ( errConvertUint32PropertyMsg = "couldn't convert unit's %s property %v to uint32" errConvertStringPropertyMsg = "couldn't convert unit's %s property %v to string" errUnitMetricsMsg = "couldn't get unit's metrics: %s" - errControlGroupReadMsg = "failed to read %s from control group" infoUnitNoHandler = "no unit type handler for %s" ) @@ -325,11 +324,6 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) } - err = c.collectUnitCPUUsageMetrics("Service", conn, ch, unit) - if err != nil { - level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) - } - if *enableIPAccountingMetrics { err = c.collectIPAccountingMetrics(conn, ch, unit) if err != nil { @@ -341,10 +335,6 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) } - err = c.collectUnitCPUUsageMetrics("Mount", conn, ch, unit) - if err != nil { - level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) - } case strings.HasSuffix(unit.Name, ".timer"): err := c.collectTimerTriggerTime(conn, ch, unit) if err != nil { @@ -355,22 +345,6 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) } - // Most sockets do not have a cpu cgroupfs entry, but a - // few do, notably docker.socket - err = c.collectUnitCPUUsageMetrics("Socket", conn, ch, unit) - if err != nil { - level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) - } - case strings.HasSuffix(unit.Name, ".swap"): - err = c.collectUnitCPUUsageMetrics("Swap", conn, ch, unit) - if err != nil { - level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) - } - case strings.HasSuffix(unit.Name, ".slice"): - err = c.collectUnitCPUUsageMetrics("Slice", conn, ch, unit) - if err != nil { - level.Warn(logger).Log("msg", errUnitMetricsMsg, "err", err) - } default: level.Debug(c.logger).Log("msg", infoUnitNoHandler, unit.Name) } @@ -510,87 +484,6 @@ func (c *Collector) collectServiceStartTimeMetrics(conn *dbus.Conn, ch chan<- pr return nil } -func (c *Collector) mustGetUnitStringTypeProperty(unitType string, - propName string, defaultVal string, conn *dbus.Conn, unit dbus.UnitStatus) string { - prop, err := conn.GetUnitTypePropertyContext(c.ctx, unit.Name, unitType, propName) - if err != nil { - level.Debug(c.logger).Log("msg", errGetPropertyMsg, "prop_name", propName) - return defaultVal - } - propVal, ok := prop.Value.Value().(string) - if !ok { - level.Debug(c.logger).Log("msg", errConvertStringPropertyMsg, "prop_name", propName, "prop_value", prop.Value.Value()) - return defaultVal - } - return propVal -} - -// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their -// resource usage from the kernel's cgroupfs cpu hierarchy. The only change is which dbus item we are querying -func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { - propCGSubpath, err := conn.GetUnitTypePropertyContext(c.ctx, unit.Name, unitType, "ControlGroup") - if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") - } - cgSubpath, ok := propCGSubpath.Value.Value().(string) - if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) - } - - switch { - case cgSubpath == "" && unit.ActiveState == "inactive", - cgSubpath == "" && unit.ActiveState == "failed": - // Expected condition, systemd has cleaned up and - // we have nothing to record - return nil - case cgSubpath == "" && unit.ActiveState == "active": - // Unexpected. Why is there no cgroup on an active unit? - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) - return errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) - case cgSubpath == "": - // We are likely reading a unit that is currently changing state, so - // we record this and bail - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) - level.Debug(c.logger).Log("msg", "Read 'no cgroup' from unit", "unit", unit.Name, "state", unit.ActiveState, "subtype", subType, "slice", slice) - return nil - } - - propCPUAcct, err := conn.GetUnitTypePropertyContext(c.ctx, unit.Name, unitType, "CPUAccounting") - if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "CPUAccounting") - } - cpuAcct, ok := propCPUAcct.Value.Value().(bool) - if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "CPUAccounting", propCPUAcct.Value.Value()) - } - if !cpuAcct { - return nil - } - - cpuUsage, err := NewCPUAcct(cgSubpath, c.logger) - if err != nil { - if unitType == "Socket" { - level.Debug(c.logger).Log("msg", "unable to read SocketUnit CPU accounting information", "unit", unit.Name) - return nil - } - return errors.Wrapf(err, errControlGroupReadMsg, "CPU usage") - } - - userSeconds := float64(cpuUsage.UsageUserNanosecs()) / 1000000000.0 - sysSeconds := float64(cpuUsage.UsageSystemNanosecs()) / 1000000000.0 - - ch <- prometheus.MustNewConstMetric( - c.unitCPUTotal, prometheus.CounterValue, - userSeconds, unit.Name, parseUnitType(unit), "user") - ch <- prometheus.MustNewConstMetric( - c.unitCPUTotal, prometheus.CounterValue, - sysSeconds, unit.Name, parseUnitType(unit), "system") - - return nil -} - func (c *Collector) collectSocketConnMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { acceptedConnectionCount, err := conn.GetUnitTypePropertyContext(c.ctx, unit.Name, "Socket", "NAccepted") if err != nil {