diff --git a/.gitignore b/.gitignore index 0617875..1714718 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /systemd_exporter /bin/golangci-lint +.idea +coverage.txt diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..686f7cc --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,5 @@ +issues: + exclude: + - "not declared by package utf8" + - "unicode/utf8/utf8.go" + diff --git a/.travis.yml b/.travis.yml index 1ca29c5..9011808 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,36 @@ -language: go +# TODO Use CodeCov 'Flags' to separate coverage for integration tests and unit +# tests. Near 100% on integration is expected - these tests cast a wide net to +# catch many issues (but do not easily tell you where the issue is). Near 100% +# on unit tests is a feat of heroism - these tests identify a specific code +# chunk with an issue. We mainly care about 100% on unit tests, but 100% on +# integration is an easy win and a nice to have +# +# See https://docs.codecov.io/docs/flags +# Use go get github.com/stristr/go-acc && go-acc ./... +# Or use coverpkg=github.com/povilasv/systemd_exporter,github.com/povilasv/systemd_exporter/systemd +# This defines the script for us automatically. By default it installs +# to requested go version then runs make +language: go go: - "1.x" + +before_script: systemd --version +os: linux + +go: + - 1.x + +before_script: systemd --version && systemctl list-units + +after_success: + - bash <(curl -s https://codecov.io/bash) + +jobs: + include: + - dist: xenial + name: xenial-229 + - dist: bionic + name: bionic-237 + + diff --git a/CHANGELOG.md b/CHANGELOG.md index b34087b..60ea02d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,20 +2,18 @@ ### **Breaking changes** -* `systemd_unit_state` label `type` has new meaning - Now shows Unit type (`service`, `scope`, etc), not Service Unit types (`simple`, `forking`, etc) - or mount unit types(`aufs`,`ext3`, etc). 
Service and mount types have been moved to `systemd_unit_info` +* `systemd_unit_state` label `type` has new meaning. Previously `type` contained service unit type (`simple`, `forking`, etc) or mount unit types (`aufs`, `ext3`, etc). Now `systemd_unit_state{type}` contains overall unit type (`service`, `scope`, etc) to allow easy PromQL group by clauses. Service and mount types have been moved to `systemd_unit_info` ### Changes - [FEATURE] Read unit CPU usage from cgroup. Added `systemd_unit_cpu_seconds_total` metric. **Note** - Untested on unified hierarchy - [FEATURE] Add `systemd_unit_info` with metainformation about units incl. subtype specific info - [ENHANCEMENT] Added `type` label to all metrics named `systemd_unit-*` to support PromQL grouping -* [ENHANCEMENT] `systemd_unit_state` works for all unit types, not just service and mount units -* [ENHANCEMENT] Scrapes are approx 80% faster. If needed, set GOMAXPROCS to limit max concurrency -* [CHANGE] Start tracking metric cardinality in readme -* [CHANGE] Expanded default set of unit types monitored. Only device unit types are not enabled by default -* [BUGFIX] `timer_last_trigger_seconds` metric is now exported as expected for all timers +- [ENHANCEMENT] `systemd_unit_state` works for all unit types, not just service and mount units +- [ENHANCEMENT] Scrapes are approx 80% faster. If needed, set GOMAXPROCS to limit max concurrency +- [CHANGE] Start tracking metric cardinality in readme +- [CHANGE] Expanded default set of unit types monitored. 
Only device unit types are not enabled by default +- [BUGFIX] `timer_last_trigger_seconds` metric is now exported as expected for all timers ## 0.2.0 / 2019-03-20 diff --git a/Makefile b/Makefile index 5c07f3a..c13eed5 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BRANCH := $(shell git branch | grep \* | cut -d ' ' -f2) LINT_FLAGS := run --deadline=120s LINTER := ./bin/golangci-lint -TESTFLAGS := -v -cover +TEST_FLAGS := -v -cover -race -coverprofile=coverage.txt -covermode=atomic GO111MODULE := on all: $(LINTER) deps test lint build @@ -23,7 +23,12 @@ deps: .PHONY: test test: - go test $(TESTFLAGS) ./... +ifdef TRAVIS + sudo sh -c 'echo DefaultCPUAccounting=yes >> /etc/systemd/system.conf' + sudo sh -c 'echo DefaultMemoryAccounting=yes >> /etc/systemd/system.conf' + sudo systemctl daemon-reload +endif + go test $(TEST_FLAGS) ./... .PHONY: build build: deps diff --git a/README.md b/README.md index 9d52ad1..e336437 100644 --- a/README.md +++ b/README.md @@ -79,15 +79,15 @@ Note that a number of unit types are filtered by default | ----------------------------------------- | ----------- | -------- | ------------------------------------------------------------------ | | systemd_exporter_build_info | Gauge | UNSTABLE | 1 per systemd-exporter | | systemd_unit_info | Gauge | UNSTABLE | 1 per service + 1 per mount | -| systemd_unit_cpu_seconds_total | Gauge | UNSTABLE | 2 per mount/scope/slice/socket/swap {mode="system/user"} | +| systemd_unit_cpu_seconds_total | Counter | UNSTABLE | <sup>1</sup>2 per mount/scope/slice/socket/swap {mode="system/user"}| | systemd_unit_state | Gauge | UNSTABLE | 5 per unit {state="activating/active/deactivating/failed/inactive} | | systemd_unit_tasks_current | Gauge | UNSTABLE | 1 per service | | systemd_unit_tasks_max | Gauge | UNSTABLE | 1 per service | | systemd_unit_start_time_seconds | Gauge | UNSTABLE | 1 per service | -| systemd_service_restart_total | Gauge | UNSTABLE | 1 per service | +| systemd_service_restart_total | Counter | 
UNSTABLE | 1 per service | | systemd_socket_accepted_connections_total | Counter | UNSTABLE | 1 per socket | | systemd_socket_current_connections | Gauge | UNSTABLE | 1 per socket | -| systemd_socket_refused_connections_total | Gauge | UNSTABLE | 1 per socket | +| systemd_socket_refused_connections_total | Counter | UNSTABLE | 1 per socket. Requires systemd>239 | | systemd_timer_last_trigger_seconds | Gauge | UNSTABLE | 1 per timer | | systemd_process_resident_memory_bytes | Gauge | UNSTABLE | 1 per service | | systemd_process_virtual_memory_bytes | Gauge | UNSTABLE | 1 per service | @@ -95,3 +95,5 @@ Note that a number of unit types are filtered by default | systemd_process_open_fds | Gauge | UNSTABLE | 1 per service | | systemd_process_max_fds | Gauge | UNSTABLE | 1 per service | | systemd_process_cpu_seconds_total | Counter | UNSTABLE | 1 per service | + +<sup>1</sup>Only present for units which have systemd `CPUAccounting` enabled diff --git a/cgroup/Readme.md b/cgroup/Readme.md new file mode 100644 index 0000000..01dfd92 --- /dev/null +++ b/cgroup/Readme.md @@ -0,0 +1,49 @@ + +This package provides functions to retrieve control group metrics from the pseudo-filesystem `/sys/fs/cgroup/`. + +**WARNING:** This package is a work in progress. Its API may still break in backwards-incompatible ways without warnings. Use it at your own risk. + +The Linux kernel supports two APIs for userspace to interact with control groups, the v1 API and the v2 API. See +[this LWN Article](https://lwn.net/Articles/679786/) or +[this kernel documentation](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#deprecated-v1-core-features) +for background on the two APIs. This package will interact with both v1 and v2 APIs. + + +### Focus on Systemd + +This package is initially focused on reading metrics for systemd units. Therefore, +the following systemd documentation is relevant. + +#### Systemd cgroup mount mode + +The kernel can mount the cgroupfs in any manner it chooses. 
However, anyone wanting to use that cgroupfs must know +where/how it is mounted. When there was only one cgroup API, it was always mounted at `/sys/fs/cgroup`. With the +transition from v1 to v2, the mounting approach differs per-distro, with some mounting only v2, some mounting only +v1(all hierarchies), and some mounting a combination. For simplicity, this package initially focuses on the three +mount "modes" supported by systemd: + +via [systemd.io](https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-) + +1. Unified — this is the simplest mode, and exposes a pure cgroup v2 logic +2. Legacy — this is the traditional cgroup v1 mode. In this mode the various controllers each get their own cgroup + file system mounted to `/sys/fs/cgroup/<controller>/` +3. Hybrid — this is a hybrid between the unified and legacy mode. It’s set up mostly like legacy + +#### Systemd Supported Controllers + +The initial target controllers this package aims to read from are the controllers supported by systemd. Reading from +other controllers may be supported in the future. Systemd guarantees that all v1 hierarchies are kept in sync. + +Via [systemd.io](https://systemd.io/CGROUP_DELEGATION/#controller-support): + +Systemd supports a number of controllers (but not all). Specifically, supported are: + +on cgroup v1: cpu, cpuacct, blkio, memory, devices, pids +on cgroup v2: cpu, io, memory, pids + +It is our intention to natively support all cgroup v2 controllers as they are added +to the kernel. However, regarding cgroup v1: at this point we will not add support +for any other controllers anymore. 
This means systemd currently does not and will +never manage the following controllers on cgroup v1: freezer, cpuset, net_cls, +perf_event, net_prio, hugetlb + diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go new file mode 100644 index 0000000..31945b9 --- /dev/null +++ b/cgroup/cgroup.go @@ -0,0 +1,175 @@ +package cgroup + +import ( + "fmt" + "github.com/pkg/errors" + "github.com/prometheus/common/log" + "golang.org/x/sys/unix" + "os" + "path/filepath" +) + +// FS is the pseudo-filesystem cgroupfs, which provides an interface to +// kernel data structures +type FS struct { + mountPoint string + + // WARNING: We only read this data once at process start, systemd updates + // may require restarting systemd-exporter + cgroupUnified MountMode +} + +// DefaultMountPoint is the common mount point of the cgroupfs filesystem +const DefaultMountPoint = "/sys/fs/cgroup" + +// NewDefaultFS returns a new cgroup FS mounted under the default mountPoint. +// It will error if cgroup hierarchies are not laid out in a manner understood +// by systemd. +func NewDefaultFS() (FS, error) { + + mode, err := cgUnifiedCached() + if err != nil || mode == MountModeUnknown { + return FS{}, fmt.Errorf("could not determine cgroupfs mount mode: %s", err) + } + + return NewFS(DefaultMountPoint, mode) +} + +// NewFS returns a new cgroup FS mounted under the given mountPoint. It does not check +// the provided mount mode +func NewFS(mountPoint string, mountMode MountMode) (FS, error) { + info, err := os.Stat(mountPoint) + if err != nil { + return FS{}, fmt.Errorf("could not read %s: %s", mountPoint, err) + } + if !info.IsDir() { + return FS{}, fmt.Errorf("mount point %s is not a directory", mountPoint) + } + return FS{mountPoint, mountMode}, nil +} + +// path appends the given path elements to the filesystem path, adding separators +// as necessary. +func (fs FS) path(p ...string) string { + return filepath.Join(append([]string{string(fs.mountPoint)}, p...)...) 
+} + +// MountMode constants describe how the kernel has mounted various cgroup filesystems under /sys/fs/cgroup. +// Generally speaking, kernels using the cgroups-v1 API will have many cgroup controller hierarchies, each with +// their own fs and their own mount point. Kernels using cgroups-v2 API will only have the one unified hierarchy. +// To support back compatibility, kernels often mount both the v1 and v2 hierarchies at different points. Systemd +// has to know where the hierarchies are, so it inspects the mounts under /sys/fs/cgroup and decides what +// MountMode this kernel is using. See each constant for a description of that mode. This type corresponds to +// the unified_cache variable in systemd/src/basic/cgroup-util.c +type MountMode int8 + +const ( + // MountModeUnknown indicates we do not recognize the mount pattern of the cgroup filesystems in /sys/fs/cgroup. + // systemd source calls this mode CGROUP_UNIFIED_UNKNOWN + MountModeUnknown MountMode = iota + // MountModeLegacy indicates both systemd and individual cgroups are using cgroup-v1 hierarchies. There is + // typically one mount point per hierarchy, and no usage of the cgroup-v2 unified hierarchy. + // systemd source calls this mode CGROUP_UNIFIED_NONE + MountModeLegacy MountMode = iota + // MountModeHybrid indicates the systemd controller is using cgroup-v2 unified hierarchy for organizing + // processes, but all other cgroups are using cgroup-v1 legacy hierarchies. + // systemd source calls this CGROUP_UNIFIED_SYSTEMD and also stores the unified_systemd_v232 flag + MountModeHybrid MountMode = iota + // MountModeUnified indicates cgroup-v2 API is in full usage and there are no cgroup-v1 hierarchies mounted. + // Non-updated programs (e.g. container orchestrators such as docker/runc) that rely on cgroup-v1 mounts will break. 
+ // systemd source calls this CGROUP_UNIFIED_ALL + MountModeUnified MountMode = iota +) +func (c MountMode) String() string { + return [...]string{"unknown", "none", "systemd", "all"}[c] +} + + +// Values copied from https://github.com/torvalds/linux/blob/master/include/uapi/linux/magic.h +const ( + tmpFsMagic = 0x01021994 + cgroupSuperMagic = 0x27e0eb + cgroup2SuperMagic = 0x63677270 +) + +// cgUnifiedCached checks the filesystem types mounted under /sys/fs/cgroup to determine +// which systemd layout (legacy/hybrid/unified) is in use. +// We do not bother to track unified_systemd_v232 as our usage does not +// depend on reading the systemd hierarchy directly, we only focus on reading +// the controllers. If you care if /sys/fs/cgroup/systemd is v1 or v2 you need +// to track this +// WARNING: We cache this data once at process start. Systemd updates +// may require restarting systemd-exporter +// Equivalent to systemd cgroup-util.c#cg_unified_cached +var statfsFunc = unix.Statfs +func cgUnifiedCached() (MountMode, error) { + // if cgroupUnified != MountModeUnknown { + // return cgroupUnified, nil + // } + + var fs unix.Statfs_t + err := statfsFunc("/sys/fs/cgroup/", &fs) + if err != nil { + return MountModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup)") + } + + switch fs.Type { + case cgroup2SuperMagic: + log.Debugf("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy") + return MountModeUnified, nil + case tmpFsMagic: + err := statfsFunc("/sys/fs/cgroup/unified/", &fs) + + // Ignore err, we expect path to be missing on v232 + if err == nil && fs.Type == cgroup2SuperMagic { + log.Debugf("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller") + return MountModeHybrid, nil + } + + err = statfsFunc("/sys/fs/cgroup/systemd/", &fs) + if err != nil { + return MountModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") + } + + switch fs.Type { + case cgroup2SuperMagic: + log.Debugf("Found cgroup2 on 
/sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") + return MountModeHybrid, nil + case cgroupSuperMagic: + log.Debugf("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") + return MountModeLegacy, nil + default: + return MountModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) + } + + default: + return MountModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) + } +} + +// cgGetPath returns the absolute path for a specific file in a specific controller +// in the specific cgroup denoted by the passed subpath. +// Input examples: ("cpu", "/system.slice", "cpuacct.usage_all") +func (fs FS) cgGetPath(controller string, subpath string, suffix string) (string, error) { + // relevant systemd source code in cgroup-util.[h|c] specifically cg_get_path + // 2. Joins controller name with base path + + if fs.cgroupUnified == MountModeUnknown { + return "", errors.Errorf("Cannot determine path with unknown mounting hierarchy") + } + + // TODO Ensure controller name is valid + // TODO Convert controller name into guaranteed valid directory name + dn := controller + + joined := "" + switch fs.cgroupUnified { + case MountModeLegacy, MountModeHybrid: + joined = fs.path(dn, subpath, suffix) + case MountModeUnified: + joined = fs.path(subpath, suffix) + default: + return "", errors.Errorf("unknown cgroup mount mode (e.g. 
unified mode) %d", fs.cgroupUnified) + } + return joined, nil +} diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go new file mode 100644 index 0000000..9c76d4a --- /dev/null +++ b/cgroup/cgroup_test.go @@ -0,0 +1,174 @@ +package cgroup + +import ( + "errors" + "golang.org/x/sys/unix" + "os" + "testing" +) + +const ( + testFixturesHybrid = "fixtures/cgroup-hybrid" +) + +func TestMountModeParsing(t *testing.T) { + // This test cannot (easily) use test fixtures, because it relies on being + // able to call Statfs on mounted filesystems. So we only run inside + // system where we expect to find cgroupfs mounted in a mode systemd expects. + // For now, that's only inside TravisCI, but in future we may expand to run + // this by default on certain Linux systems + if _, inTravisCI := os.LookupEnv("TRAVIS"); inTravisCI == false { + return + } + + if _, err := NewDefaultFS(); err != nil { + t.Errorf("expected success determining mount type inside of travis CI: %s", err) + } +} + + +func TestCgUnifiedCached(t *testing.T) { + // Build some functions we will use to simulate various cgroup mounting scenarios + noCgroupMount := func(path string, stat *unix.Statfs_t) error { + // No fs present on /sys/fs/cgroup/ + return errors.New("boo") + } + unknownCgroupMount := func(path string, stat *unix.Statfs_t) error { + // Unknown fs type present on /sys/fs/cgroup/ + stat.Type = 0x0 + return nil + } + unifiedMount := func(path string, stat *unix.Statfs_t) error { + // unified fs present + switch path { + case "/sys/fs/cgroup/": + stat.Type = cgroup2SuperMagic + return nil + default: + return errors.New("pretend path not found") + } + } + hybridMountSystemdV232 := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/systemd/": + stat.Type = cgroup2SuperMagic + } + return nil + } + hybridMountSystemdV233 := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + 
stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + stat.Type = cgroup2SuperMagic + case "/sys/fs/cgroup/systemd/": + stat.Type = cgroupSuperMagic + } + return nil + } + legacyMount := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + return errors.New("pretend unified path not found") + case "/sys/fs/cgroup/systemd/": + stat.Type = cgroupSuperMagic + } + return nil + } + missingSystemdFolder := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + return errors.New("pretend unified path not found") + case "/sys/fs/cgroup/systemd/": + return errors.New("pretend we cannot stat systemd dir") + } + return nil + } + unknownSystemdFolderMountType := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + return errors.New("pretend unified path not found") + case "/sys/fs/cgroup/systemd/": + stat.Type = 0x0 + } + return nil + } + + tables := []struct { + name string + statFn func(string,*unix.Statfs_t) error + expectedMode MountMode + errExpected bool + }{ + {"NoCgroupMount", noCgroupMount, MountModeUnknown, true}, + {"UnknownCgroupMountType", unknownCgroupMount, MountModeUnknown, true}, + {"LegacyMount", legacyMount, MountModeLegacy, false}, + {"HybridMount, v232", hybridMountSystemdV232, MountModeHybrid, false}, + {"HybridMount, v233+", hybridMountSystemdV233, MountModeHybrid, false}, + {"MissingSystemdFolder", missingSystemdFolder, MountModeUnknown, true}, + {"UnknownSystemdFolderType", unknownSystemdFolderMountType, MountModeUnknown, true}, + {"UnifiedMount", unifiedMount, MountModeUnified, false}, + } + + for _, table := range tables { + statfsFunc = table.statFn + mode, err := cgUnifiedCached() + if table.errExpected && err == nil { + t.Errorf("%s: expected an err, but got 
mode %s with no error", table.name, mode) + } + if !table.errExpected && err != nil { + t.Errorf("%s: expected no error, but got mode %s with err: %s", table.name, mode, err) + } + if mode != table.expectedMode { + t.Errorf("%s: expected mode %s but got mode %s", table.name, table.expectedMode, mode) + } + } +} + +func TestNewFS(t *testing.T) { + if _, err := NewFS("foobar", MountModeUnknown); err == nil { + t.Error("NewFS should have failed with non-existing path") + } + + if _, err := NewFS("cgroups_test.go", MountModeUnknown); err == nil { + t.Error("want NewFS to fail if mount point is not a dir") + } + + if _, err := NewFS(testFixturesHybrid, MountModeUnknown); err != nil { + t.Error("want NewFS to succeed if mount point exists") + } +} + +func getHybridFixtures(t *testing.T) FS { + fs, err := NewFS(testFixturesHybrid, MountModeHybrid) + if err != nil { + t.Fatal("Unable to create hybrid text fixtures") + } + return fs +} + +func TestCgSubpath(t *testing.T) { + fs := getHybridFixtures(t) + + fs.cgroupUnified = MountModeUnknown + if _, err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all"); err == nil { + t.Error("should not be able to determine path with unknown mount mode") + } + fs.cgroupUnified = MountModeHybrid + path, err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all") + if err != nil { + t.Error("should be able to determine path with systemd mount mode") + } + want := testFixturesHybrid + "/cpu/system.slice/cpuacct.usage_all" + if path != want { + t.Errorf("bad response. Wanted %s, got %s", want, path) + } +} diff --git a/cgroup/cpuacct.go b/cgroup/cpuacct.go new file mode 100644 index 0000000..e64eeb7 --- /dev/null +++ b/cgroup/cpuacct.go @@ -0,0 +1,147 @@ +package cgroup + +import ( + "bufio" + "bytes" + "github.com/pkg/errors" + "io" + "io/ioutil" + "os" + "strconv" + "strings" +) + +// CPUUsage stores one core's worth of CPU usage for a control group +// (aka cgroup) of tasks (e.g. both processes and threads). 
+// Equivalent to cpuacct.usage_percpu_user and cpuacct.usage_percpu_system +type CPUUsage struct { + CPUId uint32 + SystemNanosec uint64 + UserNanosec uint64 +} + +// CPUAcct stores CPU accounting information (e.g. cpu usage) for a control +// group (cgroup) of tasks. Equivalent to cpuacct.usage_all +type CPUAcct struct { + CPUs []CPUUsage +} + +// NewCPUAcct will locate and read the kernel's cpu accounting info for +// the provided systemd cgroup subpath. +func NewCPUAcct(cgSubpath string) (*CPUAcct, error) { + fs, err := NewDefaultFS() + if err != nil { + return nil, err + } + return fs.NewCPUAcct(cgSubpath) +} + +// UsageUserNanosecs returns user (e.g. non-kernel) cpu consumption in nanoseconds, across all available cpu +// cores, from the point that CPU accounting was enabled for this control group. +func (c *CPUAcct) UsageUserNanosecs() uint64 { + var nanoseconds uint64 + for _, cpu := range c.CPUs { + nanoseconds += cpu.UserNanosec + } + return nanoseconds +} + +// UsageSystemNanosecs returns system (e.g. kernel) cpu consumption in nanoseconds, across all available cpu +// cores, from the point that CPU accounting was enabled for this control group. +func (c *CPUAcct) UsageSystemNanosecs() uint64 { + var nanoseconds uint64 + for _, cpu := range c.CPUs { + nanoseconds += cpu.SystemNanosec + } + return nanoseconds +} + +// UsageAllNanosecs returns total cpu consumption in nanoseconds, across all available cpu +// cores, from the point that CPU accounting was enabled for this control group. +func (c *CPUAcct) UsageAllNanosecs() uint64 { + var nanoseconds uint64 + for _, cpu := range c.CPUs { + nanoseconds += cpu.SystemNanosec + cpu.UserNanosec + } + return nanoseconds +} + +// ReadFileNoStat uses ioutil.ReadAll to read contents of entire file. +// This is similar to ioutil.ReadFile but without the call to os.Stat, because +// many files in /proc and /sys report incorrect file sizes (either 0 or 4096). +// Reads a max file size of 512kB. 
For files larger than this, a scanner +// should be used. +// COPIED FROM prometheus/procfs WHICH ALSO USES APACHE 2.0 +func ReadFileNoStat(filename string) ([]byte, error) { + const maxBufferSize = 1024 * 512 + + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + + reader := io.LimitReader(f, maxBufferSize) + return ioutil.ReadAll(reader) +} + +// NewCPUAcct will locate and read the kernel's cpu accounting info for +// the provided systemd cgroup subpath. +func (fs FS) NewCPUAcct(cgSubpath string) (*CPUAcct, error) { + var cpuUsage CPUAcct + + cgPath, err := fs.cgGetPath("cpu", cgSubpath, "cpuacct.usage_all") + if err != nil { + return nil, errors.Wrapf(err, "unable to get cpu controller path") + } + + // Example cpuacct.usage_all + // cpu user system + // 0 21165924 0 + // 1 13334251 0 + b, err := ReadFileNoStat(cgPath) + if err != nil { + return nil, errors.Wrapf(err, "unable to read file %s", cgPath) + } + + scanner := bufio.NewScanner(bytes.NewReader(b)) + if ok := scanner.Scan(); !ok { + return nil, errors.Errorf("unable to scan file %s", cgPath) + } + if err := scanner.Err(); err != nil { + return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) + } + for scanner.Scan() { + if err := scanner.Err(); err != nil { + return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) + } + text := scanner.Text() + vals := strings.Split(text, " ") + if len(vals) != 3 { + return nil, errors.Errorf("unable to parse contents of file %s", cgPath) + } + cpu, err := strconv.ParseUint(vals[0], 10, 32) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse %s as uint32 (from %s)", vals[0], cgPath) + } + user, err := strconv.ParseUint(vals[1], 10, 64) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse %s as uint64 (from %s)", vals[1], cgPath) + } + sys, err := strconv.ParseUint(vals[2], 10, 64) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse %s as an in (from %s)", vals[2], cgPath) + } 
+ onecpu := CPUUsage{ + CPUId: uint32(cpu), + UserNanosec: user, + SystemNanosec: sys, + } + cpuUsage.CPUs = append(cpuUsage.CPUs, onecpu) + } + if len(cpuUsage.CPUs) < 1 { + return nil, errors.Errorf("no CPU/core info extracted from %s", cgPath) + } + + return &cpuUsage, nil +} diff --git a/cgroup/cpuacct_test.go b/cgroup/cpuacct_test.go new file mode 100644 index 0000000..fdd4c9a --- /dev/null +++ b/cgroup/cpuacct_test.go @@ -0,0 +1,34 @@ +package cgroup + +import "testing" + +func TestNewCPUAcct(t *testing.T) { + fs := getHybridFixtures(t) + cpu, err := fs.NewCPUAcct("/") + if err != nil { + t.Error("want NewCPUAcct('/') to succeed") + } + + if len(cpu.CPUs) != 4 { + t.Errorf("Wrong number of CPUs. Wanted %d got %d", 4, len(cpu.CPUs)) + } + + var expectedUser uint64 = 29531441016368 + if cpu.UsageUserNanosecs() != expectedUser { + t.Errorf("Wrong user nanoseconds. Wanted %d got %d", expectedUser, cpu.UsageUserNanosecs()) + } + + var expectedSys uint64 = 619186701953 + if cpu.UsageSystemNanosecs() != expectedSys { + t.Errorf("Wrong sys nanoseconds. Wanted %d got %d", expectedSys, cpu.UsageSystemNanosecs()) + } + + expectedTotal := expectedSys + expectedUser + if cpu.UsageAllNanosecs() != expectedTotal { + t.Errorf("Wrong total nanoseconds. Wanted %d got %d", expectedTotal, cpu.UsageAllNanosecs()) + } + + if _, err := fs.NewCPUAcct("foobar"); err == nil { + t.Errorf("expected error getting cpu accounting info for bogus cgroup") + } +} diff --git a/cgroup/fixtures/README.md b/cgroup/fixtures/README.md new file mode 100644 index 0000000..6f071ae --- /dev/null +++ b/cgroup/fixtures/README.md @@ -0,0 +1,11 @@ +Contains fixed state used as a baseline for running tests. The purpose of these test fixtures +is to ensure that there is a well known and fixed environment in which tests are run so that +results are repeatable + +Note: including symlinks into fixtures is important for testing. However this can break +community toolchains and OS'es in unexpected ways. 
prometheus/procfs addressed this +issue by using ttar to flatten their fixtures directory into a single standard file, and +only folks who are running testing will unflatten this file. This prevents symlinks from +appearing on disk for anyone only doing a git checkout. May be something to consider if +we get problem reports. See https://github.com/prometheus/procfs/pull/79 + diff --git a/cgroup/fixtures/cgroup-hybrid/cpu b/cgroup/fixtures/cgroup-hybrid/cpu new file mode 120000 index 0000000..c5a8e01 --- /dev/null +++ b/cgroup/fixtures/cgroup-hybrid/cpu @@ -0,0 +1 @@ +cpu,cpuacct \ No newline at end of file diff --git a/cgroup/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all b/cgroup/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all new file mode 100644 index 0000000..609c1f0 --- /dev/null +++ b/cgroup/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all @@ -0,0 +1,5 @@ +cpu user system +0 7746241803817 122204678803 +1 7385109326139 107275346559 +2 7307001772824 94093225654 +3 7093088113588 295613450937 diff --git a/cgroup/fixtures/cgroup-hybrid/memory/memory.stat b/cgroup/fixtures/cgroup-hybrid/memory/memory.stat new file mode 100644 index 0000000..a2fd567 --- /dev/null +++ b/cgroup/fixtures/cgroup-hybrid/memory/memory.stat @@ -0,0 +1,36 @@ +cache 69984256 +rss 4866048 +rss_huge 0 +shmem 491520 +mapped_file 9818112 +dirty 8192 +writeback 0 +swap 0 +pgpgin 397887 +pgpgout 379613 +pgfault 541883 +pgmajfault 232 +inactive_anon 4096 +active_anon 5353472 +inactive_file 2621440 +active_file 63873024 +unevictable 2998272 +hierarchical_memory_limit 9223372036854771712 +hierarchical_memsw_limit 9223372036854771712 +total_cache 12469047296 +total_rss 2168885248 +total_rss_huge 10485760 +total_shmem 13168640 +total_mapped_file 228769792 +total_dirty 573440 +total_writeback 0 +total_swap 0 +total_pgpgin 135633232 +total_pgpgout 132074848 +total_pgfault 96879883 +total_pgmajfault 24509 +total_inactive_anon 11632640 +total_active_anon 2134667264 +total_inactive_file 
9267785728 +total_active_file 3208708096 +total_unevictable 15052800 diff --git a/cgroup/memory.go b/cgroup/memory.go new file mode 100644 index 0000000..506e392 --- /dev/null +++ b/cgroup/memory.go @@ -0,0 +1,232 @@ +package cgroup + +import ( + "bufio" + "bytes" + "fmt" + "github.com/pkg/errors" + "io" + "strconv" + "strings" +) + +// MemStat represents the memory.stat file exported by the kernel when the memory cgroup controller is enabled. +// See https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt +type MemStat struct { + // bytes of page cache memory + CacheBytes uint64 + // bytes of anon and swap cache, including transparent hugepages. + // Note: Only anonymous and swap cache memory is listed as part of 'rss' stat. + // This should not be confused with the true 'resident set size' or the + // amount of physical memory used by the cgroup. 'rss + file_mapped" will + // give you resident set size of cgroup + RssBytes uint64 + // bytes of anonymous transparent hugepages + RssHugeBytes uint64 + // No kernel documentation + Shmem uint64 + // bytes of mapped files (includes tmpfs/shmem) + MappedFileBytes uint64 + // number of charging events to the memory cgroup. The charging + // event happens each time a page is accounted as either mapped + // anon page(RSS) or cache page(Page CacheBytes) to the cgroup. + PgPgIn uint64 + // # of uncharging events to the memory cgroup. The uncharging + // event happens each time a page is unaccounted from the cgroup. + PgPgOut uint64 + // no kernel documentation + PgFault uint64 + // no kernel documentation + PgMajFault uint64 + // # of bytes of swap usage + SwapBytes uint64 + // # of bytes that are waiting to get written back to the disk. + DirtyBytes uint64 + // writeback - # of bytes of file/anon cache that are queued for syncing to + // disk. + WritebackBytes uint64 + // inactive_anon - # of bytes of anonymous and swap cache memory on inactive + // LRU list. 
+ InactiveAnonBytes uint64 + // active_anon - # of bytes of anonymous and swap cache memory on active + // LRU list. + ActiveAnonBytes uint64 + // inactive_file - # of bytes of file-backed memory on inactive LRU list. + InactiveFileBytes uint64 + // active_file - # of bytes of file-backed memory on active LRU list. + ActiveFileBytes uint64 + // unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). + UnevictableBytes uint64 + + // status considering hierarchy (see memory.use_hierarchy settings) + // # of bytes of memory limit with regard to hierarchy + // under which the memory cgroup is + HierarchialMemoryLimitBytes uint64 + // # of bytes of memory+swap limit with regard to + // hierarchy under which memory cgroup is. + HierarchialMemswLimitBytes uint64 + // total_cache - sum of all children's "cache" + TotalCacheBytes uint64 + // No kernel doc + TotalDirtyBytes uint64 + // total_rss - sum of all children's "rss" + TotalRssBytes uint64 + // No kernel docs + TotalRssHugeBytes uint64 + // total_mapped_file - sum of all children's "mapped_file" + TotalMappedFileBytes uint64 + // No kernel docs + TotalPgFault uint64 + // No kernel docs + TotalPgMajFault uint64 + // total_pgpgin - sum of all children's "pgpgin" + TotalPgPgIn uint64 + // total_pgpgout - sum of all children's "pgpgout" + TotalPgPgOut uint64 + // No kernel doc + TotalShmemBytes uint64 + // total_swap - sum of all children's "swap" + TotalSwapBytes uint64 + // total_inactive_anon - sum of all children's "inactive_anon" + TotalInactiveAnonBytes uint64 + // total_active_anon - sum of all children's "active_anon" + TotalActiveAnonBytes uint64 + // total_inactive_file - sum of all children's "inactive_file" + TotalInactiveFileBytes uint64 + // total_active_file - sum of all children's "active_file" + TotalActiveFileBytes uint64 + // total_unevictable - sum of all children's "unevictable" + TotalUnevictableBytes uint64 + // No kernel doc + TotalWritebackBytes uint64 + // # The following
additional stats are dependent on CONFIG_DEBUG_VM. + // inactive_ratio - VM internal parameter. (see mm/page_alloc.c) + // recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) + // recent_rotated_file - VM internal parameter. (see mm/vmscan.c) + // recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) + // recent_scanned_file - VM internal parameter. (see mm/vmscan.c) +} + +func parseMemStat(r io.Reader) (*MemStat, error) { + var m MemStat + s := bufio.NewScanner(r) + for s.Scan() { + // Each line has at least a name and value + fields := strings.Fields(s.Text()) + if len(fields) < 2 { + return nil, fmt.Errorf("malformed memory.stat line: %q", s.Text()) + } + + v, err := strconv.ParseUint(fields[1], 0, 64) + if err != nil { + return nil, err + } + + switch fields[0] { + case "cache": + m.CacheBytes = v + case "rss": + m.RssBytes = v + case "rss_huge": + m.RssHugeBytes = v + case "shmem": + m.Shmem = v + case "mapped_file": + m.MappedFileBytes = v + case "dirty": + m.DirtyBytes = v + case "writeback": + m.WritebackBytes = v + case "swap": + m.SwapBytes = v + case "pgpgin": + m.PgPgIn = v + case "pgpgout": + m.PgPgOut = v + case "pgfault": + m.PgFault = v + case "pgmajfault": + m.PgMajFault = v + case "inactive_anon": + m.InactiveAnonBytes = v + case "active_anon": + m.ActiveAnonBytes = v + case "inactive_file": + m.InactiveFileBytes = v + case "active_file": + m.ActiveFileBytes = v + case "unevictable": + m.UnevictableBytes = v + case "hierarchical_memory_limit": + m.HierarchialMemoryLimitBytes = v + case "hierarchical_memsw_limit": + m.HierarchialMemswLimitBytes = v + case "total_cache": + m.TotalCacheBytes = v + case "total_rss": + m.TotalRssBytes = v + case "total_rss_huge": + m.TotalRssHugeBytes = v + case "total_shmem": + m.TotalShmemBytes = v + case "total_mapped_file": + m.TotalMappedFileBytes = v + case "total_dirty": + m.TotalDirtyBytes = v + case "total_writeback": + m.TotalWritebackBytes = v + case "total_swap": + m.TotalSwapBytes = v + 
case "total_pgpgin": + m.TotalPgPgIn = v + case "total_pgpgout": + m.TotalPgPgOut = v + case "total_pgfault": + m.TotalPgFault = v + case "total_pgmajfault": + m.TotalPgMajFault = v + case "total_inactive_anon": + m.TotalInactiveAnonBytes = v + case "total_inactive_file": + m.TotalInactiveFileBytes = v + case "total_active_anon": + m.TotalActiveAnonBytes = v + case "total_active_file": + m.TotalActiveFileBytes = v + case "total_unevictable": + m.TotalUnevictableBytes = v + } + } + + return &m, nil +} + +// NewMemStat will locate and read the kernel's memory accounting info for +// the provided systemd cgroup subpath. +func NewMemStat(cgSubpath string) (MemStat, error) { + fs, err := NewDefaultFS() + if err != nil { + return MemStat{}, err + } + return fs.NewMemStat(cgSubpath) +} + +// NewMemStat returns information about cgroup memory statistics. +func (fs FS) NewMemStat(cgSubpath string) (MemStat, error) { + cgPath, err := fs.cgGetPath("memory", cgSubpath, "memory.stat") + if err != nil { + return MemStat{}, errors.Wrapf(err, "unable to get cpu controller path") + } + + b, err := ReadFileNoStat(cgPath) + if err != nil { + return MemStat{}, err + } + + m, err := parseMemStat(bytes.NewReader(b)) + if err != nil { + return MemStat{}, fmt.Errorf("failed to parse meminfo: %v", err) + } + + return *m, nil +} diff --git a/cgroup/memory_test.go b/cgroup/memory_test.go new file mode 100644 index 0000000..192f450 --- /dev/null +++ b/cgroup/memory_test.go @@ -0,0 +1,58 @@ +package cgroup + +import ( + "reflect" + "testing" +) + +func TestMemStat(t *testing.T) { + expected := MemStat{ + CacheBytes: 69984256, + RssBytes: 4866048, + RssHugeBytes: 0, + Shmem: 491520, + MappedFileBytes: 9818112, + DirtyBytes: 8192, + WritebackBytes: 0, + SwapBytes: 0, + PgPgIn: 397887, + PgPgOut: 379613, + PgFault: 541883, + PgMajFault: 232, + InactiveAnonBytes: 4096, + ActiveAnonBytes: 5353472, + InactiveFileBytes: 2621440, + ActiveFileBytes: 63873024, + UnevictableBytes: 2998272, + +
HierarchialMemoryLimitBytes: 9223372036854771712, + HierarchialMemswLimitBytes: 9223372036854771712, + TotalCacheBytes: 12469047296, + TotalRssBytes: 2168885248, + TotalRssHugeBytes: 10485760, + TotalShmemBytes: 13168640, + TotalMappedFileBytes: 228769792, + TotalDirtyBytes: 573440, + TotalWritebackBytes: 0, + TotalSwapBytes: 0, + TotalPgPgIn: 135633232, + TotalPgPgOut: 132074848, + TotalPgFault: 96879883, + TotalPgMajFault: 24509, + TotalInactiveAnonBytes: 11632640, + TotalActiveAnonBytes: 2134667264, + TotalInactiveFileBytes: 9267785728, + TotalActiveFileBytes: 3208708096, + TotalUnevictableBytes: 15052800} + + have, err := getHybridFixtures(t).NewMemStat("/") + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(have, expected) { + t.Logf("have: %+v", have) + t.Logf("expected: %+v", expected) + t.Errorf("structs are not equal") + } +} diff --git a/main.go b/main.go index 16f6eec..190d872 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,8 @@ package main import ( "net/http" _ "net/http/pprof" + "os" + "sync" "github.com/povilasv/prommod" "github.com/povilasv/systemd_exporter/systemd" @@ -14,6 +16,40 @@ import ( ) func main() { + listenAddress := mainCore() + + log.Infoln("Listening on", listenAddress) + if err := http.ListenAndServe(listenAddress, nil); err != nil { + log.Fatal(err) + } + +} + +func testMain(wg *sync.WaitGroup) *http.Server { + listenAddress := mainCore() + + // Launch server in background + srv := &http.Server{Addr: listenAddress} + log.Infoln("Queuing test server startup") + go func() { + defer wg.Done() + + // ErrServerClosed indicates graceful close + log.Infoln("Test server listening on", listenAddress) + if err := srv.ListenAndServe(); err != http.ErrServerClosed { + // unexpected error. port in use? 
+ log.Fatalf("ListenAndServe(): %v", err) + } + + // Reset http package + http.DefaultServeMux = http.NewServeMux() + log.Infoln("Test server shutdown") + }() + + return srv +} + +func mainCore() string { var ( listenAddress = kingpin.Flag( "web.listen-address", @@ -37,6 +73,7 @@ func main() { kingpin.Version(prommod.Print(version.Print("systemd_exporter"))) kingpin.HelpFlag.Short('h') kingpin.Parse() + log.Debugf("Parsed '%s'", os.Args) log.Infoln("Starting systemd_exporter", version.Info()) log.Infoln("Build context", version.BuildContext()) @@ -85,8 +122,5 @@ func main() { } }) - log.Infoln("Listening on", *listenAddress) - if err := http.ListenAndServe(*listenAddress, nil); err != nil { - log.Fatal(err) - } + return *listenAddress } diff --git a/main_test.go b/main_test.go new file mode 100644 index 0000000..67241cc --- /dev/null +++ b/main_test.go @@ -0,0 +1,120 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "os" + "sync" + "testing" + "time" +) + +var ( + address = "127.0.0.1:9550" + binaryName = "systemd_exporter" + defaultArgs = []string{binaryName, fmt.Sprintf("--web.listen-address=%s", address)} +) + +func TestMain(m *testing.M) { + // TODO accept arg for listen address + os.Exit(m.Run()) +} + +// TestNoop only exists as an example of how you can test +func TestNoop(t *testing.T) { + noop := func() error { return nil } + if err := runServerAndTest(defaultArgs, address, noop); err != nil { + t.Errorf("No op failed") + } +} + +// TestVersionFlag is an example of running a test that does not rely on the server being +// online. TODO make a reusable runTest() for this use case +// TODO this is broken. Because runServerAndTest is waiting for the server to come online, +// but it never does (because our args mean it prints version and exits), we do not exit +// cleanly. Something hangs, which means test coverage is never written out.
Bummer +// func TestVersionFlag(t *testing.T) { +// noop := func() error { return nil } +// runServerAndTest(append(defaultArgs, "--version"), address, noop) +// } + +func TestMetricEndpointReturnsHttp200(t *testing.T) { + test := func() error { + resp, err := getMetrics() + if err != nil { + return err + } + if want, have := http.StatusOK, resp.StatusCode; want != have { + return fmt.Errorf("wanted status code %d, received %d", want, have) + } + return nil + } + if err := runServerAndTest(defaultArgs, address, test); err != nil { + t.Errorf("Metric 200 failed") + } +} + +func runServerAndTest(args []string, url string, fn func() error) error { + // Request server startup + serverDone := &sync.WaitGroup{} + serverDone.Add(1) + // TODO it would be cleaner to change main.go to use kingpin.MustParse + os.Args = args + srv := testMain(serverDone) + + // ensure server is online before running test + fmt.Println("Waiting on test server startup...") + for i := 0; i < 10; i++ { + root := fmt.Sprintf("http://%s/", address) + if resp, err := getURL(root); err == nil && resp.StatusCode == http.StatusOK { + break + } + time.Sleep(10 * time.Millisecond) + if i == 9 { + return fmt.Errorf("can't connect to %s - unable to run any tests", root) + } + } + fmt.Println("Test server ready, running test...") + + // Run the test + err := fn() + + // Shutdown the server before we return + fmt.Println("Test complete, shutting down server...") + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() // TODO is this correct? + + if err := srv.Shutdown(ctx); err != nil { + // TODO is this what we should do with serverDone?
+ defer serverDone.Wait() + return fmt.Errorf("failed to start command: %s", err) + } + + serverDone.Wait() + fmt.Println("Test server shutdown, testcase complete.") + + return err +} + +func getMetrics() (*http.Response, error) { + return getURL(fmt.Sprintf("http://%s/metrics", address)) +} + +func getURL(url string) (*http.Response, error) { + resp, err := http.Get(url) + if err != nil { + return nil, err + } + // b, err := ioutil.ReadAll(resp.Body) + // if err != nil { + // return nil, err + // } + // if err := resp.Body.Close(); err != nil { + // return nil, err + // } + // if want, have := http.StatusOK, resp.StatusCode; want != have { + // return nil, fmt.Errorf("want /metrics status code %d, have %d. Body:\n%s", want, have, b) + // } + return resp, nil +} diff --git a/systemd/cgroups.go b/systemd/cgroups.go deleted file mode 100644 index 05399e0..0000000 --- a/systemd/cgroups.go +++ /dev/null @@ -1,263 +0,0 @@ -package systemd - -import ( - "bufio" - "bytes" - "io" - "io/ioutil" - "os" - "path/filepath" - "strconv" - "strings" - - "github.com/pkg/errors" - "github.com/prometheus/common/log" - "golang.org/x/sys/unix" -) - -// cgUnifiedMountMode constant values describe how cgroup filesystems (aka hierarchies) are -// mounted underneath /sys/fs/cgroup. In cgroups-v1 there are many mounts, -// one per controller (cpu, blkio, etc) and one for systemd itself. In -// cgroups-v2 there is only one mount managed entirely by systemd and -// internally exposing all controller syscalls. 
As kernel+distros migrate towards -// cgroups-v2, systemd has a hybrid mode where it mounts v2 and uses -// that for process management but also mounts all the v1 filesystem -// hierarchies and uses them for resource accounting and control -type cgUnifiedMountMode int8 - -const ( - // unifModeUnknown indicates that we do not know if/how any - // cgroup filesystems are mounted underneath /sys/fs/cgroup - unifModeUnknown cgUnifiedMountMode = iota - // unifModeNone indicates that both systemd and the controllers - // are using v1 legacy mounts and there is no usage of the v2 - // unified hierarchy. a.k.a "legacy hierarchy" - unifModeNone cgUnifiedMountMode = iota - // unifModeSystemd indicates that systemd is using a v2 unified - // hierarcy for organizing processes into control groups, but all - // controller interaction is using v1 per-controller hierarchies. - // a.k.a. "hybrid hierarchy" - unifModeSystemd cgUnifiedMountMode = iota - // unifModeAll indicates that v2 API is in full usage and there - // are no v1 hierarchies exported. Programs (mainly container orchestrators - // such as docker,runc,etc) that rely on v1 APIs will be broken. - // a.k.a. "unified hierarchy" - unifModeAll cgUnifiedMountMode = iota -) - -// WARNING: We only read this data once at process start, systemd updates -// may require restarting systemd-exporter -var cgroupUnified cgUnifiedMountMode = unifModeUnknown - -// Values copied from https://github.com/torvalds/linux/blob/master/include/uapi/linux/magic.h -const ( - tmpFsMagic = 0x01021994 - cgroupSuperMagic = 0x27e0eb - cgroup2SuperMagic = 0x63677270 -) - -// cgUnifiedCached checks the filesystem types mounted under /sys/fs/cgroup to determine -// which systemd layout (legacy/hybrid/unified) is in use. -// We do not bother to track unified_systemd_v232 as our usage does not -// depend on reading the systemd hierarchy directly, we only focus on reading -// the controllers. 
If you care if /sys/fs/cgroup/systemd is v1 or v2 you need -// to track this -// WARNING: We cache this data once at process start. Systemd updates -// may require restarting systemd-exporter -func cgUnifiedCached() (cgUnifiedMountMode, error) { - if cgroupUnified != unifModeUnknown { - return cgroupUnified, nil - } - - var fs unix.Statfs_t - err := unix.Statfs("/sys/fs/cgroup/", &fs) - if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup)") - } - - switch fs.Type { - case cgroup2SuperMagic: - log.Debugf("Found cgroup2 on /sys/fs/cgroup, full unified hierarchy") - cgroupUnified = unifModeAll - case tmpFsMagic: - err := unix.Statfs("/sys/fs/cgroup/unified", &fs) - - // Ignore err, we expect path to be missing on v232 - if err == nil && fs.Type == cgroup2SuperMagic { - log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller") - cgroupUnified = unifModeSystemd - } else { - err := unix.Statfs("/sys/fs/cgroup/systemd", &fs) - if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") - } - switch fs.Type { - case cgroup2SuperMagic: - log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") - cgroupUnified = unifModeSystemd - case cgroupSuperMagic: - log.Debugf("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") - cgroupUnified = unifModeNone - default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) - } - } - default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) - } - - return cgroupUnified, nil -} - -// cgGetPath returns the absolute path for a specific file in a specific controller -// in the specific cgroup denoted by the passed subpath. 
-// Input examples: ("cpu", "/system.slice", "cpuacct.usage_all) -func cgGetPath(controller string, subpath string, suffix string) (string, error) { - // relevant systemd source code in cgroup-util.[h|c] specifically cg_get_path - // 2. Joins controller name with base path - - unified, err := cgUnifiedCached() - if err != nil { - return "", errors.Wrapf(err, "failed to determine cgroup mounting hierarchy") - } - - // TODO Ensure controller name is valid - // TODO Convert controller name into guaranteed valid directory name - dn := controller - - joined := "" - switch unified { - case unifModeNone, unifModeSystemd: - joined = filepath.Join("/sys/fs/cgroup", dn, subpath, suffix) - case unifModeAll: - joined = filepath.Join("/sys/fs/cgroup", subpath, suffix) - default: - return "", errors.Errorf("unknown cgroup mount mode (e.g. unified mode) %d", unified) - } - return joined, nil -} - -// CPUUsage stores one core's worth of CPU usage for a control group -// (aka cgroup) of tasks (e.g. both processes and threads). -// Equivalent to cpuacct.usage_percpu_user and cpuacct.usage_percpu_system -type CPUUsage struct { - CPUId uint32 - SystemNanosec uint64 - UserNanosec uint64 -} - -// CPUAcct stores CPU accounting information (e.g. cpu usage) for a control -// group (cgroup) of tasks. Equivalent to cpuacct.usage_all -type CPUAcct struct { - CPUs []CPUUsage -} - -// UsageUserNanosecs returns user (e.g. non-kernel) cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageUserNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.UserNanosec - } - return nanoseconds -} - -// UsageSystemNanosecs returns system (e.g. kernel) cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. 
-func (c *CPUAcct) UsageSystemNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.SystemNanosec - } - return nanoseconds -} - -// UsageAllNanosecs returns total cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageAllNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.SystemNanosec + cpu.UserNanosec - } - return nanoseconds -} - -// ReadFileNoStat uses ioutil.ReadAll to read contents of entire file. -// This is similar to ioutil.ReadFile but without the call to os.Stat, because -// many files in /proc and /sys report incorrect file sizes (either 0 or 4096). -// Reads a max file size of 512kB. For files larger than this, a scanner -// should be used. -// COPIED FROM prometheus/procfs WHICH ALSO USES APACHE 2.0 -func ReadFileNoStat(filename string) ([]byte, error) { - const maxBufferSize = 1024 * 512 - - f, err := os.Open(filename) - if err != nil { - return nil, err - } - defer f.Close() - - reader := io.LimitReader(f, maxBufferSize) - return ioutil.ReadAll(reader) -} - -// NewCPUAcct will locate and read the kernel's cpu accounting info for -// the provided systemd cgroup subpath. 
-func NewCPUAcct(cgSubpath string) (*CPUAcct, error) { - var cpuUsage CPUAcct - - cgPath, err := cgGetPath("cpu", cgSubpath, "cpuacct.usage_all") - if err != nil { - return nil, errors.Wrapf(err, "unable to get cpu controller path") - } - - // Example cpuacct.usage_all - // cpu user system - // 0 21165924 0 - // 1 13334251 0 - b, err := ReadFileNoStat(cgPath) - if err != nil { - return nil, errors.Wrapf(err, "unable to read file %s", cgPath) - } - - scanner := bufio.NewScanner(bytes.NewReader(b)) - if ok := scanner.Scan(); !ok { - return nil, errors.Errorf("unable to scan file %s", cgPath) - } - if err := scanner.Err(); err != nil { - return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) - } - for scanner.Scan() { - if err := scanner.Err(); err != nil { - return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) - } - text := scanner.Text() - vals := strings.Split(text, " ") - if len(vals) != 3 { - return nil, errors.Errorf("unable to parse contents of file %s", cgPath) - } - cpu, err := strconv.ParseUint(vals[0], 10, 32) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as uint32 (from %s)", vals[0], cgPath) - } - user, err := strconv.ParseUint(vals[1], 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as uint64 (from %s)", vals[1], cgPath) - } - sys, err := strconv.ParseUint(vals[2], 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as an in (from %s)", vals[2], cgPath) - } - onecpu := CPUUsage{ - CPUId: uint32(cpu), - UserNanosec: user, - SystemNanosec: sys, - } - cpuUsage.CPUs = append(cpuUsage.CPUs, onecpu) - } - if len(cpuUsage.CPUs) < 1 { - return nil, errors.Errorf("no CPU/core info extracted from %s", cgPath) - } - - return &cpuUsage, nil -} diff --git a/systemd/systemd.go b/systemd/systemd.go index 87af595..e102b38 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -3,6 +3,8 @@ package systemd import ( "fmt" "math" + "os" + // Register pprof-over-http 
handlers _ "net/http/pprof" "regexp" @@ -12,6 +14,7 @@ import ( "github.com/coreos/go-systemd/dbus" "github.com/pkg/errors" + "github.com/povilasv/systemd_exporter/cgroup" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" "github.com/prometheus/procfs" @@ -55,11 +58,17 @@ type Collector struct { socketRefusedConnectionsDesc *prometheus.Desc cpuTotalDesc *prometheus.Desc unitCPUTotal *prometheus.Desc - openFDs *prometheus.Desc - maxFDs *prometheus.Desc - vsize *prometheus.Desc - maxVsize *prometheus.Desc - rss *prometheus.Desc + + unitMemCache *prometheus.Desc + unitMemRss *prometheus.Desc + unitMemDirty *prometheus.Desc + unitMemShmem *prometheus.Desc + + openFDs *prometheus.Desc + maxFDs *prometheus.Desc + vsize *prometheus.Desc + maxVsize *prometheus.Desc + rss *prometheus.Desc unitWhitelistPattern *regexp.Regexp unitBlacklistPattern *regexp.Regexp @@ -134,6 +143,27 @@ func NewCollector(logger log.Logger) (*Collector, error) { []string{"name", "type", "mode"}, nil, ) + unitMemCache := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_cached_bytes"), + "Unit Page CacheBytes", + []string{"name", "type"}, nil, + ) + unitMemRss := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_rss_bytes"), + "Unit anon+swap cache, incl. transparent hugepages. 
Not true RSS", + []string{"name", "type"}, nil, + ) + unitMemDirty := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_dirty_bytes"), + "Unit bytes waiting to get written to disk", + []string{"name", "type"}, nil, + ) + unitMemShmem := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_shmem_bytes"), + "", + []string{"name", "type"}, nil, + ) + openFDs := prometheus.NewDesc( prometheus.BuildFQName(namespace, "", "process_open_fds"), "Number of open file descriptors.", @@ -179,6 +209,10 @@ func NewCollector(logger log.Logger) (*Collector, error) { socketRefusedConnectionsDesc: socketRefusedConnectionsDesc, cpuTotalDesc: cpuTotalDesc, unitCPUTotal: unitCPUTotal, + unitMemCache: unitMemCache, + unitMemRss: unitMemRss, + unitMemDirty: unitMemDirty, + unitMemShmem: unitMemShmem, openFDs: openFDs, maxFDs: maxFDs, vsize: vsize, @@ -222,6 +256,13 @@ func parseUnitType(unit dbus.UnitStatus) string { return t[len(t)-1] } +// parseUnitTypeInterface extracts the dbus interface suffix for the interface unique to the passed unit type. +// For example, a systemd "service unit" will be exposed on dbus as "service objects", and all "service objects" +// implement the org.freedesktop.systemd1.Service interface. This is used as input for dbus.GetUnitTypeProperty +func parseUnitTypeInterface(unit dbus.UnitStatus) string { + return strings.Title(parseUnitType(unit)) +} + func (c *Collector) collect(ch chan<- prometheus.Metric) error { begin := time.Now() conn, err := c.newDbus() @@ -260,80 +301,78 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un logger := c.logger.With("unit", unit.Name) - // Collect unit_state for all + // Collect unit_state for all unit types err := c.collectUnitState(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) // TODO should we continue processing here?
} - switch { - case strings.HasSuffix(unit.Name, ".service"): - err = c.collectServiceMetainfo(conn, ch, unit) + // Collect metrics from cgroups + switch parseUnitType(unit) { + case "service", "mount", "socket", "swap", "slice": + cgroupPath, err := c.getControlGroup(conn, unit) + if err != nil { + logger.Warnf(errUnitMetricsMsg, err) + } + // Everything below requires a cgroup + if cgroupPath == nil { + break + } + err = c.collectUnitCPUMetrics(*cgroupPath, conn, ch, unit) + if err != nil { + // Most sockets do not have a cpu cgroupfs entry, but a few big ones do (notably docker.socket). Quiet down + // error reporting if error came from a socket + if parseUnitType(unit) != "socket" { + logger.Warnf(errUnitMetricsMsg, err) + } + } + err = c.collectUnitMemMetrics(*cgroupPath, conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } + } + // Collect metrics from dbus + switch parseUnitType(unit) { + case "service": + err = c.collectServiceMetainfo(conn, ch, unit) + if err != nil { + logger.Warnf(errUnitMetricsMsg, err) + } err = c.collectServiceStartTimeMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - if *enableRestartsMetrics { err = c.collectServiceRestartCount(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } } - err = c.collectServiceTasksMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectServiceProcessMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUUsageMetrics("Service", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - case strings.HasSuffix(unit.Name, ".mount"): + case "mount": err = c.collectMountMetainfo(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUUsageMetrics("Mount", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - case strings.HasSuffix(unit.Name, ".timer"): + case "timer": err := 
c.collectTimerTriggerTime(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - case strings.HasSuffix(unit.Name, ".socket"): + case "socket": err := c.collectSocketConnMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - // Most sockets do not have a cpu cgroupfs entry, but a - // few do, notably docker.socket - err = c.collectUnitCPUUsageMetrics("Socket", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - case strings.HasSuffix(unit.Name, ".swap"): - err = c.collectUnitCPUUsageMetrics("Swap", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - case strings.HasSuffix(unit.Name, ".slice"): - err = c.collectUnitCPUUsageMetrics("Slice", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } default: c.logger.Debugf(infoUnitNoHandler, unit.Name) } @@ -515,16 +554,15 @@ func (c *Collector) mustGetUnitStringTypeProperty(unitType string, return propVal } -// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their -// resource usage from the kernel's cgroupfs cpu hierarchy. 
The only change is which dbus item we are querying -func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { - propCGSubpath, err := conn.GetUnitTypeProperty(unit.Name, unitType, "ControlGroup") +func (c *Collector) getControlGroup(conn *dbus.Conn, unit dbus.UnitStatus) (*string, error) { + unitTypeInterface := parseUnitTypeInterface(unit) + propCGSubpath, err := conn.GetUnitTypeProperty(unit.Name, unitTypeInterface, "ControlGroup") if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") + return nil, errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") } cgSubpath, ok := propCGSubpath.Value.Value().(string) if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) + return nil, errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) } switch { @@ -532,37 +570,32 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, cgSubpath == "" && unit.ActiveState == "failed": // Expected condition, systemd has cleaned up and // we have nothing to record - return nil + return nil, nil case cgSubpath == "" && unit.ActiveState == "active": // Unexpected. Why is there no cgroup on an active unit? 
- subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) - return errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) + subType := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Type", "unknown", conn, unit) + slice := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Slice", "unknown", conn, unit) + return nil, errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) case cgSubpath == "": // We are likely reading a unit that is currently changing state, so // we record this and bail - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) + subType := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Type", "unknown", conn, unit) + slice := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Slice", "unknown", conn, unit) log.Debugf("Read 'no cgroup' from unit (name=%s state=%s subtype=%s slice=%s) ", unit.Name, unit.ActiveState, subType, slice) - return nil - } - - propCPUAcct, err := conn.GetUnitTypeProperty(unit.Name, unitType, "CPUAccounting") - if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "CPUAccounting") - } - cpuAcct, ok := propCPUAcct.Value.Value().(bool) - if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "CPUAccounting", propCPUAcct.Value.Value()) - } - if !cpuAcct { - return nil + return nil, nil + default: + return &cgSubpath, nil } +} - cpuUsage, err := NewCPUAcct(cgSubpath) +// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their +// resource usage from the kernel's cgroupfs cpu hierarchy. 
The only change is which dbus item we are querying +func (c *Collector) collectUnitCPUMetrics(cgSubpath string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { + // Don't bother reading CPUAccounting prop. It's faster to attempt a file read than to query dbus, and it works + // in more situations as well + cpuUsage, err := cgroup.NewCPUAcct(cgSubpath) if err != nil { - if unitType == "Socket" { - log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) + if perr, ok := err.(*os.PathError); ok && perr.Op == "open" { return nil } return errors.Wrapf(err, errControlGroupReadMsg, "CPU usage") @@ -581,6 +614,36 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, return nil } +func (c *Collector) collectUnitMemMetrics(cgSubpath string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { + // Don't bother reading MemoryAccounting prop. It's faster to attempt a file read than to query dbus, and it works + // in more situations as well. For example, the + // kernel cmdline may have cgroups_enabled=memory while systemd still has DefaultMemoryAccounting=no.
All cgroups + // will have a memory.stat file, but systemd will still report MemoryAccounting=false for most units + memStat, err := cgroup.NewMemStat(cgSubpath) + if err != nil { + if perr, ok := err.(*os.PathError); ok && perr.Op == "open" { + return nil + } + return errors.Wrapf(err, errControlGroupReadMsg, "Memory stat") + } + + unitType := parseUnitType(unit) + ch <- prometheus.MustNewConstMetric( + c.unitMemCache, prometheus.GaugeValue, + float64(memStat.CacheBytes), unit.Name, unitType) + ch <- prometheus.MustNewConstMetric( + c.unitMemRss, prometheus.GaugeValue, + float64(memStat.RssBytes), unit.Name, unitType) + ch <- prometheus.MustNewConstMetric( + c.unitMemDirty, prometheus.GaugeValue, + float64(memStat.DirtyBytes), unit.Name, unitType) + ch <- prometheus.MustNewConstMetric( + c.unitMemShmem, prometheus.GaugeValue, + float64(memStat.Shmem), unit.Name, unitType) + + return nil +} + func (c *Collector) collectSocketConnMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted") if err != nil { @@ -605,7 +668,7 @@ func (c *Collector) collectSocketConnMetrics(conn *dbus.Conn, ch chan<- promethe return errors.Wrapf(err, errGetPropertyMsg, "NRefused") } ch <- prometheus.MustNewConstMetric( - c.socketRefusedConnectionsDesc, prometheus.GaugeValue, + c.socketRefusedConnectionsDesc, prometheus.CounterValue, float64(refusedConnectionCount.Value.Value().(uint32)), unit.Name) return nil diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go new file mode 100644 index 0000000..3f4f3ee --- /dev/null +++ b/systemd/systemd_test.go @@ -0,0 +1,26 @@ +package systemd + +import ( + "github.com/coreos/go-systemd/dbus" + "testing" +) + +func TestParseUnitType(t *testing.T) { + x := dbus.UnitStatus{ + Name: "test.service", + Description: "", + LoadState: "", + ActiveState: "", + SubState: "", + Followed: "", + Path: "", + JobId: 0, + JobType: "", + 
JobPath: "", + } + found := parseUnitType(x) + if found != "service" { + t.Errorf("Bad unit name parsing. Wanted %s got %s", "service", found) + } + +}