From c6abe3dca57a86b1ea1e91387167c6a4f42b9c6f Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 10:02:21 -0500 Subject: [PATCH 01/57] Change socket refused from gauge to counter Looking into current systemd source code, this value only increases. e.g. https://github.com/systemd/systemd/blob/7c286cd6a615fa9bce8a2830133bcf89becfbf9f/src/core/socket.c#L2416 --- systemd/systemd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index 87af595..3ac1ff4 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -605,7 +605,7 @@ func (c *Collector) collectSocketConnMetrics(conn *dbus.Conn, ch chan<- promethe return errors.Wrapf(err, errGetPropertyMsg, "NRefused") } ch <- prometheus.MustNewConstMetric( - c.socketRefusedConnectionsDesc, prometheus.GaugeValue, + c.socketRefusedConnectionsDesc, prometheus.CounterValue, float64(refusedConnectionCount.Value.Value().(uint32)), unit.Name) return nil From 36d4121a6d037b32246e2390301a3a71f2239558 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 10:02:49 -0500 Subject: [PATCH 02/57] Fix counter-or-gauge metric documentation --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9d52ad1..9fa8f76 100644 --- a/README.md +++ b/README.md @@ -79,15 +79,15 @@ Note that a number of unit types are filtered by default | ----------------------------------------- | ----------- | -------- | ------------------------------------------------------------------ | | systemd_exporter_build_info | Gauge | UNSTABLE | 1 per systemd-exporter | | systemd_unit_info | Gauge | UNSTABLE | 1 per service + 1 per mount | -| systemd_unit_cpu_seconds_total | Gauge | UNSTABLE | 2 per mount/scope/slice/socket/swap {mode="system/user"} | +| systemd_unit_cpu_seconds_total | Counter | UNSTABLE | 2 per mount/scope/slice/socket/swap {mode="system/user"} | | systemd_unit_state | Gauge | UNSTABLE | 5 per unit {state="activating/active/deactivating/failed/inactive} | | systemd_unit_tasks_current | Gauge | UNSTABLE | 1 per service | | systemd_unit_tasks_max | Gauge | UNSTABLE | 1 per service | | systemd_unit_start_time_seconds | Gauge | UNSTABLE | 1 per service | -| systemd_service_restart_total | Gauge | UNSTABLE | 1 per service | +| systemd_service_restart_total | Counter | UNSTABLE | 1 per service | | systemd_socket_accepted_connections_total | Counter | UNSTABLE | 1 per socket | | systemd_socket_current_connections | Gauge | UNSTABLE | 1 per socket | -| systemd_socket_refused_connections_total | Gauge | UNSTABLE | 1 per socket | +| systemd_socket_refused_connections_total | Counter | UNSTABLE | 1 per socket. Requires systemd>239 | | systemd_timer_last_trigger_seconds | Gauge | UNSTABLE | 1 per timer | | systemd_process_resident_memory_bytes | Gauge | UNSTABLE | 1 per service | | systemd_process_virtual_memory_bytes | Gauge | UNSTABLE | 1 per service | From d4c73f7ec08dc92bbbd096bac82a8f9ddb98fb4f Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 10:09:57 -0500 Subject: [PATCH 03/57] Markdown cleanup --- CHANGELOG.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b34087b..60ea02d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,20 +2,18 @@ ### **Breaking changes** -* `systemd_unit_state` label `type` has new meaning - Now shows Unit type (`service`, `scope`, etc), not Service Unit types (`simple`, `forking`, etc) - or mount unit types(`aufs`,`ext3`, etc). Service and mount types have been moved to `systemd_unit_info` +* `systemd_unit_state` label `type` has new meaning. Previously `type` contained service unit type (`simple`, `forking`, etc) or mount unit types (`aufs`, `ext3`, etc). Now `systemd_unit_state{type}` contains overall unit type (`service`, `scope`, etc) to allow easy PromQL group by clauses. Service and mount types have been moved to `systemd_unit_info` ### Changes - [FEATURE] Read unit CPU usage from cgroup. Added `systemd_unit_cpu_seconds_total` metric. **Note** - Untested on unified hierarchy - [FEATURE] Add `systemd_unit_info` with metainformation about units incl. subtype specific info - [ENHANCEMENT] Added `type` label to all metrics named `systemd_unit-*` to support PromQL grouping -* [ENHANCEMENT] `systemd_unit_state` works for all unit types, not just service and mount units -* [ENHANCEMENT] Scrapes are approx 80% faster. If needed, set GOMAXPROCS to limit max concurrency -* [CHANGE] Start tracking metric cardinality in readme -* [CHANGE] Expanded default set of unit types monitored. Only device unit types are not enabled by default -* [BUGFIX] `timer_last_trigger_seconds` metric is now exported as expected for all timers +- [ENHANCEMENT] `systemd_unit_state` works for all unit types, not just service and mount units +- [ENHANCEMENT] Scrapes are approx 80% faster. If needed, set GOMAXPROCS to limit max concurrency +- [CHANGE] Start tracking metric cardinality in readme +- [CHANGE] Expanded default set of unit types monitored. Only device unit types are not enabled by default +- [BUGFIX] `timer_last_trigger_seconds` metric is now exported as expected for all timers ## 0.2.0 / 2019-03-20 From 2351bdb232f638d1a6c1da6e3ba9f207475aa0d3 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Wed, 29 Jan 2020 22:40:36 -0500 Subject: [PATCH 04/57] Initial test script based on node_exporter Main problem is that this does not support test coverage, because we are using an external binary as the binary-under-test and coverage works by compiling a custom binary with coverage-tracking-logic when running go test. It's not simple to force compilation of these tracking statements, so as long as we use an external binary we cannot have coverage data. Minor issue is that the process launch time is relatively slow. Not an issue when running 2-3 tests, but this is may be an issue down the road --- main_test.go | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 main_test.go diff --git a/main_test.go b/main_test.go new file mode 100644 index 0000000..fd0c935 --- /dev/null +++ b/main_test.go @@ -0,0 +1,107 @@ +package main + +import ( + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "sync" + "testing" + "time" +) + +var ( + binaryName = "systemd_exporter" + binaryPath = filepath.Join(binaryName) + // os.Getenv("GOPATH"), "bin/node_exporter") +) + +const ( + address = "localhost:9558" +) + +func testFlagVersion(t *testing.T) { + if _, err := os.Stat(binaryPath); err != nil { + t.Skipf("binary not available, try to run `go build` first: %s", err) + } + exporter := exec.Command(binaryPath, "--version", address) + if err := exporter.Run(); err != nil { + t.Error(err) + } +} + +func testMetricsReturns200(t *testing.T) { + if _, err := os.Stat(binaryPath); err != nil { + t.Skipf("binary not available, try to run `go build` first: %s", err) + } + + exporter := exec.Command(binaryPath, "--web.listen-address", address) + test := func(_ int) error { + resp, err := queryExporterMetrics() + if err != nil { + return err + } + if want, have := http.StatusOK, resp.StatusCode; want != have { + return fmt.Errorf("wanted status code %d, received %d", want, have) + } + + return nil + } + + if err := runCommandAndTests(exporter, address, test); err != nil { + t.Error(err) + } +} + +func queryExporterMetrics() (*http.Response, error) { + return queryExporter(fmt.Sprintf("http://%s/metrics", address)) +} + +func queryExporter(url string) (*http.Response, error) { + resp, err := http.Get(url) + if err != nil { + return nil, err + } + // b, err := ioutil.ReadAll(resp.Body) + // if err != nil { + // return nil, err + // } + // if err := resp.Body.Close(); err != nil { + // return nil, err + // } + // if want, have := http.StatusOK, resp.StatusCode; want != have { + // return nil, fmt.Errorf("want /metrics status code %d, have %d. Body:\n%s", want, have, b) + // } + return resp, nil +} + +func runCommandAndTests(cmd *exec.Cmd, address string, fn func(pid int) error) error { + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start command: %s", err) + } + + // ensure server is online before running test + time.Sleep(10 * time.Millisecond) + for i := 0; i < 10; i++ { + root := fmt.Sprintf("http://%s/", address) + if resp, err := queryExporter(root); err == nil && resp.StatusCode == http.StatusOK { + break + } + time.Sleep(500 * time.Millisecond) + if cmd.Process == nil || i == 9 { + return fmt.Errorf("can't connect to %s - unable to run any tests", root) + } + } + + errc := make(chan error) + go func(pid int) { + errc <- fn(pid) + }(cmd.Process.Pid) + + err := <-errc + if cmd.Process != nil { + cmd.Process.Kill() + } + return err +} From 7515a976634eada0251029ec7bdac163dd3741e7 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Wed, 29 Jan 2020 23:21:28 -0500 Subject: [PATCH 05/57] Allow http.Shutdown call for testing This is a minimal proof-of-concept for how to test the exporter while also getting code coverage details. Should also be a bit faster. Bad things: If something ever goes wrong, cleaning up would be much harder because we cannot just kill the external process. So arguably this could be a bad thing, or in the future we bother to split up testing into smoke tests that can run in this manner and larger tests that should be run as an external process. But for now I'm happy with this, I like seeing code coverage reports :-) While it's tempting to think this magically enables parallelism in the test suite, remember we are sharing the global default http server here. It would not be too hard to change that, and also auto-update the port numbers, but it's overkill for now. I'll just run stuff serially until it becomes an issue --- main.go | 40 ++++++++++++++++++++++++++++++++++++---- main_test.go | 21 +++++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/main.go b/main.go index 16f6eec..c0e268d 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "net/http" _ "net/http/pprof" + "sync" "github.com/povilasv/prommod" "github.com/povilasv/systemd_exporter/systemd" @@ -14,6 +15,40 @@ import ( ) func main() { + listenAddress := mainCore() + + log.Infoln("Listening on", listenAddress) + if err := http.ListenAndServe(listenAddress, nil); err != nil { + log.Fatal(err) + } + +} + +func testMain(wg *sync.WaitGroup) *http.Server { + listenAddress := mainCore() + + // Launch server in background + srv := &http.Server{Addr: listenAddress} + log.Infoln("Queuing test server startup") + go func() { + defer wg.Done() + + // ErrServerClosed indicates graceful close + log.Infoln("Test server listening on", listenAddress) + if err := srv.ListenAndServe(); err != http.ErrServerClosed { + // unexpected error. port in use? + log.Fatalf("ListenAndServe(): %v", err) + } + + // Reset http package + http.DefaultServeMux = http.NewServeMux() + log.Infoln("Test server shutdown") + }() + + return srv +} + +func mainCore() string { var ( listenAddress = kingpin.Flag( "web.listen-address", @@ -85,8 +120,5 @@ func main() { } }) - log.Infoln("Listening on", *listenAddress) - if err := http.ListenAndServe(*listenAddress, nil); err != nil { - log.Fatal(err) - } + return *listenAddress } diff --git a/main_test.go b/main_test.go index fd0c935..1799f88 100644 --- a/main_test.go +++ b/main_test.go @@ -1,6 +1,7 @@ package main import ( + "context" "fmt" "net/http" "os" @@ -21,6 +22,26 @@ const ( address = "localhost:9558" ) +func TestWhatTheHack(t *testing.T) { + serverDone := &sync.WaitGroup{} + serverDone.Add(1) + // os.Args = []string{binaryName, "--version"} + os.Args = []string{binaryName, "--web.listen-address=127.0.0.1:9558", "--log.level=debug"} + srv := testMain(serverDone) + + // Running some fancy tests right here :-) + time.Sleep(5 * time.Second) + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + if err := srv.Shutdown(ctx); err != nil { + panic(err) // failure/timeout shutting down the server gracefully + } + + serverDone.Wait() +} + func testFlagVersion(t *testing.T) { if _, err := os.Stat(binaryPath); err != nil { t.Skipf("binary not available, try to run `go build` first: %s", err) From abb9906dfa6d370431b0b992dc05f2871696c82b Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 01:09:44 -0500 Subject: [PATCH 06/57] Cleanup test suite and use mainTest by default Replaces the 'call an external binary' approach with the new 'call a handler function in main.go' approach. Much faster. This commit also shows a few example test cases just to get the ball rolling --- main_test.go | 129 +++++++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 72 deletions(-) diff --git a/main_test.go b/main_test.go index 1799f88..3d56b2a 100644 --- a/main_test.go +++ b/main_test.go @@ -5,7 +5,6 @@ import ( "fmt" "net/http" "os" - "os/exec" "path/filepath" "sync" "testing" @@ -19,67 +18,83 @@ var ( ) const ( - address = "localhost:9558" + address = "127.0.0.1:9558" ) -func TestWhatTheHack(t *testing.T) { - serverDone := &sync.WaitGroup{} - serverDone.Add(1) - // os.Args = []string{binaryName, "--version"} - os.Args = []string{binaryName, "--web.listen-address=127.0.0.1:9558", "--log.level=debug"} - srv := testMain(serverDone) - - // Running some fancy tests right here :-) - time.Sleep(5 * time.Second) - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - - if err := srv.Shutdown(ctx); err != nil { - panic(err) // failure/timeout shutting down the server gracefully - } - - serverDone.Wait() +// TestNoop only exists as an example of how you can test +func TestNoop(t *testing.T) { + noop := func() error { return nil } + runServerAndTest([]string{binaryName}, address, noop) } -func testFlagVersion(t *testing.T) { - if _, err := os.Stat(binaryPath); err != nil { - t.Skipf("binary not available, try to run `go build` first: %s", err) - } - exporter := exec.Command(binaryPath, "--version", address) - if err := exporter.Run(); err != nil { - t.Error(err) - } +// TestVersionFlag is an example of running a test that does not rely on the server being +// online. TODO make a reusable runTest() for this use case +func TestVersionFlag(t *testing.T) { + noop := func() error { return nil } + runServerAndTest([]string{binaryName, "--version"}, address, noop) } -func testMetricsReturns200(t *testing.T) { - if _, err := os.Stat(binaryPath); err != nil { - t.Skipf("binary not available, try to run `go build` first: %s", err) - } - - exporter := exec.Command(binaryPath, "--web.listen-address", address) - test := func(_ int) error { - resp, err := queryExporterMetrics() +func TestMetricEndpointReturnsHttp200(t *testing.T) { + test := func() error { + resp, err := getMetrics() if err != nil { return err } if want, have := http.StatusOK, resp.StatusCode; want != have { return fmt.Errorf("wanted status code %d, received %d", want, have) } - return nil } + runServerAndTest([]string{binaryName}, address, test) +} + +func runServerAndTest(args []string, url string, fn func() error) error { + // Request server startup + serverDone := &sync.WaitGroup{} + serverDone.Add(1) + // TODO it would be cleaner to change main.go to use kingpin.MustParse + os.Args = args + srv := testMain(serverDone) + + // ensure server is online before running test + fmt.Println("Waiting on test server startup...") + for i := 0; i < 10; i++ { + root := fmt.Sprintf("http://%s/", address) + if resp, err := getUrl(root); err == nil && resp.StatusCode == http.StatusOK { + break + } + time.Sleep(10 * time.Millisecond) + if i == 9 { + return fmt.Errorf("can't connect to %s - unable to run any tests", root) + } + } + fmt.Println("Test server ready, running test...") - if err := runCommandAndTests(exporter, address, test); err != nil { - t.Error(err) + // Run the test + err := fn() + + // Shutdown the server before we return + fmt.Println("Test complete, shutting down server...") + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() // TODO is this correct? + + if err := srv.Shutdown(ctx); err != nil { + // TODO is this what we shold do with serverDone? + defer serverDone.Wait() + return fmt.Errorf("failed to start command: %s", err) } + + serverDone.Wait() + fmt.Println("Test server shutdown, testcase complete.") + + return err } -func queryExporterMetrics() (*http.Response, error) { - return queryExporter(fmt.Sprintf("http://%s/metrics", address)) +func getMetrics() (*http.Response, error) { + return getUrl(fmt.Sprintf("http://%s/metrics", address)) } -func queryExporter(url string) (*http.Response, error) { +func getUrl(url string) (*http.Response, error) { resp, err := http.Get(url) if err != nil { return nil, err @@ -96,33 +111,3 @@ func queryExporter(url string) (*http.Response, error) { // } return resp, nil } - -func runCommandAndTests(cmd *exec.Cmd, address string, fn func(pid int) error) error { - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start command: %s", err) - } - - // ensure server is online before running test - time.Sleep(10 * time.Millisecond) - for i := 0; i < 10; i++ { - root := fmt.Sprintf("http://%s/", address) - if resp, err := queryExporter(root); err == nil && resp.StatusCode == http.StatusOK { - break - } - time.Sleep(500 * time.Millisecond) - if cmd.Process == nil || i == 9 { - return fmt.Errorf("can't connect to %s - unable to run any tests", root) - } - } - - errc := make(chan error) - go func(pid int) { - errc <- fn(pid) - }(cmd.Process.Pid) - - err := <-errc - if cmd.Process != nil { - cmd.Process.Kill() - } - return err -} From 5a700f6837651a077aefca626dad87f85430dbac Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 01:22:38 -0500 Subject: [PATCH 07/57] Ensure tests run server at proper address --- main_test.go | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/main_test.go b/main_test.go index 3d56b2a..a3b67aa 100644 --- a/main_test.go +++ b/main_test.go @@ -5,33 +5,32 @@ import ( "fmt" "net/http" "os" - "path/filepath" "sync" "testing" "time" ) var ( - binaryName = "systemd_exporter" - binaryPath = filepath.Join(binaryName) - // os.Getenv("GOPATH"), "bin/node_exporter") + address = "127.0.0.1:9550" + binaryName = "systemd_exporter" + defaultArgs = []string{binaryName, fmt.Sprintf("--web.listen-address=%s", address)} ) -const ( - address = "127.0.0.1:9558" -) +func TestMain(m *testing.M) { + // TODO accept arg for listen address +} // TestNoop only exists as an example of how you can test func TestNoop(t *testing.T) { noop := func() error { return nil } - runServerAndTest([]string{binaryName}, address, noop) + runServerAndTest(defaultArgs, address, noop) } // TestVersionFlag is an example of running a test that does not rely on the server being // online. TODO make a reusable runTest() for this use case func TestVersionFlag(t *testing.T) { noop := func() error { return nil } - runServerAndTest([]string{binaryName, "--version"}, address, noop) + runServerAndTest(append(defaultArgs, "--version"), address, noop) } func TestMetricEndpointReturnsHttp200(t *testing.T) { @@ -45,7 +44,7 @@ func TestMetricEndpointReturnsHttp200(t *testing.T) { } return nil } - runServerAndTest([]string{binaryName}, address, test) + runServerAndTest(defaultArgs, address, test) } func runServerAndTest(args []string, url string, fn func() error) error { From 0ad0613c91fe3dfb15b89dabda1d479fe0d2e573 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 01:43:35 -0500 Subject: [PATCH 08/57] Fix testing bugs - run main & avoid hang on --version --- main_test.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/main_test.go b/main_test.go index a3b67aa..5d36a87 100644 --- a/main_test.go +++ b/main_test.go @@ -18,6 +18,7 @@ var ( func TestMain(m *testing.M) { // TODO accept arg for listen address + os.Exit(m.Run()) } // TestNoop only exists as an example of how you can test @@ -28,10 +29,13 @@ func TestNoop(t *testing.T) { // TestVersionFlag is an example of running a test that does not rely on the server being // online. TODO make a reusable runTest() for this use case -func TestVersionFlag(t *testing.T) { - noop := func() error { return nil } - runServerAndTest(append(defaultArgs, "--version"), address, noop) -} +// TODO this is broken. Because runServerAndTest is waiting for the server to come online, +// but it never does (becaseu our args mean it prints version and exits), we do not exit +// cleanly. Somethign hangs, which means test coverage is never written out. Bummer +// func TestVersionFlag(t *testing.T) { +// noop := func() error { return nil } +// runServerAndTest(append(defaultArgs, "--version"), address, noop) +// } func TestMetricEndpointReturnsHttp200(t *testing.T) { test := func() error { From ee24c67b604ff75d665edbcc095c87e690150423 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Thu, 30 Jan 2020 01:50:10 -0500 Subject: [PATCH 09/57] First circleCI image --- .circleci/config.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..3d07e68 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,9 @@ + version: 2 + jobs: + build: + docker: + - image: circleci/golang:1.13.6-buster + steps: + - checkout + - run: go test + From 324b61fb0d1865f2572748950474d7eade4875cc Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 12:20:28 -0500 Subject: [PATCH 10/57] Testing artifacts and machine builds --- .circleci/config.yml | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3d07e68..ae31d84 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,9 +1,24 @@ - version: 2 - jobs: - build: - docker: - - image: circleci/golang:1.13.6-buster - steps: - - checkout - - run: go test +version: 2 +jobs: + test: + docker: + - image: circleci/golang:1.13.6-buster + steps: + - checkout + - run: go test -coverprofile=coverage.out + - run: systemd --version || true + - store_artifacts: + path: coverage.out + + test_full: + machine: + - image: ubuntu-1604:201903-01 + steps: + - checkout + - run: go test -coverprofile=full_coverage.out + - run: systemd --versioni || true + - store_artifacts: + path: full_coverage.out + + From b1152d14482202c2ac908cc6993a082b040e6ccb Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 12:23:17 -0500 Subject: [PATCH 11/57] Add workflow --- .circleci/config.yml | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ae31d84..ceafc9d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,20 +5,53 @@ jobs: - image: circleci/golang:1.13.6-buster steps: - checkout + - run: go version || true + - run: go env || true - run: go test -coverprofile=coverage.out - run: systemd --version || true - store_artifacts: path: coverage.out + destination: coverage.out test_full: machine: - - image: ubuntu-1604:201903-01 + image: ubuntu-1604:201903-01 steps: - checkout + # The default is 1.10, which is installed into /usr/local/bin/go + - run: + name: Setup golang-backports + command: sudo add-apt-repository ppa:longsleep/golang-backports && sudo apt-get update + - run: + name: Install latest golang + command: sudo apt-get install -y golang-go + - run: + name: Reconfigure golang + command: echo "export GOROOT=/usr/lib/go-1.13" >> ~/.bashrc && echo "export PATH=$GOROOT/bin:$PATH" >> ~/.bashrc + # - run: echo $PATH && which go + # - run: export GOROOT=/usr/lib/go-1.13 && export PATH=$GOROOT/bin:$PATH && echo $PATH && which go + - run: go version || true + - run: systemd --version || true + - run: go env || true + # - run: go get || true + # - run: go build || true - run: go test -coverprofile=full_coverage.out - - run: systemd --versioni || true - store_artifacts: path: full_coverage.out + destination: full_coverage.out + # containerize: + # docker: + # - image: docker:17.05.0-ce-git + # steps: + # - checkout + # - setup_remote_docker + # - run: +workflows: + version: 2 + test: + jobs: + - test + - test_full From 161855d5da850444fadbcccc0bd3d584cee35b4a Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 14:15:51 -0500 Subject: [PATCH 12/57] Stop using container at all. Start using BASH_ENV --- .circleci/config.yml | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ceafc9d..358c266 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,22 +1,12 @@ version: 2 jobs: - test: - docker: - - image: circleci/golang:1.13.6-buster - steps: - - checkout - - run: go version || true - - run: go env || true - - run: go test -coverprofile=coverage.out - - run: systemd --version || true - - store_artifacts: - path: coverage.out - destination: coverage.out - - test_full: + build: machine: image: ubuntu-1604:201903-01 + # environment: + # BASH_ENV: /home/circleci/.bashrc steps: + - run: env | sort && touch $BASH_ENV - checkout # The default is 1.10, which is installed into /usr/local/bin/go - run: @@ -27,7 +17,7 @@ jobs: command: sudo apt-get install -y golang-go - run: name: Reconfigure golang - command: echo "export GOROOT=/usr/lib/go-1.13" >> ~/.bashrc && echo "export PATH=$GOROOT/bin:$PATH" >> ~/.bashrc + command: echo "export GOROOT=/usr/lib/go-1.13" >> $BASH_ENV && echo "export PATH=$GOROOT/bin:$PATH" >> $BASH_ENV # - run: echo $PATH && which go # - run: export GOROOT=/usr/lib/go-1.13 && export PATH=$GOROOT/bin:$PATH && echo $PATH && which go - run: go version || true @@ -49,9 +39,19 @@ jobs: # - setup_remote_docker # - run: -workflows: - version: 2 - test: - jobs: - - test - - test_full +# This is much faster than running a 'machine' (a VM) but it does not have +# systemd +# test-in-container: +# docker: +# - image: circleci/golang:1.13.6-buster +# steps: +# - checkout +# - run: go version || true +# - run: go env || true +# - run: go test -coverprofile=coverage.out +# - run: systemd --version || true +# - store_artifacts: +# path: coverage.out +# destination: coverage.out + + From a70a1e6b26fcf0887e31112fcdcb69cf47588923 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 14:55:10 -0500 Subject: [PATCH 13/57] Stop attempting bashrc hacks --- .circleci/config.yml | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 358c266..1e199b8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,8 +3,8 @@ jobs: build: machine: image: ubuntu-1604:201903-01 - # environment: - # BASH_ENV: /home/circleci/.bashrc + environment: + GOROOT=/usr/lib/go-1.13 steps: - run: env | sort && touch $BASH_ENV - checkout @@ -15,16 +15,10 @@ jobs: - run: name: Install latest golang command: sudo apt-get install -y golang-go - - run: - name: Reconfigure golang - command: echo "export GOROOT=/usr/lib/go-1.13" >> $BASH_ENV && echo "export PATH=$GOROOT/bin:$PATH" >> $BASH_ENV - # - run: echo $PATH && which go - # - run: export GOROOT=/usr/lib/go-1.13 && export PATH=$GOROOT/bin:$PATH && echo $PATH && which go - - run: go version || true - - run: systemd --version || true - - run: go env || true - # - run: go get || true - # - run: go build || true + - run: + run: go version && go env + command: PATH=$GOROOT/bin:$PATH go version && go env + - run: systemd --version - run: go test -coverprofile=full_coverage.out - store_artifacts: path: full_coverage.out From 1f9768e6694b2cc49c6214ac8c0f469739a3c55d Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 15:20:46 -0500 Subject: [PATCH 14/57] Stop trying to use circleCI Their focus on testing-inside-docker means they do not have a good testing-in-vm experience. Only one ubuntu version is supported, and upgrading it on each CI run is both wasteful and a huge pain to get correct (we only want to upgrade golang, but they have a customized version of ubuntu to the typical backported PPA approach does not work out of the box). As a nice side bonus, travis-ci is mostly OSS while circleCI is not, so we can read the source if we have any issues with travis. --- .circleci/config.yml | 51 -------------------------------------------- 1 file changed, 51 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 1e199b8..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,51 +0,0 @@ -version: 2 -jobs: - build: - machine: - image: ubuntu-1604:201903-01 - environment: - GOROOT=/usr/lib/go-1.13 - steps: - - run: env | sort && touch $BASH_ENV - - checkout - # The default is 1.10, which is installed into /usr/local/bin/go - - run: - name: Setup golang-backports - command: sudo add-apt-repository ppa:longsleep/golang-backports && sudo apt-get update - - run: - name: Install latest golang - command: sudo apt-get install -y golang-go - - run: - run: go version && go env - command: PATH=$GOROOT/bin:$PATH go version && go env - - run: systemd --version - - run: go test -coverprofile=full_coverage.out - - store_artifacts: - path: full_coverage.out - destination: full_coverage.out - - - # containerize: - # docker: - # - image: docker:17.05.0-ce-git - # steps: - # - checkout - # - setup_remote_docker - # - run: - -# This is much faster than running a 'machine' (a VM) but it does not have -# systemd -# test-in-container: -# docker: -# - image: circleci/golang:1.13.6-buster -# steps: -# - checkout -# - run: go version || true -# - run: go env || true -# - run: go test -coverprofile=coverage.out -# - run: systemd --version || true -# - store_artifacts: -# path: coverage.out -# destination: coverage.out - - From 8f9e07552df27bdcbbf11287e5686cf04a3a6632 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 15:31:44 -0500 Subject: [PATCH 15/57] Initial travis CI file --- .travis.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.travis.yml b/.travis.yml index 1ca29c5..ebb09b0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,18 @@ +dist: xenial language: go go: - "1.x" + +jobs: + include: + - name: testing + language: bash + script: + - ls + - name: t2 + language: go + os: linux + dist: trusty + + From 26df7d22d5b149f802f6dc40ca2cb2eb2179b7e8 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 15:36:35 -0500 Subject: [PATCH 16/57] Workaround bug in golangci-lint See https://github.com/golangci/golangci-lint/issues/658 for details Summary - golangci-lint, built with Go 1.12, will generate this error when linting Go 1.13 code --- .golangci.yml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .golangci.yml diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..686f7cc --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,5 @@ +issues: + exclude: + - "not declared by package utf8" + - "unicode/utf8/utf8.go" + From 7c9b7c8bf2f84cd0de53e9a65dc4bb15982f1bd7 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 16:01:25 -0500 Subject: [PATCH 17/57] Fix bug found by ci linting :-) --- main_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/main_test.go b/main_test.go index 5d36a87..45f0320 100644 --- a/main_test.go +++ b/main_test.go @@ -24,7 +24,9 @@ func TestMain(m *testing.M) { // TestNoop only exists as an example of how you can test func TestNoop(t *testing.T) { noop := func() error { return nil } - runServerAndTest(defaultArgs, address, noop) + if err := runServerAndTest(defaultArgs, address, noop); err != nil { + t.Errorf("No op failed") + } } // TestVersionFlag is an example of running a test that does not rely on the server being @@ -48,7 +50,9 @@ func TestMetricEndpointReturnsHttp200(t *testing.T) { } return nil } - runServerAndTest(defaultArgs, address, test) + if err := runServerAndTest(defaultArgs, address, test); err != nil { + t.Errorf("Metric 200 failed") + } } func runServerAndTest(args []string, url string, fn func() error) error { From cd0bfae3c7e849ebe1f17a4a1909d05464f6e4ee Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 16:09:55 -0500 Subject: [PATCH 18/57] Added codecov.io --- .travis.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index ebb09b0..87c0317 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,8 @@ jobs: - name: t2 language: go os: linux - dist: trusty - + dist: xenial + after_success: + - bash <(curl -s https://codecov.io/bash) + From 10a20d3c66486406cab73f1e25942968877b2918 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 17:00:43 -0500 Subject: [PATCH 19/57] Define CI matrix directly (no job include) --- .travis.yml | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 87c0317..1985e3e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,13 @@ -dist: xenial -language: go +language: go go: - "1.x" -jobs: - include: - - name: testing - language: bash - script: - - ls - - name: t2 - language: go - os: linux - dist: xenial - after_success: - - bash <(curl -s https://codecov.io/bash) +before_script: systemd --version +os: linux +dist: xenial + +after_success: + - bash <(curl -s https://codecov.io/bash) From 4804ae2782060d688608ccdfea830f565f2c3d28 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 17:18:54 -0500 Subject: [PATCH 20/57] Update test flags to work with codecov --- .travis.yml | 7 +++++++ Makefile | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1985e3e..a0e1bf4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,6 @@ +# This defines the script for us automatically. By default it installs +# to requested go version then runs make language: go go: - "1.x" @@ -7,6 +9,11 @@ before_script: systemd --version os: linux dist: xenial +go: + - 1.x + +before_script: systemd --version + after_success: - bash <(curl -s https://codecov.io/bash) diff --git a/Makefile b/Makefile index 5c07f3a..b51d686 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BRANCH := $(shell git branch | grep \* | cut -d ' ' -f2) LINT_FLAGS := run --deadline=120s LINTER := ./bin/golangci-lint -TESTFLAGS := -v -cover +TESTFLAGS := -v -cover -race -coverprofile=coverage.txt -covermode=atomic GO111MODULE := on all: $(LINTER) deps test lint build From 13ca4f388c005c626bed816f87e84553f8b11512 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 18:07:57 -0500 Subject: [PATCH 21/57] Including integration tests into coverage counts --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b51d686..3e85494 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BRANCH := $(shell git branch | grep \* | cut -d ' ' -f2) LINT_FLAGS := run --deadline=120s LINTER := ./bin/golangci-lint -TESTFLAGS := -v -cover -race -coverprofile=coverage.txt -covermode=atomic +TESTFLAGS := -v -cover -race -coverpkg=github.com/povilasv/systemd_exporter,github.com/povilasv/systemd_exporter/systemd -coverprofile=coverage.txt -covermode=atomic GO111MODULE := on all: $(LINTER) deps test lint build From 4c9259f75a283cf9c94c5a41cd839747db8cc991 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 18:19:24 -0500 Subject: [PATCH 22/57] Listing units in travis-ci --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a0e1bf4..5bfccd1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ dist: xenial go: - 1.x -before_script: systemd --version +before_script: systemd --version && systemctl list-units after_success: - bash <(curl -s https://codecov.io/bash) From 3bb0ed0c40da17ce5e38c592021f7b352f110165 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 18:34:32 -0500 Subject: [PATCH 23/57] Log args as debug to aid in testing --- main.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.go b/main.go index c0e268d..190d872 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "net/http" _ "net/http/pprof" + "os" "sync" "github.com/povilasv/prommod" @@ -72,6 +73,7 @@ func mainCore() string { kingpin.Version(prommod.Print(version.Print("systemd_exporter"))) kingpin.HelpFlag.Short('h') kingpin.Parse() + log.Debugf("Parsed '%s'", os.Args) log.Infoln("Starting systemd_exporter", version.Info()) log.Infoln("Build context", version.BuildContext()) From 40601368b3cd86af670b4ec901e12fc6c8345742 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 18:50:50 -0500 Subject: [PATCH 24/57] Testing --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3e85494..eb06f75 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,8 @@ deps: .PHONY: test test: - go test $(TESTFLAGS) ./... + go get github.com/ory/go-acc + go-acc ./... .PHONY: build build: deps From d0e740e33b16e5cc9eb81c0b3cc4c9627288fcf1 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 19:12:02 -0500 Subject: [PATCH 25/57] New go-acc version to work with go.mod --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eb06f75..479d4f4 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ deps: .PHONY: test test: - go get github.com/ory/go-acc + go get github.com/stristr/go-acc go-acc ./... .PHONY: build From 44c22fa1f1f6e532665fbf390ab8760af5773f0b Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 21:58:52 -0500 Subject: [PATCH 26/57] Debugging - add go list --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 479d4f4..cfb33e5 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ deps: .PHONY: test test: go get github.com/stristr/go-acc + go list go-acc ./... .PHONY: build From 17652d4c73b7f7a9e1d6fe08863746f2a9ee135a Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 22:26:15 -0500 Subject: [PATCH 27/57] Document todo regarding integration test coverage --- .travis.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.travis.yml b/.travis.yml index 5bfccd1..734ad2a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,13 @@ +# TODO Use CodeCov 'Flags' to separate coverage for integration tests and unit +# tests. Near 100% on integration is expected - these tests cast a wide net to +# catch many issues (but do not easily tell you where the issue is). Near 100% +# on unit tests is a feat of heroism - these tests identify a specific code +# chunk with an issue. We mainly care about 100% on unit tests, but 100% on +# integration is an easy win and a nice to have +# +# See https://docs.codecov.io/docs/flags +# Use go get github.com/stristr/go-acc && go-acc ./... +# Or use coverpkg=github.com/povilasv/systemd_exporter,github.com/povilasv/systemd_exporter/systemd # This defines the script for us automatically. By default it installs # to requested go version then runs make From 62fe927ec2d9169059773edfbf58e80c12eb5c24 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 22:31:59 -0500 Subject: [PATCH 28/57] Enabling cgroup accounting on Travis-CI --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index cfb33e5..2c2d4d8 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,11 @@ deps: .PHONY: test test: +ifdef TRAVIS + sudo ls + sudo systemctl set-property cron.service MemoryAccounting=yes + systemctl set-property cron.service CPUAccounting=yes +endif go get github.com/stristr/go-acc go list go-acc ./... From 37f6d6812003b7761bd4e6ca5ff0f0b831359b36 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 22:35:41 -0500 Subject: [PATCH 29/57] Fixing typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2c2d4d8..eaff20d 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ test: ifdef TRAVIS sudo ls sudo systemctl set-property cron.service MemoryAccounting=yes - systemctl set-property cron.service CPUAccounting=yes + sudo systemctl set-property cron.service CPUAccounting=yes endif go get github.com/stristr/go-acc go list From 1a6ae16df1ff9ab04f114b805fa6fe4e9595426e Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 22:35:55 -0500 Subject: [PATCH 30/57] Documenting requirements for CPUAccounting --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d52ad1..b3dfa5a 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Note that a number of unit types are filtered by default | ----------------------------------------- | ----------- | -------- | ------------------------------------------------------------------ | | systemd_exporter_build_info | Gauge | UNSTABLE | 1 per systemd-exporter | | systemd_unit_info | Gauge | UNSTABLE | 1 per service + 1 per mount | -| systemd_unit_cpu_seconds_total | Gauge | UNSTABLE | 2 per mount/scope/slice/socket/swap {mode="system/user"} | +| systemd_unit_cpu_seconds_total | Gauge | UNSTABLE | 12 per mount/scope/slice/socket/swap {mode="system/user"} | | systemd_unit_state | Gauge | UNSTABLE | 5 per unit {state="activating/active/deactivating/failed/inactive} | | systemd_unit_tasks_current | Gauge | UNSTABLE | 1 per service | | systemd_unit_tasks_max | Gauge | UNSTABLE | 1 per service | @@ -95,3 +95,5 @@ Note that a number of unit types are filtered by default | systemd_process_open_fds | Gauge | UNSTABLE | 1 per service | | systemd_process_max_fds | Gauge | UNSTABLE | 1 per service | | systemd_process_cpu_seconds_total | Counter | UNSTABLE | 1 per service | + +1Only present for units which have systemd `CPUAccounting` enabled From 6499bbc365a58719c8e59a8595d7e01d40948d32 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 22:47:26 -0500 Subject: [PATCH 31/57] On Travis-CI, enable CPUAccounting by default --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index eaff20d..48efb3b 100644 --- a/Makefile +++ b/Makefile @@ -24,9 +24,8 @@ deps: .PHONY: test test: ifdef TRAVIS - sudo ls - sudo systemctl set-property cron.service MemoryAccounting=yes - sudo systemctl set-property cron.service CPUAccounting=yes + sudo sh -c 'echo DefaultCPUAccounting=yes >> /etc/systemd/system.conf' + sudo systemctl daemon-reload endif go get github.com/stristr/go-acc go list From 83c718e7d63de36e31791bc88f592d30d3d4c9fb Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 22:57:57 -0500 Subject: [PATCH 32/57] Testing multiple OS builds --- .travis.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 734ad2a..9011808 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,6 @@ go: before_script: systemd --version os: linux -dist: xenial go: - 1.x @@ -27,4 +26,11 @@ before_script: systemd --version && systemctl list-units after_success: - bash <(curl -s https://codecov.io/bash) +jobs: + include: + - dist: xenial + name: xenial-229 + - dist: bionic + name: bionic-237 + From 054967521318778f942f07bf866d782defc59bc6 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Fri, 31 Jan 2020 23:27:10 -0500 Subject: [PATCH 33/57] Adding fixtures for testing --- systemd/fixtures/README.md | 11 +++++++++++ systemd/fixtures/cgroup-hybrid/cpu | 1 + .../cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all | 5 +++++ 3 files changed, 17 insertions(+) create mode 100644 systemd/fixtures/README.md create mode 120000 systemd/fixtures/cgroup-hybrid/cpu create mode 100644 systemd/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all diff --git a/systemd/fixtures/README.md b/systemd/fixtures/README.md new file mode 100644 index 0000000..6f071ae --- /dev/null +++ b/systemd/fixtures/README.md @@ -0,0 +1,11 @@ +Contains fixed state used as a baseline for running tests. The purpose of these test fixtures +is to ensure that there is a well known and fixed environment in which tests are run so that +results are repeatable + +Note: including symlinks into fixtures is important for testing. However this can break +community toolchains and OS'es in unexpected ways. prometheus/procfs addressed this +issue by using ttar to flatten their fixtures directory into a single standard file, and +only folks who are running testing will unflatten this file. This prevents symlinks from +appearing on disk for anyone only doing a git checkout. May be something to consider if +we get problem reports. See https://github.com/prometheus/procfs/pull/79 + diff --git a/systemd/fixtures/cgroup-hybrid/cpu b/systemd/fixtures/cgroup-hybrid/cpu new file mode 120000 index 0000000..c5a8e01 --- /dev/null +++ b/systemd/fixtures/cgroup-hybrid/cpu @@ -0,0 +1 @@ +cpu,cpuacct \ No newline at end of file diff --git a/systemd/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all b/systemd/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all new file mode 100644 index 0000000..609c1f0 --- /dev/null +++ b/systemd/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all @@ -0,0 +1,5 @@ +cpu user system +0 7746241803817 122204678803 +1 7385109326139 107275346559 +2 7307001772824 94093225654 +3 7093088113588 295613450937 From 9806a993d0dcbcb4b0b048a8933b0504ef179db6 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 17:04:12 -0500 Subject: [PATCH 34/57] Rewrite cgroups.go to allow unit testing Follows conventions laid out by prometheus/procfs package. Note that this is *not* a clean API right now, this commit is intentionally only a minimal step towards reworking the API to look more like the procfs API. In later commits we will refactor more, but for now I wanted a single commit that shows the minimal changes to keep everything working but also enable some unit testing --- systemd/cgroups.go | 96 +++++++++++++++++++++++++++++++---------- systemd/cgroups_test.go | 74 +++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 23 deletions(-) create mode 100644 systemd/cgroups_test.go diff --git a/systemd/cgroups.go b/systemd/cgroups.go index 05399e0..1bc95b9 100644 --- a/systemd/cgroups.go +++ b/systemd/cgroups.go @@ -3,6 +3,7 @@ package systemd import ( "bufio" "bytes" + "fmt" "io" "io/ioutil" "os" @@ -15,6 +16,51 @@ import ( "golang.org/x/sys/unix" ) +// FS is the pseudo-filesystem cgroupfs, which provides an interface to +// kernel data structures +type FS struct { + mountPoint string + + // WARNING: We only read this data once at process start, systemd updates + // may require restarting systemd-exporter + cgroupUnified cgUnifiedMountMode +} + +// DefaultMountPoint is the common mount point of the cgroupfs filesystem +const DefaultMountPoint = "/sys/fs/cgroup" + +// NewDefaultFS returns a new cgroup FS mounted under the default mountPoint. +// It will error if cgroup hierarchies are not laid out in a manner understood +// by systemd. +func NewDefaultFS() (FS, error) { + + mode, err := cgUnifiedCached() + if err != nil || mode == unifModeUnknown { + return FS{}, fmt.Errorf("could not determine cgroupfs mount mode: %s", err) + } + + return newFS(DefaultMountPoint, mode) +} + +// newFS returns a new cgroup FS mounted under the given mountPoint. It does not check +// the provided mount mode +func newFS(mountPoint string, mountMode cgUnifiedMountMode) (FS, error) { + info, err := os.Stat(mountPoint) + if err != nil { + return FS{}, fmt.Errorf("could not read %s: %s", mountPoint, err) + } + if !info.IsDir() { + return FS{}, fmt.Errorf("mount point %s is not a directory", mountPoint) + } + return FS{mountPoint, mountMode}, nil +} + +// path appends the given path elements to the filesystem path, adding separators +// as necessary. +func (fs FS) path(p ...string) string { + return filepath.Join(append([]string{string(fs.mountPoint)}, p...)...) +} + // cgUnifiedMountMode constant values describe how cgroup filesystems (aka hierarchies) are // mounted underneath /sys/fs/cgroup. In cgroups-v1 there are many mounts, // one per controller (cpu, blkio, etc) and one for systemd itself. In @@ -45,10 +91,6 @@ const ( unifModeAll cgUnifiedMountMode = iota ) -// WARNING: We only read this data once at process start, systemd updates -// may require restarting systemd-exporter -var cgroupUnified cgUnifiedMountMode = unifModeUnknown - // Values copied from https://github.com/torvalds/linux/blob/master/include/uapi/linux/magic.h const ( tmpFsMagic = 0x01021994 @@ -64,10 +106,11 @@ const ( // to track this // WARNING: We cache this data once at process start. Systemd updates // may require restarting systemd-exporter +// Equivalent to systemd cgUnifiedCached method func cgUnifiedCached() (cgUnifiedMountMode, error) { - if cgroupUnified != unifModeUnknown { - return cgroupUnified, nil - } + // if cgroupUnified != unifModeUnknown { + // return cgroupUnified, nil + // } var fs unix.Statfs_t err := unix.Statfs("/sys/fs/cgroup/", &fs) @@ -78,14 +121,14 @@ func cgUnifiedCached() (cgUnifiedMountMode, error) { switch fs.Type { case cgroup2SuperMagic: log.Debugf("Found cgroup2 on /sys/fs/cgroup, full unified hierarchy") - cgroupUnified = unifModeAll + return unifModeAll, nil case tmpFsMagic: err := unix.Statfs("/sys/fs/cgroup/unified", &fs) // Ignore err, we expect path to be missing on v232 if err == nil && fs.Type == cgroup2SuperMagic { log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller") - cgroupUnified = unifModeSystemd + return unifModeSystemd, nil } else { err := unix.Statfs("/sys/fs/cgroup/systemd", &fs) if err != nil { @@ -94,10 +137,10 @@ func cgUnifiedCached() (cgUnifiedMountMode, error) { switch fs.Type { case cgroup2SuperMagic: log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") - cgroupUnified = unifModeSystemd + return unifModeSystemd, nil case cgroupSuperMagic: log.Debugf("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") - cgroupUnified = unifModeNone + return unifModeNone, nil default: return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) } @@ -105,20 +148,17 @@ func cgUnifiedCached() (cgUnifiedMountMode, error) { default: return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) } - - return cgroupUnified, nil } // cgGetPath returns the absolute path for a specific file in a specific controller // in the specific cgroup denoted by the passed subpath. -// Input examples: ("cpu", "/system.slice", "cpuacct.usage_all) -func cgGetPath(controller string, subpath string, suffix string) (string, error) { +// Input examples: ("cpu", "/system.slice", "cpuacct.usage_all") +func (fs FS) cgGetPath(controller string, subpath string, suffix string) (string, error) { // relevant systemd source code in cgroup-util.[h|c] specifically cg_get_path // 2. Joins controller name with base path - unified, err := cgUnifiedCached() - if err != nil { - return "", errors.Wrapf(err, "failed to determine cgroup mounting hierarchy") + if fs.cgroupUnified == unifModeUnknown { + return "", errors.Errorf("Cannot determine path with unknown mounting hierarchy") } // TODO Ensure controller name is valid @@ -126,13 +166,13 @@ func cgGetPath(controller string, subpath string, suffix string) (string, error) dn := controller joined := "" - switch unified { + switch fs.cgroupUnified { case unifModeNone, unifModeSystemd: - joined = filepath.Join("/sys/fs/cgroup", dn, subpath, suffix) + joined = fs.path(dn, subpath, suffix) case unifModeAll: - joined = filepath.Join("/sys/fs/cgroup", subpath, suffix) + joined = fs.path(subpath, suffix) default: - return "", errors.Errorf("unknown cgroup mount mode (e.g. unified mode) %d", unified) + return "", errors.Errorf("unknown cgroup mount mode (e.g. unified mode) %d", fs.cgroupUnified) } return joined, nil } @@ -204,9 +244,19 @@ func ReadFileNoStat(filename string) ([]byte, error) { // NewCPUAcct will locate and read the kernel's cpu accounting info for // the provided systemd cgroup subpath. func NewCPUAcct(cgSubpath string) (*CPUAcct, error) { + fs, err := NewDefaultFS() + if err != nil { + return nil, err + } + return fs.NewCPUAcct(cgSubpath) +} + +// NewCPUAcct will locate and read the kernel's cpu accounting info for +// the provided systemd cgroup subpath. +func (fs FS) NewCPUAcct(cgSubpath string) (*CPUAcct, error) { var cpuUsage CPUAcct - cgPath, err := cgGetPath("cpu", cgSubpath, "cpuacct.usage_all") + cgPath, err := fs.cgGetPath("cpu", cgSubpath, "cpuacct.usage_all") if err != nil { return nil, errors.Wrapf(err, "unable to get cpu controller path") } diff --git a/systemd/cgroups_test.go b/systemd/cgroups_test.go new file mode 100644 index 0000000..8ef28ad --- /dev/null +++ b/systemd/cgroups_test.go @@ -0,0 +1,74 @@ +package systemd + +import "testing" + +const ( + testFixturesHybrid = "fixtures/cgroup-hybrid" +) + +func TestNewFS(t *testing.T) { + if _, err := newFS("foobar", unifModeUnknown); err == nil { + t.Error("want newFS to fail for non-existing path") + } + + if _, err := newFS("cgroups_test.go", unifModeUnknown); err == nil { + t.Error("want newFS to fail if mount point is not a dir") + } + + if _, err := newFS(testFixturesHybrid, unifModeUnknown); err != nil { + t.Error("want newFS to succeed if mount point exists") + } +} + +func getHybridFixtures(t *testing.T) FS { + fs, err := newFS(testFixturesHybrid, unifModeSystemd) + if err != nil { + t.Fatal("Unable to create hybrid text fixtures") + } + return fs +} + +func TestCgSubpath(t *testing.T) { + fs := getHybridFixtures(t) + + fs.cgroupUnified = unifModeUnknown + if _,err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all"); err == nil { + t.Error("should not be able to determine path with unknown mount mode") + } + fs.cgroupUnified = unifModeSystemd + path, err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all") + if err != nil { + t.Error("should be able to determine path with systemd mount mode") + } + want := testFixturesHybrid + "/cpu/system.slice/cpuacct.usage_all" + if path != want { + t.Errorf("bad response. Wanted %s, got %s", want, path) + } +} + +func TestRootCPUAcct(t *testing.T) { + fs := getHybridFixtures(t) + cpu, err := fs.NewCPUAcct("/") + if err != nil { + t.Error("want NewCPUAcct('/') to succeed") + } + + if len(cpu.CPUs) != 4 { + t.Errorf("Wrong number of CPUs. Wanted %d got %d", 4, len(cpu.CPUs)) + } + + var expectedUser uint64 = 29531441016368 + if cpu.UsageUserNanosecs() != expectedUser { + t.Errorf("Wrong user nanoseconds. Wanted %d got %d", expectedUser, cpu.UsageUserNanosecs()) + } + + var expectedSys uint64 = 619186701953 + if cpu.UsageSystemNanosecs() != expectedSys { + t.Errorf("Wrong sys nanoseconds. Wanted %d got %d",expectedSys, cpu.UsageSystemNanosecs()) + } + + expectedTotal := expectedSys + expectedUser + if cpu.UsageAllNanosecs() != expectedTotal { + t.Errorf("Wrong total nanoseconds. Wanted %d got %d",expectedTotal, cpu.UsageAllNanosecs()) + } +} From 1a0c29cfb5b9cc37c3a75d7784d5fb8e00721305 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 17:04:35 -0500 Subject: [PATCH 35/57] Ignore jetbrains idea files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0617875..68e3257 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /systemd_exporter /bin/golangci-lint +.idea From 7abd3fe9eae145c29c7b5d1c969a7891159fe36b Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 17:06:44 -0500 Subject: [PATCH 36/57] Stop including integration tests in coverage count --- .gitignore | 1 + Makefile | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 68e3257..1714718 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /systemd_exporter /bin/golangci-lint .idea +coverage.txt diff --git a/Makefile b/Makefile index 48efb3b..c753ec2 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BRANCH := $(shell git branch | grep \* | cut -d ' ' -f2) LINT_FLAGS := run --deadline=120s LINTER := ./bin/golangci-lint -TESTFLAGS := -v -cover -race -coverpkg=github.com/povilasv/systemd_exporter,github.com/povilasv/systemd_exporter/systemd -coverprofile=coverage.txt -covermode=atomic +TEST_FLAGS := -v -cover -race -coverprofile=coverage.txt -covermode=atomic GO111MODULE := on all: $(LINTER) deps test lint build @@ -27,9 +27,7 @@ ifdef TRAVIS sudo sh -c 'echo DefaultCPUAccounting=yes >> /etc/systemd/system.conf' sudo systemctl daemon-reload endif - go get github.com/stristr/go-acc - go list - go-acc ./... + go test $(TEST_FLAGS) ./... .PHONY: build build: deps From 9191e1711e26d8b953f9179325d4c8c009d41f65 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 17:17:56 -0500 Subject: [PATCH 37/57] Move cgroup work to dedicated package No functional changes --- systemd/cgroups.go => cgroup/cgroup.go | 2 +- systemd/cgroups_test.go => cgroup/cgroup_test.go | 2 +- {systemd => cgroup}/fixtures/README.md | 0 {systemd => cgroup}/fixtures/cgroup-hybrid/cpu | 0 .../fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all | 0 systemd/systemd.go | 3 ++- systemd/systemd_test.go | 1 + 7 files changed, 5 insertions(+), 3 deletions(-) rename systemd/cgroups.go => cgroup/cgroup.go (99%) rename systemd/cgroups_test.go => cgroup/cgroup_test.go (99%) rename {systemd => cgroup}/fixtures/README.md (100%) rename {systemd => cgroup}/fixtures/cgroup-hybrid/cpu (100%) rename {systemd => cgroup}/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all (100%) create mode 100644 systemd/systemd_test.go diff --git a/systemd/cgroups.go b/cgroup/cgroup.go similarity index 99% rename from systemd/cgroups.go rename to cgroup/cgroup.go index 1bc95b9..f034bd4 100644 --- a/systemd/cgroups.go +++ b/cgroup/cgroup.go @@ -1,4 +1,4 @@ -package systemd +package cgroup import ( "bufio" diff --git a/systemd/cgroups_test.go b/cgroup/cgroup_test.go similarity index 99% rename from systemd/cgroups_test.go rename to cgroup/cgroup_test.go index 8ef28ad..6699e45 100644 --- a/systemd/cgroups_test.go +++ b/cgroup/cgroup_test.go @@ -1,4 +1,4 @@ -package systemd +package cgroup import "testing" diff --git a/systemd/fixtures/README.md b/cgroup/fixtures/README.md similarity index 100% rename from systemd/fixtures/README.md rename to cgroup/fixtures/README.md diff --git a/systemd/fixtures/cgroup-hybrid/cpu b/cgroup/fixtures/cgroup-hybrid/cpu similarity index 100% rename from systemd/fixtures/cgroup-hybrid/cpu rename to cgroup/fixtures/cgroup-hybrid/cpu diff --git a/systemd/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all b/cgroup/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all similarity index 100% rename from systemd/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all rename to cgroup/fixtures/cgroup-hybrid/cpu,cpuacct/cpuacct.usage_all diff --git a/systemd/systemd.go b/systemd/systemd.go index 87af595..7cc5cd7 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -12,6 +12,7 @@ import ( "github.com/coreos/go-systemd/dbus" "github.com/pkg/errors" + "github.com/povilasv/systemd_exporter/cgroup" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" "github.com/prometheus/procfs" @@ -559,7 +560,7 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, return nil } - cpuUsage, err := NewCPUAcct(cgSubpath) + cpuUsage, err := cgroup.NewCPUAcct(cgSubpath) if err != nil { if unitType == "Socket" { log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go new file mode 100644 index 0000000..728b123 --- /dev/null +++ b/systemd/systemd_test.go @@ -0,0 +1 @@ +package systemd From 64eaf3e0dee912079ee11c8357af5cf13c2b00b3 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 17:18:06 -0500 Subject: [PATCH 38/57] Add first systemd.go test unit --- systemd/systemd_test.go | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go index 728b123..3f4f3ee 100644 --- a/systemd/systemd_test.go +++ b/systemd/systemd_test.go @@ -1 +1,26 @@ package systemd + +import ( + "github.com/coreos/go-systemd/dbus" + "testing" +) + +func TestParseUnitType(t *testing.T) { + x := dbus.UnitStatus{ + Name: "test.service", + Description: "", + LoadState: "", + ActiveState: "", + SubState: "", + Followed: "", + Path: "", + JobId: 0, + JobType: "", + JobPath: "", + } + found := parseUnitType(x) + if found != "service" { + t.Errorf("Bad unit name parsing. Wanted %s got %s", "service", found) + } + +} From 8aba2bab8fbb0fabfa99ab11257995b01b2cdab3 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 18:13:03 -0500 Subject: [PATCH 39/57] Pull cpuacct controller logic into separate file Goal is to organize cgroup.go so that it does not matter what controller you are speaking to. This is the start of that --- cgroup/cgroup.go | 144 +------------------------------------------- cgroup/cpuacct.go | 148 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 142 deletions(-) create mode 100644 cgroup/cpuacct.go diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go index f034bd4..da2551c 100644 --- a/cgroup/cgroup.go +++ b/cgroup/cgroup.go @@ -1,19 +1,12 @@ package cgroup import ( - "bufio" - "bytes" "fmt" - "io" - "io/ioutil" - "os" - "path/filepath" - "strconv" - "strings" - "github.com/pkg/errors" "github.com/prometheus/common/log" "golang.org/x/sys/unix" + "os" + "path/filepath" ) // FS is the pseudo-filesystem cgroupfs, which provides an interface to @@ -177,137 +170,4 @@ func (fs FS) cgGetPath(controller string, subpath string, suffix string) (string return joined, nil } -// CPUUsage stores one core's worth of CPU usage for a control group -// (aka cgroup) of tasks (e.g. both processes and threads). -// Equivalent to cpuacct.usage_percpu_user and cpuacct.usage_percpu_system -type CPUUsage struct { - CPUId uint32 - SystemNanosec uint64 - UserNanosec uint64 -} - -// CPUAcct stores CPU accounting information (e.g. cpu usage) for a control -// group (cgroup) of tasks. Equivalent to cpuacct.usage_all -type CPUAcct struct { - CPUs []CPUUsage -} - -// UsageUserNanosecs returns user (e.g. non-kernel) cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageUserNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.UserNanosec - } - return nanoseconds -} - -// UsageSystemNanosecs returns system (e.g. kernel) cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageSystemNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.SystemNanosec - } - return nanoseconds -} - -// UsageAllNanosecs returns total cpu consumption in nanoseconds, across all available cpu -// cores, from the point that CPU accounting was enabled for this control group. -func (c *CPUAcct) UsageAllNanosecs() uint64 { - var nanoseconds uint64 - for _, cpu := range c.CPUs { - nanoseconds += cpu.SystemNanosec + cpu.UserNanosec - } - return nanoseconds -} - -// ReadFileNoStat uses ioutil.ReadAll to read contents of entire file. -// This is similar to ioutil.ReadFile but without the call to os.Stat, because -// many files in /proc and /sys report incorrect file sizes (either 0 or 4096). -// Reads a max file size of 512kB. For files larger than this, a scanner -// should be used. -// COPIED FROM prometheus/procfs WHICH ALSO USES APACHE 2.0 -func ReadFileNoStat(filename string) ([]byte, error) { - const maxBufferSize = 1024 * 512 - - f, err := os.Open(filename) - if err != nil { - return nil, err - } - defer f.Close() - - reader := io.LimitReader(f, maxBufferSize) - return ioutil.ReadAll(reader) -} - -// NewCPUAcct will locate and read the kernel's cpu accounting info for -// the provided systemd cgroup subpath. -func NewCPUAcct(cgSubpath string) (*CPUAcct, error) { - fs, err := NewDefaultFS() - if err != nil { - return nil, err - } - return fs.NewCPUAcct(cgSubpath) -} - -// NewCPUAcct will locate and read the kernel's cpu accounting info for -// the provided systemd cgroup subpath. -func (fs FS) NewCPUAcct(cgSubpath string) (*CPUAcct, error) { - var cpuUsage CPUAcct - - cgPath, err := fs.cgGetPath("cpu", cgSubpath, "cpuacct.usage_all") - if err != nil { - return nil, errors.Wrapf(err, "unable to get cpu controller path") - } - - // Example cpuacct.usage_all - // cpu user system - // 0 21165924 0 - // 1 13334251 0 - b, err := ReadFileNoStat(cgPath) - if err != nil { - return nil, errors.Wrapf(err, "unable to read file %s", cgPath) - } - scanner := bufio.NewScanner(bytes.NewReader(b)) - if ok := scanner.Scan(); !ok { - return nil, errors.Errorf("unable to scan file %s", cgPath) - } - if err := scanner.Err(); err != nil { - return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) - } - for scanner.Scan() { - if err := scanner.Err(); err != nil { - return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) - } - text := scanner.Text() - vals := strings.Split(text, " ") - if len(vals) != 3 { - return nil, errors.Errorf("unable to parse contents of file %s", cgPath) - } - cpu, err := strconv.ParseUint(vals[0], 10, 32) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as uint32 (from %s)", vals[0], cgPath) - } - user, err := strconv.ParseUint(vals[1], 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as uint64 (from %s)", vals[1], cgPath) - } - sys, err := strconv.ParseUint(vals[2], 10, 64) - if err != nil { - return nil, errors.Wrapf(err, "unable to parse %s as an in (from %s)", vals[2], cgPath) - } - onecpu := CPUUsage{ - CPUId: uint32(cpu), - UserNanosec: user, - SystemNanosec: sys, - } - cpuUsage.CPUs = append(cpuUsage.CPUs, onecpu) - } - if len(cpuUsage.CPUs) < 1 { - return nil, errors.Errorf("no CPU/core info extracted from %s", cgPath) - } - - return &cpuUsage, nil -} diff --git a/cgroup/cpuacct.go b/cgroup/cpuacct.go new file mode 100644 index 0000000..3786d3b --- /dev/null +++ b/cgroup/cpuacct.go @@ -0,0 +1,148 @@ +package cgroup + +import ( + "bufio" + "bytes" + "github.com/pkg/errors" + "io" + "io/ioutil" + "os" + "strconv" + "strings" +) + +// CPUUsage stores one core's worth of CPU usage for a control group +// (aka cgroup) of tasks (e.g. both processes and threads). +// Equivalent to cpuacct.usage_percpu_user and cpuacct.usage_percpu_system +type CPUUsage struct { + CPUId uint32 + SystemNanosec uint64 + UserNanosec uint64 +} + +// CPUAcct stores CPU accounting information (e.g. cpu usage) for a control +// group (cgroup) of tasks. Equivalent to cpuacct.usage_all +type CPUAcct struct { + CPUs []CPUUsage +} + +// NewCPUAcct will locate and read the kernel's cpu accounting info for +// the provided systemd cgroup subpath. +func NewCPUAcct(cgSubpath string) (*CPUAcct, error) { + fs, err := NewDefaultFS() + if err != nil { + return nil, err + } + return fs.NewCPUAcct(cgSubpath) +} + +// UsageUserNanosecs returns user (e.g. non-kernel) cpu consumption in nanoseconds, across all available cpu +// cores, from the point that CPU accounting was enabled for this control group. +func (c *CPUAcct) UsageUserNanosecs() uint64 { + var nanoseconds uint64 + for _, cpu := range c.CPUs { + nanoseconds += cpu.UserNanosec + } + return nanoseconds +} + +// UsageSystemNanosecs returns system (e.g. kernel) cpu consumption in nanoseconds, across all available cpu +// cores, from the point that CPU accounting was enabled for this control group. +func (c *CPUAcct) UsageSystemNanosecs() uint64 { + var nanoseconds uint64 + for _, cpu := range c.CPUs { + nanoseconds += cpu.SystemNanosec + } + return nanoseconds +} + +// UsageAllNanosecs returns total cpu consumption in nanoseconds, across all available cpu +// cores, from the point that CPU accounting was enabled for this control group. +func (c *CPUAcct) UsageAllNanosecs() uint64 { + var nanoseconds uint64 + for _, cpu := range c.CPUs { + nanoseconds += cpu.SystemNanosec + cpu.UserNanosec + } + return nanoseconds +} + +// ReadFileNoStat uses ioutil.ReadAll to read contents of entire file. +// This is similar to ioutil.ReadFile but without the call to os.Stat, because +// many files in /proc and /sys report incorrect file sizes (either 0 or 4096). +// Reads a max file size of 512kB. For files larger than this, a scanner +// should be used. +// COPIED FROM prometheus/procfs WHICH ALSO USES APACHE 2.0 +func ReadFileNoStat(filename string) ([]byte, error) { + const maxBufferSize = 1024 * 512 + + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + + reader := io.LimitReader(f, maxBufferSize) + return ioutil.ReadAll(reader) +} + + +// NewCPUAcct will locate and read the kernel's cpu accounting info for +// the provided systemd cgroup subpath. +func (fs FS) NewCPUAcct(cgSubpath string) (*CPUAcct, error) { + var cpuUsage CPUAcct + + cgPath, err := fs.cgGetPath("cpu", cgSubpath, "cpuacct.usage_all") + if err != nil { + return nil, errors.Wrapf(err, "unable to get cpu controller path") + } + + // Example cpuacct.usage_all + // cpu user system + // 0 21165924 0 + // 1 13334251 0 + b, err := ReadFileNoStat(cgPath) + if err != nil { + return nil, errors.Wrapf(err, "unable to read file %s", cgPath) + } + + scanner := bufio.NewScanner(bytes.NewReader(b)) + if ok := scanner.Scan(); !ok { + return nil, errors.Errorf("unable to scan file %s", cgPath) + } + if err := scanner.Err(); err != nil { + return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) + } + for scanner.Scan() { + if err := scanner.Err(); err != nil { + return nil, errors.Wrapf(err, "unable to scan file %s", cgPath) + } + text := scanner.Text() + vals := strings.Split(text, " ") + if len(vals) != 3 { + return nil, errors.Errorf("unable to parse contents of file %s", cgPath) + } + cpu, err := strconv.ParseUint(vals[0], 10, 32) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse %s as uint32 (from %s)", vals[0], cgPath) + } + user, err := strconv.ParseUint(vals[1], 10, 64) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse %s as uint64 (from %s)", vals[1], cgPath) + } + sys, err := strconv.ParseUint(vals[2], 10, 64) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse %s as an in (from %s)", vals[2], cgPath) + } + onecpu := CPUUsage{ + CPUId: uint32(cpu), + UserNanosec: user, + SystemNanosec: sys, + } + cpuUsage.CPUs = append(cpuUsage.CPUs, onecpu) + } + if len(cpuUsage.CPUs) < 1 { + return nil, errors.Errorf("no CPU/core info extracted from %s", cgPath) + } + + return &cpuUsage, nil +} From dc57119487e1d97cdd7948d4eb7b8db4bda6c3d8 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 18:13:14 -0500 Subject: [PATCH 40/57] Tiny bit more testing --- cgroup/cgroup_test.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go index 6699e45..12a444e 100644 --- a/cgroup/cgroup_test.go +++ b/cgroup/cgroup_test.go @@ -1,14 +1,32 @@ package cgroup -import "testing" +import ( + "os" + "testing" +) const ( testFixturesHybrid = "fixtures/cgroup-hybrid" ) +func TestMountModeParsing(t *testing.T) { + // This test cannot (easily) use test fixtures, because it relies on being + // able to call Statfs on mounted filesystems. So we only run inside + // system where we expect to find cgroupfs mounted in a mode systemd expects. + // For now, that's only inside TravisCI, but in future we may expand to run + // this by default on certain Linux systems + if _, inTravisCI := os.LookupEnv("TRAVIS"); inTravisCI == false { + return + } + + if _, err := NewDefaultFS(); err != nil { + t.Errorf("expected success determining mount type inside of travis CI: %s", err) + } +} + func TestNewFS(t *testing.T) { if _, err := newFS("foobar", unifModeUnknown); err == nil { - t.Error("want newFS to fail for non-existing path") + t.Error("newFS should have failed with non-existing path") } if _, err := newFS("cgroups_test.go", unifModeUnknown); err == nil { From e438c3f2d50446c83dc31ad1bf9ce812a9ebf263 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sat, 1 Feb 2020 18:21:26 -0500 Subject: [PATCH 41/57] Better code organization --- cgroup/cgroup_test.go | 25 ------------------------- cgroup/cpuacct_test.go | 37 +++++++++++++++++++++++++++++++++++++ systemd/systemd.go | 1 + 3 files changed, 38 insertions(+), 25 deletions(-) create mode 100644 cgroup/cpuacct_test.go diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go index 12a444e..08e870e 100644 --- a/cgroup/cgroup_test.go +++ b/cgroup/cgroup_test.go @@ -64,29 +64,4 @@ func TestCgSubpath(t *testing.T) { } } -func TestRootCPUAcct(t *testing.T) { - fs := getHybridFixtures(t) - cpu, err := fs.NewCPUAcct("/") - if err != nil { - t.Error("want NewCPUAcct('/') to succeed") - } - if len(cpu.CPUs) != 4 { - t.Errorf("Wrong number of CPUs. Wanted %d got %d", 4, len(cpu.CPUs)) - } - - var expectedUser uint64 = 29531441016368 - if cpu.UsageUserNanosecs() != expectedUser { - t.Errorf("Wrong user nanoseconds. Wanted %d got %d", expectedUser, cpu.UsageUserNanosecs()) - } - - var expectedSys uint64 = 619186701953 - if cpu.UsageSystemNanosecs() != expectedSys { - t.Errorf("Wrong sys nanoseconds. Wanted %d got %d",expectedSys, cpu.UsageSystemNanosecs()) - } - - expectedTotal := expectedSys + expectedUser - if cpu.UsageAllNanosecs() != expectedTotal { - t.Errorf("Wrong total nanoseconds. Wanted %d got %d",expectedTotal, cpu.UsageAllNanosecs()) - } -} diff --git a/cgroup/cpuacct_test.go b/cgroup/cpuacct_test.go new file mode 100644 index 0000000..d4bdf7c --- /dev/null +++ b/cgroup/cpuacct_test.go @@ -0,0 +1,37 @@ +package cgroup + +import "testing" + + +func TestNewCPUAcct(t *testing.T) { + fs := getHybridFixtures(t) + cpu, err := fs.NewCPUAcct("/") + if err != nil { + t.Error("want NewCPUAcct('/') to succeed") + } + + if len(cpu.CPUs) != 4 { + t.Errorf("Wrong number of CPUs. Wanted %d got %d", 4, len(cpu.CPUs)) + } + + var expectedUser uint64 = 29531441016368 + if cpu.UsageUserNanosecs() != expectedUser { + t.Errorf("Wrong user nanoseconds. Wanted %d got %d", expectedUser, cpu.UsageUserNanosecs()) + } + + var expectedSys uint64 = 619186701953 + if cpu.UsageSystemNanosecs() != expectedSys { + t.Errorf("Wrong sys nanoseconds. Wanted %d got %d",expectedSys, cpu.UsageSystemNanosecs()) + } + + expectedTotal := expectedSys + expectedUser + if cpu.UsageAllNanosecs() != expectedTotal { + t.Errorf("Wrong total nanoseconds. Wanted %d got %d",expectedTotal, cpu.UsageAllNanosecs()) + } + + if _, err := fs.NewCPUAcct("foobar"); err == nil { + t.Errorf("expected error getting cpu accounting info for bogus cgroup") + } +} + + diff --git a/systemd/systemd.go b/systemd/systemd.go index 7cc5cd7..7c4b570 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -3,6 +3,7 @@ package systemd import ( "fmt" "math" + // Register pprof-over-http handlers _ "net/http/pprof" "regexp" From 5b0ba07593bc08675dab44ea10a14d3897255460 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sun, 2 Feb 2020 15:27:02 -0500 Subject: [PATCH 42/57] Read memory.stat from cgroup --- cgroup/Readme.md | 0 .../fixtures/cgroup-hybrid/memory/memory.stat | 36 +++ cgroup/memory.go | 234 ++++++++++++++++++ cgroup/memory_test.go | 58 +++++ 4 files changed, 328 insertions(+) create mode 100644 cgroup/Readme.md create mode 100644 cgroup/fixtures/cgroup-hybrid/memory/memory.stat create mode 100644 cgroup/memory.go create mode 100644 cgroup/memory_test.go diff --git a/cgroup/Readme.md b/cgroup/Readme.md new file mode 100644 index 0000000..e69de29 diff --git a/cgroup/fixtures/cgroup-hybrid/memory/memory.stat b/cgroup/fixtures/cgroup-hybrid/memory/memory.stat new file mode 100644 index 0000000..a2fd567 --- /dev/null +++ b/cgroup/fixtures/cgroup-hybrid/memory/memory.stat @@ -0,0 +1,36 @@ +cache 69984256 +rss 4866048 +rss_huge 0 +shmem 491520 +mapped_file 9818112 +dirty 8192 +writeback 0 +swap 0 +pgpgin 397887 +pgpgout 379613 +pgfault 541883 +pgmajfault 232 +inactive_anon 4096 +active_anon 5353472 +inactive_file 2621440 +active_file 63873024 +unevictable 2998272 +hierarchical_memory_limit 9223372036854771712 +hierarchical_memsw_limit 9223372036854771712 +total_cache 12469047296 +total_rss 2168885248 +total_rss_huge 10485760 +total_shmem 13168640 +total_mapped_file 228769792 +total_dirty 573440 +total_writeback 0 +total_swap 0 +total_pgpgin 135633232 +total_pgpgout 132074848 +total_pgfault 96879883 +total_pgmajfault 24509 +total_inactive_anon 11632640 +total_active_anon 2134667264 +total_inactive_file 9267785728 +total_active_file 3208708096 +total_unevictable 15052800 diff --git a/cgroup/memory.go b/cgroup/memory.go new file mode 100644 index 0000000..6b60114 --- /dev/null +++ b/cgroup/memory.go @@ -0,0 +1,234 @@ +package cgroup + +import ( + "bufio" + "bytes" + "fmt" + "github.com/pkg/errors" + "io" + "strconv" + "strings" +) + +// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt +// memory.stat file +type MemStat struct { + // bytes of page cache memory + Cache uint64 + // bytes of anon and swap cache, including transparent hugepages. + // Note: Only anonymous and swap cache memory is listed as part of 'rss' stat. + // This should not be confused with the true 'resident set size' or the + // amount of physical memory used by the cgroup. 'rss + file_mapped" will + // give you resident set size of cgroup + Rss uint64 + // bytes of anonymous transparent hugepages + RssHuge uint64 + // No kernel documentation + Shmem uint64 + // bytes of mapped files (includes tmpfs/shmem) + MappedFile uint64 + // number of charging events to the memory cgroup. The charging + // event happens each time a page is accounted as either mapped + // anon page(RSS) or cache page(Page Cache) to the cgroup. + PgPgIn uint64 + // # of uncharging events to the memory cgroup. The uncharging + // event happens each time a page is unaccounted from the cgroup. + PgPgOut uint64 + // no kernel documentation + PgFault uint64 + // no kernel documentation + PgMajFault uint64 + // # of bytes of swap usage + Swap uint64 + // # of bytes that are waiting to get written back to the disk. + Dirty uint64 + // writeback - # of bytes of file/anon cache that are queued for syncing to + // disk. + Writeback uint64 + // inactive_anon - # of bytes of anonymous and swap cache memory on inactive + // LRU list. + InactiveAnon uint64 + // active_anon - # of bytes of anonymous and swap cache memory on active + // LRU list. + ActiveAnon uint64 + // inactive_file - # of bytes of file-backed memory on inactive LRU list. + InactiveFile uint64 + // active_file - # of bytes of file-backed memory on active LRU list. + ActiveFile uint64 + // unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). + Unevictable uint64 + + // status considering hierarchy (see memory.use_hierarchy settings) + // # of bytes of memory limit with regard to hierarchy + // under which the memory cgroup is + HierarchialMemoryLimit uint64 + // # of bytes of memory+swap limit with regard to + // hierarchy under which memory cgroup is. + HierarchialMemswLimit uint64 + // total_cache - sum of all children's "cache" + TotalCache uint64 + // No kernel doc + TotalDirty uint64 + // total_rss - sum of all children's "rss" + TotalRss uint64 + // No kernel docs + TotalRssHuge uint64 + // total_mapped_file - sum of all children's "cache" + TotalMappedFile uint64 + // No kernel docs + TotalPgFault uint64 + // No kernel docs + TotalPgMajFault uint64 + // total_pgpgout - sum of all children's "pgpgout" + TotalPgPgIn uint64 + // total_pgpgout - sum of all children's "pgpgout" + TotalPgPgOut uint64 + // No kernel doc + TotalShmem uint64 + // total_swap - sum of all children's "swap" + TotalSwap uint64 + // total_inactive_anon - sum of all children's "inactive_anon" + TotalInactiveAnon uint64 + // total_active_anon - sum of all children's "active_anon" + TotalActiveAnon uint64 + // total_inactive_file - sum of all children's "inactive_file" + TotalInactiveFile uint64 + // total_active_file - sum of all children's "active_file" + TotalActiveFile uint64 + // total_unevictable - sum of all children's "unevictable" + TotalUnevictable uint64 + // No kernel doc + TotalWriteback uint64 + // # The following additional stats are dependent on CONFIG_DEBUG_VM. + // inactive_ratio - VM internal parameter. (see mm/page_alloc.c) + // recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) + // recent_rotated_file - VM internal parameter. (see mm/vmscan.c) + // recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) + // recent_scanned_file - VM internal parameter. (see mm/vmscan.c) + +} + +func parseMemStat(r io.Reader) (*MemStat, error) { + var m MemStat + s := bufio.NewScanner(r) + for s.Scan() { + // Each line has at least a name and value + fields := strings.Fields(s.Text()) + if len(fields) < 2 { + return nil, fmt.Errorf("malformed memory.stat line: %q", s.Text()) + } + + v, err := strconv.ParseUint(fields[1], 0, 64) + if err != nil { + return nil, err + } + + switch fields[0] { + case "cache": + m.Cache = v + case "rss": + m.Rss = v + case "rss_huge": + m.RssHuge = v + case "shmem": + m.Shmem = v + case "mapped_file": + m.MappedFile = v + case "dirty": + m.Dirty = v + case "writeback": + m.Writeback = v + case "swap": + m.Swap = v + case "pgpgin": + m.PgPgIn = v + case "pgpgout": + m.PgPgOut = v + case "pgfault": + m.PgFault = v + case "pgmajfault": + m.PgMajFault = v + case "inactive_anon": + m.InactiveAnon = v + case "active_anon": + m.ActiveAnon = v + case "inactive_file": + m.InactiveFile = v + case "active_file": + m.ActiveFile = v + case "unevictable": + m.Unevictable = v + case "hierarchical_memory_limit": + m.HierarchialMemoryLimit = v + case "hierarchical_memsw_limit": + m.HierarchialMemswLimit = v + case "total_cache": + m.TotalCache = v + case "total_rss": + m.TotalRss = v + case "total_rss_huge": + m.TotalRssHuge = v + case "total_shmem": + m.TotalShmem = v + case "total_mapped_file": + m.TotalMappedFile = v + case "total_dirty": + m.TotalDirty = v + case "total_writeback": + m.TotalWriteback = v + case "total_swap": + m.TotalSwap = v + case "total_pgpgin": + m.TotalPgPgIn = v + case "total_pgpgout": + m.TotalPgPgOut = v + case "total_pgfault": + m.TotalPgFault = v + case "total_pgmajfault": + m.TotalPgMajFault = v + case "total_inactive_anon": + m.TotalInactiveAnon = v + case "total_inactive_file": + m.TotalInactiveFile = v + case "total_active_anon": + m.TotalActiveAnon = v + case "total_active_file": + m.TotalActiveFile = v + case "total_unevictable": + m.TotalUnevictable = v + } + } + + return &m, nil +} + +// NewMemStat will locate and read the kernel's cpu accounting info for +// the provided systemd cgroup subpath. +func NewMemStat(cgSubpath string) (MemStat, error) { + fs, err := NewDefaultFS() + if err != nil { + return MemStat{}, err + } + return fs.NewMemStat(cgSubpath) +} + +// MemStat returns an information about cgroup memory statistics. +// See +func (fs FS) NewMemStat(cgSubpath string) (MemStat, error) { + cgPath, err := fs.cgGetPath("memory", cgSubpath, "memory.stat") + if err != nil { + return MemStat{}, errors.Wrapf(err, "unable to get cpu controller path") + } + + b, err := ReadFileNoStat(cgPath) + if err != nil { + return MemStat{}, err + } + + m, err := parseMemStat(bytes.NewReader(b)) + if err != nil { + return MemStat{}, fmt.Errorf("failed to parse meminfo: %v", err) + } + + return *m, nil +} diff --git a/cgroup/memory_test.go b/cgroup/memory_test.go new file mode 100644 index 0000000..00eb13b --- /dev/null +++ b/cgroup/memory_test.go @@ -0,0 +1,58 @@ +package cgroup + +import ( + "reflect" + "testing" +) + +func TestMemStat(t *testing.T) { + expected := MemStat{ + Cache: 69984256, + Rss: 4866048, + RssHuge: 0, + Shmem: 491520, + MappedFile: 9818112, + Dirty: 8192, + Writeback: 0, + Swap: 0, + PgPgIn: 397887, + PgPgOut: 379613, + PgFault: 541883, + PgMajFault: 232, + InactiveAnon: 4096, + ActiveAnon: 5353472, + InactiveFile: 2621440, + ActiveFile: 63873024, + Unevictable: 2998272, + + HierarchialMemoryLimit: 9223372036854771712, + HierarchialMemswLimit: 9223372036854771712, + TotalCache: 12469047296, + TotalRss: 2168885248, + TotalRssHuge: 10485760, + TotalShmem: 13168640, + TotalMappedFile: 228769792, + TotalDirty: 573440, + TotalWriteback: 0, + TotalSwap: 0, + TotalPgPgIn: 135633232, + TotalPgPgOut: 132074848, + TotalPgFault: 96879883, + TotalPgMajFault: 24509, + TotalInactiveAnon: 11632640, + TotalActiveAnon: 2134667264, + TotalInactiveFile: 9267785728, + TotalActiveFile: 3208708096, + TotalUnevictable: 15052800} + + have, err := getHybridFixtures(t).NewMemStat("/") + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(have, expected) { + t.Logf("have: %+v", have) + t.Logf("expected: %+v", expected) + t.Errorf("structs are not equal") + } +} From b93d38d568759d5acf69ad67b1eb5c2102487774 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sun, 2 Feb 2020 15:27:28 -0500 Subject: [PATCH 43/57] Linting --- cgroup/Readme.md | 49 ++++++++++++++++++++++++++++++++++++++++++ cgroup/cgroup.go | 2 -- cgroup/cgroup_test.go | 16 ++++++-------- cgroup/cpuacct.go | 1 - cgroup/cpuacct_test.go | 7 ++---- 5 files changed, 58 insertions(+), 17 deletions(-) diff --git a/cgroup/Readme.md b/cgroup/Readme.md index e69de29..01dfd92 100644 --- a/cgroup/Readme.md +++ b/cgroup/Readme.md @@ -0,0 +1,49 @@ + +This package provides functions to retrieve control group metrics from the pseudo-filesystem `/sys/cgroup/`. + +**WARNING:** This package is a work in progress. Its API may still break in backwards-incompatible ways without warnings. Use it at your own risk. + +The Linux kernel supports two APIs for userspace to interact with control groups, the v1 API and the v2 API. See +[this LWN Article](https://lwn.net/Articles/679786/) or +[this kernel documentation](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#deprecated-v1-core-features) +for background on the two APIs. This package will interact with both v1 and v2 APIs. + + +### Focus on Systemd + +This package is initially focused on reading metrics for systemd units. Therefore, +the following systemd documentation is relevant. + +#### Systemd cgroup mount mode + +The kernel can mount the cgroupfs in any manner it chooses. However, anyone wanting to use that cgroupfs must know +where/how it is mounted. When there was only one cgroup API, it was always mounted at `/sys/fs/cgroup`. With the +transition from v1 to v2, the mounting approach differs per-distro, with some mounting only v2, some mounting only +v1(all hierarchies), and some mounting a combination. For simplicity, this package initially focuses on the three +mount "modes" supported by systemd: + +via [systemd.io](https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-) + +1. Unified — this is the simplest mode, and exposes a pure cgroup v2 logic +2. Legacy — this is the traditional cgroup v1 mode. In this mode the various controllers each get their own cgroup + file system mounted to `/sys/fs/cgroup//` +3. Hybrid — this is a hybrid between the unified and legacy mode. It’s set up mostly like legacy + +#### Systemd Supported Controllers + +The initial target controllers this package aims to read from are the controllers supported by systemd. Reading from +other controllers may be supported in the future. Systemd guarantees that all v1 hierarchies are kept in sync. + +Via [systemd.io](https://systemd.io/CGROUP_DELEGATION/#controller-support): + +Systemd supports a number of controllers (but not all). Specifically, supported are: + +on cgroup v1: cpu, cpuacct, blkio, memory, devices, pids +on cgroup v2: cpu, io, memory, pids + +It is our intention to natively support all cgroup v2 controllers as they are added +to the kernel. However, regarding cgroup v1: at this point we will not add support +for any other controllers anymore. This means systemd currently does not and will +never manage the following controllers on cgroup v1: freezer, cpuset, net_cls, +perf_event, net_prio, hugetlb + diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go index da2551c..a0addc0 100644 --- a/cgroup/cgroup.go +++ b/cgroup/cgroup.go @@ -169,5 +169,3 @@ func (fs FS) cgGetPath(controller string, subpath string, suffix string) (string } return joined, nil } - - diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go index 08e870e..94d040c 100644 --- a/cgroup/cgroup_test.go +++ b/cgroup/cgroup_test.go @@ -10,13 +10,13 @@ const ( ) func TestMountModeParsing(t *testing.T) { - // This test cannot (easily) use test fixtures, because it relies on being - // able to call Statfs on mounted filesystems. So we only run inside - // system where we expect to find cgroupfs mounted in a mode systemd expects. - // For now, that's only inside TravisCI, but in future we may expand to run - // this by default on certain Linux systems + // This test cannot (easily) use test fixtures, because it relies on being + // able to call Statfs on mounted filesystems. So we only run inside + // system where we expect to find cgroupfs mounted in a mode systemd expects. + // For now, that's only inside TravisCI, but in future we may expand to run + // this by default on certain Linux systems if _, inTravisCI := os.LookupEnv("TRAVIS"); inTravisCI == false { - return + return } if _, err := NewDefaultFS(); err != nil { @@ -50,7 +50,7 @@ func TestCgSubpath(t *testing.T) { fs := getHybridFixtures(t) fs.cgroupUnified = unifModeUnknown - if _,err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all"); err == nil { + if _, err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all"); err == nil { t.Error("should not be able to determine path with unknown mount mode") } fs.cgroupUnified = unifModeSystemd @@ -63,5 +63,3 @@ func TestCgSubpath(t *testing.T) { t.Errorf("bad response. Wanted %s, got %s", want, path) } } - - diff --git a/cgroup/cpuacct.go b/cgroup/cpuacct.go index 3786d3b..e64eeb7 100644 --- a/cgroup/cpuacct.go +++ b/cgroup/cpuacct.go @@ -85,7 +85,6 @@ func ReadFileNoStat(filename string) ([]byte, error) { return ioutil.ReadAll(reader) } - // NewCPUAcct will locate and read the kernel's cpu accounting info for // the provided systemd cgroup subpath. func (fs FS) NewCPUAcct(cgSubpath string) (*CPUAcct, error) { diff --git a/cgroup/cpuacct_test.go b/cgroup/cpuacct_test.go index d4bdf7c..fdd4c9a 100644 --- a/cgroup/cpuacct_test.go +++ b/cgroup/cpuacct_test.go @@ -2,7 +2,6 @@ package cgroup import "testing" - func TestNewCPUAcct(t *testing.T) { fs := getHybridFixtures(t) cpu, err := fs.NewCPUAcct("/") @@ -21,17 +20,15 @@ func TestNewCPUAcct(t *testing.T) { var expectedSys uint64 = 619186701953 if cpu.UsageSystemNanosecs() != expectedSys { - t.Errorf("Wrong sys nanoseconds. Wanted %d got %d",expectedSys, cpu.UsageSystemNanosecs()) + t.Errorf("Wrong sys nanoseconds. Wanted %d got %d", expectedSys, cpu.UsageSystemNanosecs()) } expectedTotal := expectedSys + expectedUser if cpu.UsageAllNanosecs() != expectedTotal { - t.Errorf("Wrong total nanoseconds. Wanted %d got %d",expectedTotal, cpu.UsageAllNanosecs()) + t.Errorf("Wrong total nanoseconds. Wanted %d got %d", expectedTotal, cpu.UsageAllNanosecs()) } if _, err := fs.NewCPUAcct("foobar"); err == nil { t.Errorf("expected error getting cpu accounting info for bogus cgroup") } } - - From fd79e1d7ec9fdd72d776352ee2ff96223df633c4 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sun, 2 Feb 2020 15:44:02 -0500 Subject: [PATCH 44/57] Add memory stats to systemd_exporter --- systemd/systemd.go | 73 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/systemd/systemd.go b/systemd/systemd.go index 7c4b570..ab41588 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -57,6 +57,8 @@ type Collector struct { socketRefusedConnectionsDesc *prometheus.Desc cpuTotalDesc *prometheus.Desc unitCPUTotal *prometheus.Desc + + unitMemCache *prometheus.Desc openFDs *prometheus.Desc maxFDs *prometheus.Desc vsize *prometheus.Desc @@ -136,6 +138,12 @@ func NewCollector(logger log.Logger) (*Collector, error) { []string{"name", "type", "mode"}, nil, ) + unitMemCache := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_cached_bytes"), + "Unit Memory Cached in bytes", + []string{"name", "type"}, nil, + ) + openFDs := prometheus.NewDesc( prometheus.BuildFQName(namespace, "", "process_open_fds"), "Number of open file descriptors.", @@ -181,6 +189,7 @@ func NewCollector(logger log.Logger) (*Collector, error) { socketRefusedConnectionsDesc: socketRefusedConnectionsDesc, cpuTotalDesc: cpuTotalDesc, unitCPUTotal: unitCPUTotal, + unitMemCache: unitMemCache, openFDs: openFDs, maxFDs: maxFDs, vsize: vsize, @@ -301,6 +310,10 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } + err = c.collectUnitMemUsageMetrics("Service", conn, ch, unit) + if err != nil { + logger.Warnf(errUnitMetricsMsg, err) + } case strings.HasSuffix(unit.Name, ".mount"): err = c.collectMountMetainfo(conn, ch, unit) if err != nil { @@ -583,6 +596,66 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, return nil } +// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their +// resource usage from the kernel's cgroupfs cpu hierarchy. The only change is which dbus item we are querying +func (c *Collector) collectUnitMemUsageMetrics(unitType string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { + propCGSubpath, err := conn.GetUnitTypeProperty(unit.Name, unitType, "ControlGroup") + if err != nil { + return errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") + } + cgSubpath, ok := propCGSubpath.Value.Value().(string) + if !ok { + return errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) + } + + switch { + case cgSubpath == "" && unit.ActiveState == "inactive", + cgSubpath == "" && unit.ActiveState == "failed": + // Expected condition, systemd has cleaned up and + // we have nothing to record + return nil + case cgSubpath == "" && unit.ActiveState == "active": + // Unexpected. Why is there no cgroup on an active unit? + subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) + slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) + return errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) + case cgSubpath == "": + // We are likely reading a unit that is currently changing state, so + // we record this and bail + subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) + slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) + log.Debugf("Read 'no cgroup' from unit (name=%s state=%s subtype=%s slice=%s) ", unit.Name, unit.ActiveState, subType, slice) + return nil + } + + propMemAcct, err := conn.GetUnitTypeProperty(unit.Name, unitType, "MemoryAccounting") + if err != nil { + return errors.Wrapf(err, errGetPropertyMsg, "MemoryAccounting") + } + memAcct, ok := propMemAcct.Value.Value().(bool) + if !ok { + return errors.Errorf(errConvertStringPropertyMsg, "MemoryAccounting", propMemAcct.Value.Value()) + } + if !memAcct { + return nil + } + + memStat, err := cgroup.NewMemStat(cgSubpath) + if err != nil { + // if unitType == "Socket" { + // log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) + // return nil + // } + return errors.Wrapf(err, errControlGroupReadMsg, "Memory stat") + } + + ch <- prometheus.MustNewConstMetric( + c.unitMemCache, prometheus.GaugeValue, + float64(memStat.Cache), unit.Name, parseUnitType(unit), "user") + + return nil +} + func (c *Collector) collectSocketConnMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted") if err != nil { From f760287c883ca51db96188ddef90e8337f08ef5e Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sun, 2 Feb 2020 15:49:58 -0500 Subject: [PATCH 45/57] Enable memory accounting in travis CI --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index c753ec2..c13eed5 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,7 @@ deps: test: ifdef TRAVIS sudo sh -c 'echo DefaultCPUAccounting=yes >> /etc/systemd/system.conf' + sudo sh -c 'echo DefaultMemoryAccounting=yes >> /etc/systemd/system.conf' sudo systemctl daemon-reload endif go test $(TEST_FLAGS) ./... From 2e5121933a0448d04ca4b50002ead860ad263045 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Sun, 2 Feb 2020 15:54:07 -0500 Subject: [PATCH 46/57] Fix bug caused by typo --- systemd/systemd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index ab41588..e46e9ea 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -651,7 +651,7 @@ func (c *Collector) collectUnitMemUsageMetrics(unitType string, conn *dbus.Conn, ch <- prometheus.MustNewConstMetric( c.unitMemCache, prometheus.GaugeValue, - float64(memStat.Cache), unit.Name, parseUnitType(unit), "user") + float64(memStat.Cache), unit.Name, parseUnitType(unit)) return nil } From 4fa6131b71d6ef12278f56e171b9f2a5d2eded6e Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 00:39:15 -0500 Subject: [PATCH 47/57] Switch unit handling on parsed type --- systemd/systemd.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index e46e9ea..adb2b7f 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -3,6 +3,7 @@ package systemd import ( "fmt" "math" + "os" // Register pprof-over-http handlers _ "net/http/pprof" @@ -271,15 +272,15 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un logger := c.logger.With("unit", unit.Name) - // Collect unit_state for all + // Collect unit_state for all unit types err := c.collectUnitState(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) // TODO should we continue processing here? } - switch { - case strings.HasSuffix(unit.Name, ".service"): + switch parseUnitType(unit) { + case "service": err = c.collectServiceMetainfo(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) @@ -314,7 +315,7 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - case strings.HasSuffix(unit.Name, ".mount"): + case "mount": err = c.collectMountMetainfo(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) @@ -323,12 +324,12 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - case strings.HasSuffix(unit.Name, ".timer"): + case "timer": err := c.collectTimerTriggerTime(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - case strings.HasSuffix(unit.Name, ".socket"): + case "socket": err := c.collectSocketConnMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) @@ -339,13 +340,13 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - case strings.HasSuffix(unit.Name, ".swap"): err = c.collectUnitCPUUsageMetrics("Swap", conn, ch, unit) + case "swap": if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - case strings.HasSuffix(unit.Name, ".slice"): err = c.collectUnitCPUUsageMetrics("Slice", conn, ch, unit) + case "slice": if err != nil { logger.Warnf(errUnitMetricsMsg, err) } From a02aeeda386fa3ffa17ecf21da83cb17f3aa8ecc Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 00:47:21 -0500 Subject: [PATCH 48/57] Generalize code relying on cgroups --- systemd/systemd.go | 120 +++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index adb2b7f..4a7399f 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -234,6 +234,13 @@ func parseUnitType(unit dbus.UnitStatus) string { return t[len(t)-1] } +// parseUnitTypeInterface extracts the dbus interface suffix for the interface unique to the passed unit type. +// For example, a systemd "service unit" will be are exposed on dbus as "service objects", and all "service objects" +// implement the org.freedesktop.systemd1.Service interface. This is used as input for dbus.GetUnitTypeProperty +func parseUnitTypeInterface(unit dbus.UnitStatus) string { + return strings.Title(parseUnitType(unit)) +} + func (c *Collector) collect(ch chan<- prometheus.Metric) error { begin := time.Now() conn, err := c.newDbus() @@ -307,11 +314,22 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUUsageMetrics("Service", conn, ch, unit) + + cgroupPath, err := c.getControlGroup(conn, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitMemUsageMetrics("Service", conn, ch, unit) + + // Everything below requires a cgroup + if cgroupPath == nil { + break + } + + err = c.collectUnitCPUMetrics(*cgroupPath, conn, ch, unit) + if err != nil { + logger.Warnf(errUnitMetricsMsg, err) + } + err = c.collectUnitMemMetrics("Service", conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } @@ -320,7 +338,7 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUUsageMetrics("Mount", conn, ch, unit) + err = c.collectUnitCPUMetrics("Mount", conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } @@ -336,17 +354,17 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un } // Most sockets do not have a cpu cgroupfs entry, but a // few do, notably docker.socket - err = c.collectUnitCPUUsageMetrics("Socket", conn, ch, unit) + err = c.collectUnitCPUMetrics("Socket", conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUUsageMetrics("Swap", conn, ch, unit) case "swap": + err = c.collectUnitCPUMetrics("Swap", conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUUsageMetrics("Slice", conn, ch, unit) case "slice": + err = c.collectUnitCPUMetrics("Slice", conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } @@ -531,16 +549,15 @@ func (c *Collector) mustGetUnitStringTypeProperty(unitType string, return propVal } -// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their -// resource usage from the kernel's cgroupfs cpu hierarchy. The only change is which dbus item we are querying -func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { - propCGSubpath, err := conn.GetUnitTypeProperty(unit.Name, unitType, "ControlGroup") +func (c *Collector) getControlGroup(conn *dbus.Conn, unit dbus.UnitStatus) (*string, error) { + unitTypeInterface := parseUnitTypeInterface(unit) + propCGSubpath, err := conn.GetUnitTypeProperty(unit.Name, unitTypeInterface, "ControlGroup") if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") + return nil, errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") } cgSubpath, ok := propCGSubpath.Value.Value().(string) if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) + return nil, errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) } switch { @@ -548,22 +565,29 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, cgSubpath == "" && unit.ActiveState == "failed": // Expected condition, systemd has cleaned up and // we have nothing to record - return nil + return nil, nil case cgSubpath == "" && unit.ActiveState == "active": // Unexpected. Why is there no cgroup on an active unit? - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) - return errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) + subType := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Type", "unknown", conn, unit) + slice := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Slice", "unknown", conn, unit) + return nil, errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) case cgSubpath == "": // We are likely reading a unit that is currently changing state, so // we record this and bail - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) + subType := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Type", "unknown", conn, unit) + slice := c.mustGetUnitStringTypeProperty(unitTypeInterface, "Slice", "unknown", conn, unit) log.Debugf("Read 'no cgroup' from unit (name=%s state=%s subtype=%s slice=%s) ", unit.Name, unit.ActiveState, subType, slice) - return nil + return nil, nil + default: + return &cgSubpath, nil } +} - propCPUAcct, err := conn.GetUnitTypeProperty(unit.Name, unitType, "CPUAccounting") +// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their +// resource usage from the kernel's cgroupfs cpu hierarchy. The only change is which dbus item we are querying +func (c *Collector) collectUnitCPUMetrics(cgSubpath string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { + unitTypeInterface := parseUnitTypeInterface(unit) + propCPUAcct, err := conn.GetUnitTypeProperty(unit.Name, unitTypeInterface, "CPUAccounting") if err != nil { return errors.Wrapf(err, errGetPropertyMsg, "CPUAccounting") } @@ -577,7 +601,7 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, cpuUsage, err := cgroup.NewCPUAcct(cgSubpath) if err != nil { - if unitType == "Socket" { + if unitTypeInterface == "Socket" { log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) return nil } @@ -597,52 +621,18 @@ func (c *Collector) collectUnitCPUUsageMetrics(unitType string, conn *dbus.Conn, return nil } -// A number of unit types support the 'ControlGroup' property needed to allow us to directly read their -// resource usage from the kernel's cgroupfs cpu hierarchy. The only change is which dbus item we are querying -func (c *Collector) collectUnitMemUsageMetrics(unitType string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { - propCGSubpath, err := conn.GetUnitTypeProperty(unit.Name, unitType, "ControlGroup") - if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "ControlGroup") - } - cgSubpath, ok := propCGSubpath.Value.Value().(string) - if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "ControlGroup", propCGSubpath.Value.Value()) - } - - switch { - case cgSubpath == "" && unit.ActiveState == "inactive", - cgSubpath == "" && unit.ActiveState == "failed": - // Expected condition, systemd has cleaned up and - // we have nothing to record - return nil - case cgSubpath == "" && unit.ActiveState == "active": - // Unexpected. Why is there no cgroup on an active unit? - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) - return errors.Errorf("got 'no cgroup' from systemd for active unit (state=%s subtype=%s slice=%s)", unit.ActiveState, subType, slice) - case cgSubpath == "": - // We are likely reading a unit that is currently changing state, so - // we record this and bail - subType := c.mustGetUnitStringTypeProperty(unitType, "Type", "unknown", conn, unit) - slice := c.mustGetUnitStringTypeProperty(unitType, "Slice", "unknown", conn, unit) - log.Debugf("Read 'no cgroup' from unit (name=%s state=%s subtype=%s slice=%s) ", unit.Name, unit.ActiveState, subType, slice) - return nil - } - - propMemAcct, err := conn.GetUnitTypeProperty(unit.Name, unitType, "MemoryAccounting") - if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "MemoryAccounting") - } - memAcct, ok := propMemAcct.Value.Value().(bool) - if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "MemoryAccounting", propMemAcct.Value.Value()) - } - if !memAcct { - return nil - } - +func (c *Collector) collectUnitMemMetrics(cgSubpath string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { + // Don't bother reading MemoryAccounting prop. It's faster to attempt a file read than to query dbus, and it works + // in more situations as well. For ex: case where + // such as kernel cmdline has cgroups_enabled=memory but systemd still has DefaultMemoryAccounting=no. All cgroups + // will have a memory.stat file, but systemd will still report MemoryAccounting=false for most units memStat, err := cgroup.NewMemStat(cgSubpath) if err != nil { + // Unable to open the file + if perr, ok := err.(*os.PathError); ok && perr.Op == "open" { + return nil + } + // if unitType == "Socket" { // log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) // return nil From 026ea0f34416c012b1deeec6c8c7f9d978643c33 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 00:49:59 -0500 Subject: [PATCH 49/57] Split cgroup metrics from dbus metrics This seems cleaner to follow than to add the calls inside specific metrics. For all of the cgroups metrics, if we know thay unit type has a cgroup we can just check the filesystem --- systemd/systemd.go | 67 +++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index 4a7399f..6ee79d3 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -286,50 +286,51 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un // TODO should we continue processing here? } + // Collect metrics from cgroups switch parseUnitType(unit) { - case "service": - err = c.collectServiceMetainfo(conn, ch, unit) + // Most sockets do not have a cpu cgroupfs entry, but a + // few do, notably docker.socket + case "service", "mount","socket", "swap", "slice": + cgroupPath, err := c.getControlGroup(conn, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - - err = c.collectServiceStartTimeMetrics(conn, ch, unit) + // Everything below requires a cgroup + if cgroupPath == nil { + break + } + err = c.collectUnitCPUMetrics(*cgroupPath, conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - - if *enableRestartsMetrics { - err = c.collectServiceRestartCount(conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - } - - err = c.collectServiceTasksMetrics(conn, ch, unit) + err = c.collectUnitMemMetrics(*cgroupPath, conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } + } - err = c.collectServiceProcessMetrics(conn, ch, unit) + // Collect metrics from dbus + switch parseUnitType(unit) { + case "service": + err = c.collectServiceMetainfo(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - - cgroupPath, err := c.getControlGroup(conn, unit) + err = c.collectServiceStartTimeMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - - // Everything below requires a cgroup - if cgroupPath == nil { - break + if *enableRestartsMetrics { + err = c.collectServiceRestartCount(conn, ch, unit) + if err != nil { + logger.Warnf(errUnitMetricsMsg, err) + } } - - err = c.collectUnitCPUMetrics(*cgroupPath, conn, ch, unit) + err = c.collectServiceTasksMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitMemMetrics("Service", conn, ch, unit) + err = c.collectServiceProcessMetrics(conn, ch, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) } @@ -338,10 +339,6 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - err = c.collectUnitCPUMetrics("Mount", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } case "timer": err := c.collectTimerTriggerTime(conn, ch, unit) if err != nil { @@ -352,22 +349,6 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un if err != nil { logger.Warnf(errUnitMetricsMsg, err) } - // Most sockets do not have a cpu cgroupfs entry, but a - // few do, notably docker.socket - err = c.collectUnitCPUMetrics("Socket", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - case "swap": - err = c.collectUnitCPUMetrics("Swap", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } - case "slice": - err = c.collectUnitCPUMetrics("Slice", conn, ch, unit) - if err != nil { - logger.Warnf(errUnitMetricsMsg, err) - } default: c.logger.Debugf(infoUnitNoHandler, unit.Name) } From 5eadce7c5e46b30c2f950f27917d166735480fae Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 00:53:22 -0500 Subject: [PATCH 50/57] Stop reading CPUAccounting --- systemd/systemd.go | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index 6ee79d3..f769aa8 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -567,21 +567,14 @@ func (c *Collector) getControlGroup(conn *dbus.Conn, unit dbus.UnitStatus) (*str // A number of unit types support the 'ControlGroup' property needed to allow us to directly read their // resource usage from the kernel's cgroupfs cpu hierarchy. The only change is which dbus item we are querying func (c *Collector) collectUnitCPUMetrics(cgSubpath string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { + // Don't bother reading CPUAccounting prop. It's faster to attempt a file read than to query dbus, and it works + // in more situations as well unitTypeInterface := parseUnitTypeInterface(unit) - propCPUAcct, err := conn.GetUnitTypeProperty(unit.Name, unitTypeInterface, "CPUAccounting") - if err != nil { - return errors.Wrapf(err, errGetPropertyMsg, "CPUAccounting") - } - cpuAcct, ok := propCPUAcct.Value.Value().(bool) - if !ok { - return errors.Errorf(errConvertStringPropertyMsg, "CPUAccounting", propCPUAcct.Value.Value()) - } - if !cpuAcct { - return nil - } - cpuUsage, err := cgroup.NewCPUAcct(cgSubpath) if err != nil { + if perr, ok := err.(*os.PathError); ok && perr.Op == "open" { + return nil + } if unitTypeInterface == "Socket" { log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) return nil From d71f59aa4d33aae8adf25f5057f3b0ab86f323dd Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 01:01:26 -0500 Subject: [PATCH 51/57] Remove special-casing in generic collect unit CPU Better to handle this visibly at a top level --- systemd/systemd.go | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index f769aa8..8af02cd 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -288,8 +288,6 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un // Collect metrics from cgroups switch parseUnitType(unit) { - // Most sockets do not have a cpu cgroupfs entry, but a - // few do, notably docker.socket case "service", "mount","socket", "swap", "slice": cgroupPath, err := c.getControlGroup(conn, unit) if err != nil { @@ -301,7 +299,11 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un } err = c.collectUnitCPUMetrics(*cgroupPath, conn, ch, unit) if err != nil { - logger.Warnf(errUnitMetricsMsg, err) + // Most sockets do not have a cpu cgroupfs entry, but a few big ones do (notably docker.socket). Quiet down + // error reporting if error came from a socket + if parseUnitType(unit) != "socket" { + logger.Warnf(errUnitMetricsMsg, err) + } } err = c.collectUnitMemMetrics(*cgroupPath, conn, ch, unit) if err != nil { @@ -569,16 +571,11 @@ func (c *Collector) getControlGroup(conn *dbus.Conn, unit dbus.UnitStatus) (*str func (c *Collector) collectUnitCPUMetrics(cgSubpath string, conn *dbus.Conn, ch chan<- prometheus.Metric, unit dbus.UnitStatus) error { // Don't bother reading CPUAccounting prop. It's faster to attempt a file read than to query dbus, and it works // in more situations as well - unitTypeInterface := parseUnitTypeInterface(unit) cpuUsage, err := cgroup.NewCPUAcct(cgSubpath) if err != nil { if perr, ok := err.(*os.PathError); ok && perr.Op == "open" { return nil } - if unitTypeInterface == "Socket" { - log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) - return nil - } return errors.Wrapf(err, errControlGroupReadMsg, "CPU usage") } From 29df5f797d8725441df43015cc7c4bece9ae2a66 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 01:12:41 -0500 Subject: [PATCH 52/57] Add 3 more memory metrics --- systemd/systemd.go | 56 +++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index 8af02cd..0212131 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -59,12 +59,16 @@ type Collector struct { cpuTotalDesc *prometheus.Desc unitCPUTotal *prometheus.Desc - unitMemCache *prometheus.Desc - openFDs *prometheus.Desc - maxFDs *prometheus.Desc - vsize *prometheus.Desc - maxVsize *prometheus.Desc - rss *prometheus.Desc + unitMemCache *prometheus.Desc + unitMemRss *prometheus.Desc + unitMemDirty *prometheus.Desc + unitMemShmem *prometheus.Desc + + openFDs *prometheus.Desc + maxFDs *prometheus.Desc + vsize *prometheus.Desc + maxVsize *prometheus.Desc + rss *prometheus.Desc unitWhitelistPattern *regexp.Regexp unitBlacklistPattern *regexp.Regexp @@ -141,7 +145,22 @@ func NewCollector(logger log.Logger) (*Collector, error) { unitMemCache := prometheus.NewDesc( prometheus.BuildFQName(namespace, "", "unit_cached_bytes"), - "Unit Memory Cached in bytes", + "Unit Page Cache", + []string{"name", "type"}, nil, + ) + unitMemRss := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_rss_bytes"), + "Unit anon+swap cache, incl. transparent hugepages. Not true RSS", + []string{"name", "type"}, nil, + ) + unitMemDirty := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_dirty_bytes"), + "Unit bytes waiting to get written to disk", + []string{"name", "type"}, nil, + ) + unitMemShmem := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "unit_shmem_bytes"), + "", []string{"name", "type"}, nil, ) @@ -191,6 +210,9 @@ func NewCollector(logger log.Logger) (*Collector, error) { cpuTotalDesc: cpuTotalDesc, unitCPUTotal: unitCPUTotal, unitMemCache: unitMemCache, + unitMemRss: unitMemRss, + unitMemDirty: unitMemDirty, + unitMemShmem: unitMemShmem, openFDs: openFDs, maxFDs: maxFDs, vsize: vsize, @@ -288,7 +310,7 @@ func (c *Collector) collectUnit(conn *dbus.Conn, ch chan<- prometheus.Metric, un // Collect metrics from cgroups switch parseUnitType(unit) { - case "service", "mount","socket", "swap", "slice": + case "service", "mount", "socket", "swap", "slice": cgroupPath, err := c.getControlGroup(conn, unit) if err != nil { logger.Warnf(errUnitMetricsMsg, err) @@ -599,21 +621,25 @@ func (c *Collector) collectUnitMemMetrics(cgSubpath string, conn *dbus.Conn, ch // will have a memory.stat file, but systemd will still report MemoryAccounting=false for most units memStat, err := cgroup.NewMemStat(cgSubpath) if err != nil { - // Unable to open the file if perr, ok := err.(*os.PathError); ok && perr.Op == "open" { return nil } - - // if unitType == "Socket" { - // log.Debugf("unable to read SocketUnit CPU accounting information (unit=%s)", unit.Name) - // return nil - // } return errors.Wrapf(err, errControlGroupReadMsg, "Memory stat") } + unitType := parseUnitType(unit) ch <- prometheus.MustNewConstMetric( c.unitMemCache, prometheus.GaugeValue, - float64(memStat.Cache), unit.Name, parseUnitType(unit)) + float64(memStat.Cache), unit.Name, unitType) + ch <- prometheus.MustNewConstMetric( + c.unitMemRss, prometheus.GaugeValue, + float64(memStat.Rss), unit.Name, unitType) + ch <- prometheus.MustNewConstMetric( + c.unitMemDirty, prometheus.GaugeValue, + float64(memStat.Dirty), unit.Name, unitType) + ch <- prometheus.MustNewConstMetric( + c.unitMemShmem, prometheus.GaugeValue, + float64(memStat.Shmem), unit.Name, unitType) return nil } From 6dff4304835981d5614d2f109bf24cbd415ec0af Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 3 Feb 2020 01:22:00 -0500 Subject: [PATCH 53/57] Ensure fields have units declared --- cgroup/memory.go | 111 +++++++++++++++++++++--------------------- cgroup/memory_test.go | 72 +++++++++++++-------------- systemd/systemd.go | 8 +-- 3 files changed, 95 insertions(+), 96 deletions(-) diff --git a/cgroup/memory.go b/cgroup/memory.go index 6b60114..65b058e 100644 --- a/cgroup/memory.go +++ b/cgroup/memory.go @@ -14,22 +14,22 @@ import ( // memory.stat file type MemStat struct { // bytes of page cache memory - Cache uint64 + CacheBytes uint64 // bytes of anon and swap cache, including transparent hugepages. // Note: Only anonymous and swap cache memory is listed as part of 'rss' stat. // This should not be confused with the true 'resident set size' or the // amount of physical memory used by the cgroup. 'rss + file_mapped" will // give you resident set size of cgroup - Rss uint64 + RssBytes uint64 // bytes of anonymous transparent hugepages - RssHuge uint64 + RssHugeBytes uint64 // No kernel documentation Shmem uint64 // bytes of mapped files (includes tmpfs/shmem) - MappedFile uint64 + MappedFileBytes uint64 // number of charging events to the memory cgroup. The charging // event happens each time a page is accounted as either mapped - // anon page(RSS) or cache page(Page Cache) to the cgroup. + // anon page(RSS) or cache page(Page CacheBytes) to the cgroup. PgPgIn uint64 // # of uncharging events to the memory cgroup. The uncharging // event happens each time a page is unaccounted from the cgroup. @@ -39,42 +39,42 @@ type MemStat struct { // no kernel documentation PgMajFault uint64 // # of bytes of swap usage - Swap uint64 + SwapBytes uint64 // # of bytes that are waiting to get written back to the disk. - Dirty uint64 + DirtyBytes uint64 // writeback - # of bytes of file/anon cache that are queued for syncing to // disk. - Writeback uint64 + WritebackBytes uint64 // inactive_anon - # of bytes of anonymous and swap cache memory on inactive // LRU list. - InactiveAnon uint64 + InactiveAnonBytes uint64 // active_anon - # of bytes of anonymous and swap cache memory on active // LRU list. - ActiveAnon uint64 + ActiveAnonBytes uint64 // inactive_file - # of bytes of file-backed memory on inactive LRU list. - InactiveFile uint64 + InactiveFileBytes uint64 // active_file - # of bytes of file-backed memory on active LRU list. - ActiveFile uint64 + ActiveFileBytes uint64 // unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). - Unevictable uint64 + UnevictableBytes uint64 // status considering hierarchy (see memory.use_hierarchy settings) // # of bytes of memory limit with regard to hierarchy // under which the memory cgroup is - HierarchialMemoryLimit uint64 + HierarchialMemoryLimitBytes uint64 // # of bytes of memory+swap limit with regard to // hierarchy under which memory cgroup is. - HierarchialMemswLimit uint64 + HierarchialMemswLimitBytes uint64 // total_cache - sum of all children's "cache" - TotalCache uint64 + TotalCacheBytes uint64 // No kernel doc - TotalDirty uint64 + TotalDirtyBytes uint64 // total_rss - sum of all children's "rss" - TotalRss uint64 + TotalRssBytes uint64 // No kernel docs - TotalRssHuge uint64 + TotalRssHugeBytes uint64 // total_mapped_file - sum of all children's "cache" - TotalMappedFile uint64 + TotalMappedFileBytes uint64 // No kernel docs TotalPgFault uint64 // No kernel docs @@ -84,28 +84,27 @@ type MemStat struct { // total_pgpgout - sum of all children's "pgpgout" TotalPgPgOut uint64 // No kernel doc - TotalShmem uint64 + TotalShmemBytes uint64 // total_swap - sum of all children's "swap" - TotalSwap uint64 + TotalSwapBytes uint64 // total_inactive_anon - sum of all children's "inactive_anon" - TotalInactiveAnon uint64 + TotalInactiveAnonBytes uint64 // total_active_anon - sum of all children's "active_anon" - TotalActiveAnon uint64 + TotalActiveAnonBytes uint64 // total_inactive_file - sum of all children's "inactive_file" - TotalInactiveFile uint64 + TotalInactiveFileBytes uint64 // total_active_file - sum of all children's "active_file" - TotalActiveFile uint64 + TotalActiveFileBytes uint64 // total_unevictable - sum of all children's "unevictable" - TotalUnevictable uint64 + TotalUnevictableBytes uint64 // No kernel doc - TotalWriteback uint64 + TotalWritebackBytes uint64 // # The following additional stats are dependent on CONFIG_DEBUG_VM. // inactive_ratio - VM internal parameter. (see mm/page_alloc.c) // recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) // recent_rotated_file - VM internal parameter. (see mm/vmscan.c) // recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) // recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - } func parseMemStat(r io.Reader) (*MemStat, error) { @@ -125,21 +124,21 @@ func parseMemStat(r io.Reader) (*MemStat, error) { switch fields[0] { case "cache": - m.Cache = v + m.CacheBytes = v case "rss": - m.Rss = v + m.RssBytes = v case "rss_huge": - m.RssHuge = v + m.RssHugeBytes = v case "shmem": m.Shmem = v case "mapped_file": - m.MappedFile = v + m.MappedFileBytes = v case "dirty": - m.Dirty = v + m.DirtyBytes = v case "writeback": - m.Writeback = v + m.WritebackBytes = v case "swap": - m.Swap = v + m.SwapBytes = v case "pgpgin": m.PgPgIn = v case "pgpgout": @@ -149,35 +148,35 @@ func parseMemStat(r io.Reader) (*MemStat, error) { case "pgmajfault": m.PgMajFault = v case "inactive_anon": - m.InactiveAnon = v + m.InactiveAnonBytes = v case "active_anon": - m.ActiveAnon = v + m.ActiveAnonBytes = v case "inactive_file": - m.InactiveFile = v + m.InactiveFileBytes = v case "active_file": - m.ActiveFile = v + m.ActiveFileBytes = v case "unevictable": - m.Unevictable = v + m.UnevictableBytes = v case "hierarchical_memory_limit": - m.HierarchialMemoryLimit = v + m.HierarchialMemoryLimitBytes = v case "hierarchical_memsw_limit": - m.HierarchialMemswLimit = v + m.HierarchialMemswLimitBytes = v case "total_cache": - m.TotalCache = v + m.TotalCacheBytes = v case "total_rss": - m.TotalRss = v + m.TotalRssBytes = v case "total_rss_huge": - m.TotalRssHuge = v + m.TotalRssHugeBytes = v case "total_shmem": - m.TotalShmem = v + m.TotalShmemBytes = v case "total_mapped_file": - m.TotalMappedFile = v + m.TotalMappedFileBytes = v case "total_dirty": - m.TotalDirty = v + m.TotalDirtyBytes = v case "total_writeback": - m.TotalWriteback = v + m.TotalWritebackBytes = v case "total_swap": - m.TotalSwap = v + m.TotalSwapBytes = v case "total_pgpgin": m.TotalPgPgIn = v case "total_pgpgout": @@ -187,15 +186,15 @@ func parseMemStat(r io.Reader) (*MemStat, error) { case "total_pgmajfault": m.TotalPgMajFault = v case "total_inactive_anon": - m.TotalInactiveAnon = v + m.TotalInactiveAnonBytes = v case "total_inactive_file": - m.TotalInactiveFile = v + m.TotalInactiveFileBytes = v case "total_active_anon": - m.TotalActiveAnon = v + m.TotalActiveAnonBytes = v case "total_active_file": - m.TotalActiveFile = v + m.TotalActiveFileBytes = v case "total_unevictable": - m.TotalUnevictable = v + m.TotalUnevictableBytes = v } } diff --git a/cgroup/memory_test.go b/cgroup/memory_test.go index 00eb13b..192f450 100644 --- a/cgroup/memory_test.go +++ b/cgroup/memory_test.go @@ -7,43 +7,43 @@ import ( func TestMemStat(t *testing.T) { expected := MemStat{ - Cache: 69984256, - Rss: 4866048, - RssHuge: 0, - Shmem: 491520, - MappedFile: 9818112, - Dirty: 8192, - Writeback: 0, - Swap: 0, - PgPgIn: 397887, - PgPgOut: 379613, - PgFault: 541883, - PgMajFault: 232, - InactiveAnon: 4096, - ActiveAnon: 5353472, - InactiveFile: 2621440, - ActiveFile: 63873024, - Unevictable: 2998272, + CacheBytes: 69984256, + RssBytes: 4866048, + RssHugeBytes: 0, + Shmem: 491520, + MappedFileBytes: 9818112, + DirtyBytes: 8192, + WritebackBytes: 0, + SwapBytes: 0, + PgPgIn: 397887, + PgPgOut: 379613, + PgFault: 541883, + PgMajFault: 232, + InactiveAnonBytes: 4096, + ActiveAnonBytes: 5353472, + InactiveFileBytes: 2621440, + ActiveFileBytes: 63873024, + UnevictableBytes: 2998272, - HierarchialMemoryLimit: 9223372036854771712, - HierarchialMemswLimit: 9223372036854771712, - TotalCache: 12469047296, - TotalRss: 2168885248, - TotalRssHuge: 10485760, - TotalShmem: 13168640, - TotalMappedFile: 228769792, - TotalDirty: 573440, - TotalWriteback: 0, - TotalSwap: 0, - TotalPgPgIn: 135633232, - TotalPgPgOut: 132074848, - TotalPgFault: 96879883, - TotalPgMajFault: 24509, - TotalInactiveAnon: 11632640, - TotalActiveAnon: 2134667264, - TotalInactiveFile: 9267785728, - TotalActiveFile: 3208708096, - TotalUnevictable: 15052800} + HierarchialMemoryLimitBytes: 9223372036854771712, + HierarchialMemswLimitBytes: 9223372036854771712, + TotalCacheBytes: 12469047296, + TotalRssBytes: 2168885248, + TotalRssHugeBytes: 10485760, + TotalShmemBytes: 13168640, + TotalMappedFileBytes: 228769792, + TotalDirtyBytes: 573440, + TotalWritebackBytes: 0, + TotalSwapBytes: 0, + TotalPgPgIn: 135633232, + TotalPgPgOut: 132074848, + TotalPgFault: 96879883, + TotalPgMajFault: 24509, + TotalInactiveAnonBytes: 11632640, + TotalActiveAnonBytes: 2134667264, + TotalInactiveFileBytes: 9267785728, + TotalActiveFileBytes: 3208708096, + TotalUnevictableBytes: 15052800} have, err := getHybridFixtures(t).NewMemStat("/") if err != nil { diff --git a/systemd/systemd.go b/systemd/systemd.go index 0212131..61bca21 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -145,7 +145,7 @@ func NewCollector(logger log.Logger) (*Collector, error) { unitMemCache := prometheus.NewDesc( prometheus.BuildFQName(namespace, "", "unit_cached_bytes"), - "Unit Page Cache", + "Unit Page CacheBytes", []string{"name", "type"}, nil, ) unitMemRss := prometheus.NewDesc( @@ -630,13 +630,13 @@ func (c *Collector) collectUnitMemMetrics(cgSubpath string, conn *dbus.Conn, ch unitType := parseUnitType(unit) ch <- prometheus.MustNewConstMetric( c.unitMemCache, prometheus.GaugeValue, - float64(memStat.Cache), unit.Name, unitType) + float64(memStat.CacheBytes), unit.Name, unitType) ch <- prometheus.MustNewConstMetric( c.unitMemRss, prometheus.GaugeValue, - float64(memStat.Rss), unit.Name, unitType) + float64(memStat.RssBytes), unit.Name, unitType) ch <- prometheus.MustNewConstMetric( c.unitMemDirty, prometheus.GaugeValue, - float64(memStat.Dirty), unit.Name, unitType) + float64(memStat.DirtyBytes), unit.Name, unitType) ch <- prometheus.MustNewConstMetric( c.unitMemShmem, prometheus.GaugeValue, float64(memStat.Shmem), unit.Name, unitType) From 8b7bd769b2892818ed698d505ddc5c81436ba686 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Wed, 5 Feb 2020 00:15:16 -0500 Subject: [PATCH 54/57] Add tests for complex 'systemd mount mode' logic This exposes the critical statfs call used inside cgUnifiedCache as a settable variable. Using this variable, we can now write test tables that emulate common mounting scenarios. This achieves 100% coverage within the tricky function, so we should now be able to rewrite this function in canonical go with much less concern that we've borked it up. Also caused me to notice+fix some bugs in the cgUnifiedCached method - we were using file paths when we should have been explicitly using statfs on directory paths --- cgroup/cgroup.go | 19 ++++--- cgroup/cgroup_test.go | 112 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 7 deletions(-) diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go index a0addc0..85e45e6 100644 --- a/cgroup/cgroup.go +++ b/cgroup/cgroup.go @@ -83,6 +83,10 @@ const ( // a.k.a. "unified hierarchy" unifModeAll cgUnifiedMountMode = iota ) +func (c cgUnifiedMountMode) String() string { + return [...]string{"unknown", "none", "systemd", "all"}[c] +} + // Values copied from https://github.com/torvalds/linux/blob/master/include/uapi/linux/magic.h const ( @@ -99,31 +103,32 @@ const ( // to track this // WARNING: We cache this data once at process start. Systemd updates // may require restarting systemd-exporter -// Equivalent to systemd cgUnifiedCached method +// Equivalent to systemd cgroup-util.c#cg_unified_cached +var statfsFunc = unix.Statfs func cgUnifiedCached() (cgUnifiedMountMode, error) { // if cgroupUnified != unifModeUnknown { // return cgroupUnified, nil // } var fs unix.Statfs_t - err := unix.Statfs("/sys/fs/cgroup/", &fs) + err := statfsFunc("/sys/fs/cgroup/", &fs) if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup)") + return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/)") } switch fs.Type { case cgroup2SuperMagic: - log.Debugf("Found cgroup2 on /sys/fs/cgroup, full unified hierarchy") + log.Debugf("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy") return unifModeAll, nil case tmpFsMagic: - err := unix.Statfs("/sys/fs/cgroup/unified", &fs) + err := statfsFunc("/sys/fs/cgroup/unified/", &fs) // Ignore err, we expect path to be missing on v232 if err == nil && fs.Type == cgroup2SuperMagic { - log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller") + log.Debugf("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller") return unifModeSystemd, nil } else { - err := unix.Statfs("/sys/fs/cgroup/systemd", &fs) + err := statfsFunc("/sys/fs/cgroup/systemd/", &fs) if err != nil { return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") } diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go index 94d040c..5836356 100644 --- a/cgroup/cgroup_test.go +++ b/cgroup/cgroup_test.go @@ -1,6 +1,8 @@ package cgroup import ( + "errors" + "golang.org/x/sys/unix" "os" "testing" ) @@ -24,6 +26,116 @@ func TestMountModeParsing(t *testing.T) { } } + +func TestCgUnifiedCached(t *testing.T) { + // Build some functions we will use to simulate various cgroup mounting scenarios + noCgroupMount := func(path string, stat *unix.Statfs_t) error { + // No fs present on /sys/fs/cgroup/ + return errors.New("boo") + } + unknownCgroupMount := func(path string, stat *unix.Statfs_t) error { + // Unknown fs type present on /sys/fs/cgroup/ + stat.Type = 0x0 + return nil + } + unifiedMount := func(path string, stat *unix.Statfs_t) error { + // unified fs present + switch path { + case "/sys/fs/cgroup/": + stat.Type = cgroup2SuperMagic + return nil + default: + return errors.New("pretend path not found") + } + } + hybridMountSystemdV232 := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/systemd/": + stat.Type = cgroup2SuperMagic + } + return nil + } + hybridMountSystemdV233 := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + stat.Type = cgroup2SuperMagic + case "/sys/fs/cgroup/systemd/": + stat.Type = cgroupSuperMagic + } + return nil + } + legacyMount := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + return errors.New("pretend unified path not found") + case "/sys/fs/cgroup/systemd/": + stat.Type = cgroupSuperMagic + } + return nil + } + missingSystemdFolder := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + return errors.New("pretend unified path not found") + case "/sys/fs/cgroup/systemd/": + return errors.New("pretend we cannot stat systemd dir") + } + return nil + } + unknownSystemdFolderMountType := func(path string, stat *unix.Statfs_t) error { + switch path { + case "/sys/fs/cgroup/": + stat.Type = tmpFsMagic + case "/sys/fs/cgroup/unified/": + return errors.New("pretend unified path not found") + case "/sys/fs/cgroup/systemd/": + stat.Type = 0x0 + } + return nil + } + + tables := []struct { + name string + statFn func(string,*unix.Statfs_t) error + expectedMode cgUnifiedMountMode + errExpected bool + }{ + {"NoCgroupMount", noCgroupMount, unifModeUnknown, true}, + {"UnknownCgroupMountType", unknownCgroupMount, unifModeUnknown, true}, + {"LegacyMount", legacyMount, unifModeNone, false}, + {"HybridMount, v232", hybridMountSystemdV232, unifModeSystemd, false}, + {"HybridMount, v233+", hybridMountSystemdV233, unifModeSystemd, false}, + {"MissingSystemdFolder", missingSystemdFolder, unifModeUnknown, true}, + {"UnknownSystemdFolderType", unknownSystemdFolderMountType, unifModeUnknown, true}, + {"UnifiedMount", unifiedMount, unifModeAll, false}, + } + + for _, table := range tables { + statfsFunc = table.statFn + mode, err := cgUnifiedCached() + if table.errExpected && err == nil { + t.Errorf("%s: expected an err, but got mode %s with no error", table.name, mode) + } + if !table.errExpected && err != nil { + t.Errorf("%s: expected no error, but got mode %s with err: %s", table.name, mode, err) + } + if mode != table.expectedMode { + t.Errorf("%s: expected mode %s but got mode %s", table.name, table.expectedMode, mode) + } + } + + + +} + func TestNewFS(t *testing.T) { if _, err := newFS("foobar", unifModeUnknown); err == nil { t.Error("newFS should have failed with non-existing path") From 4447073d687c78ec1867fbd3e98ccf8c6e4313d9 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Wed, 5 Feb 2020 00:15:40 -0500 Subject: [PATCH 55/57] Fix linting issues --- cgroup/memory.go | 7 +++---- main_test.go | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cgroup/memory.go b/cgroup/memory.go index 65b058e..506e392 100644 --- a/cgroup/memory.go +++ b/cgroup/memory.go @@ -10,8 +10,8 @@ import ( "strings" ) -// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt -// memory.stat file +// MemStat represents the memory.stat file exported by the kernel when the memory cgroup controller is enabled. +// See https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt type MemStat struct { // bytes of page cache memory CacheBytes uint64 @@ -211,8 +211,7 @@ func NewMemStat(cgSubpath string) (MemStat, error) { return fs.NewMemStat(cgSubpath) } -// MemStat returns an information about cgroup memory statistics. -// See +// NewMemStat returns an information about cgroup memory statistics. func (fs FS) NewMemStat(cgSubpath string) (MemStat, error) { cgPath, err := fs.cgGetPath("memory", cgSubpath, "memory.stat") if err != nil { diff --git a/main_test.go b/main_test.go index 45f0320..67241cc 100644 --- a/main_test.go +++ b/main_test.go @@ -67,7 +67,7 @@ func runServerAndTest(args []string, url string, fn func() error) error { fmt.Println("Waiting on test server startup...") for i := 0; i < 10; i++ { root := fmt.Sprintf("http://%s/", address) - if resp, err := getUrl(root); err == nil && resp.StatusCode == http.StatusOK { + if resp, err := getURL(root); err == nil && resp.StatusCode == http.StatusOK { break } time.Sleep(10 * time.Millisecond) @@ -98,10 +98,10 @@ func runServerAndTest(args []string, url string, fn func() error) error { } func getMetrics() (*http.Response, error) { - return getUrl(fmt.Sprintf("http://%s/metrics", address)) + return getURL(fmt.Sprintf("http://%s/metrics", address)) } -func getUrl(url string) (*http.Response, error) { +func getURL(url string) (*http.Response, error) { resp, err := http.Get(url) if err != nil { return nil, err From 6884b2a4642109c7a48351e23817203033cdbddf Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Wed, 5 Feb 2020 00:31:07 -0500 Subject: [PATCH 56/57] Accept linting suggestion --- cgroup/cgroup.go | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go index 85e45e6..82a6abd 100644 --- a/cgroup/cgroup.go +++ b/cgroup/cgroup.go @@ -127,22 +127,24 @@ func cgUnifiedCached() (cgUnifiedMountMode, error) { if err == nil && fs.Type == cgroup2SuperMagic { log.Debugf("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller") return unifModeSystemd, nil - } else { - err := statfsFunc("/sys/fs/cgroup/systemd/", &fs) - if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") - } - switch fs.Type { - case cgroup2SuperMagic: - log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") - return unifModeSystemd, nil - case cgroupSuperMagic: - log.Debugf("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") - return unifModeNone, nil - default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) - } } + + err = statfsFunc("/sys/fs/cgroup/systemd/", &fs) + if err != nil { + return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") + } + + switch fs.Type { + case cgroup2SuperMagic: + log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") + return unifModeSystemd, nil + case cgroupSuperMagic: + log.Debugf("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") + return unifModeNone, nil + default: + return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) + } + default: return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) } From 7eec03e30e5f06d28b168a0f1f51e2ae30443102 Mon Sep 17 00:00:00 2001 From: Hamilton Turner Date: Mon, 24 Feb 2020 13:16:16 -0500 Subject: [PATCH 57/57] cgroup: Export multiple new symbols Exports the ability to manually create a new cgroup filesystem struct, as well as the various mount modes. Cleans up documentation --- cgroup/cgroup.go | 87 +++++++++++++++++++++---------------------- cgroup/cgroup_test.go | 39 +++++++++---------- 2 files changed, 60 insertions(+), 66 deletions(-) diff --git a/cgroup/cgroup.go b/cgroup/cgroup.go index 82a6abd..31945b9 100644 --- a/cgroup/cgroup.go +++ b/cgroup/cgroup.go @@ -16,7 +16,7 @@ type FS struct { // WARNING: We only read this data once at process start, systemd updates // may require restarting systemd-exporter - cgroupUnified cgUnifiedMountMode + cgroupUnified MountMode } // DefaultMountPoint is the common mount point of the cgroupfs filesystem @@ -28,16 +28,16 @@ const DefaultMountPoint = "/sys/fs/cgroup" func NewDefaultFS() (FS, error) { mode, err := cgUnifiedCached() - if err != nil || mode == unifModeUnknown { + if err != nil || mode == MountModeUnknown { return FS{}, fmt.Errorf("could not determine cgroupfs mount mode: %s", err) } - return newFS(DefaultMountPoint, mode) + return NewFS(DefaultMountPoint, mode) } -// newFS returns a new cgroup FS mounted under the given mountPoint. It does not check +// NewFS returns a new cgroup FS mounted under the given mountPoint. It does not check // the provided mount mode -func newFS(mountPoint string, mountMode cgUnifiedMountMode) (FS, error) { +func NewFS(mountPoint string, mountMode MountMode) (FS, error) { info, err := os.Stat(mountPoint) if err != nil { return FS{}, fmt.Errorf("could not read %s: %s", mountPoint, err) @@ -54,36 +54,33 @@ func (fs FS) path(p ...string) string { return filepath.Join(append([]string{string(fs.mountPoint)}, p...)...) } -// cgUnifiedMountMode constant values describe how cgroup filesystems (aka hierarchies) are -// mounted underneath /sys/fs/cgroup. In cgroups-v1 there are many mounts, -// one per controller (cpu, blkio, etc) and one for systemd itself. In -// cgroups-v2 there is only one mount managed entirely by systemd and -// internally exposing all controller syscalls. As kernel+distros migrate towards -// cgroups-v2, systemd has a hybrid mode where it mounts v2 and uses -// that for process management but also mounts all the v1 filesystem -// hierarchies and uses them for resource accounting and control -type cgUnifiedMountMode int8 +// MountMode constants describe how the kernel has mounted various cgroup filesystems under /sys/fs/cgroup. +// Generally speaking, kernels using the cgroups-v1 API will have many cgroup controller hierarchies, each with +// their own fs and their own mount point. Kernels using cgroups-v2 API will only have the one unified hierarchy. +// To support back compatibility, kernels often mount both the v1 and v2 hierarchies at different points. Systemd +// has to know where the hierarchies are, so it inspects the mounts under /sys/fs/cgroup and decides what +// MountMode this kernel is using. See each constant for a description of that mode. This type corresponds to +// the unified_cache variable in systemd/src/basic/cgroup-util.c +type MountMode int8 const ( - // unifModeUnknown indicates that we do not know if/how any - // cgroup filesystems are mounted underneath /sys/fs/cgroup - unifModeUnknown cgUnifiedMountMode = iota - // unifModeNone indicates that both systemd and the controllers - // are using v1 legacy mounts and there is no usage of the v2 - // unified hierarchy. a.k.a "legacy hierarchy" - unifModeNone cgUnifiedMountMode = iota - // unifModeSystemd indicates that systemd is using a v2 unified - // hierarcy for organizing processes into control groups, but all - // controller interaction is using v1 per-controller hierarchies. - // a.k.a. "hybrid hierarchy" - unifModeSystemd cgUnifiedMountMode = iota - // unifModeAll indicates that v2 API is in full usage and there - // are no v1 hierarchies exported. Programs (mainly container orchestrators - // such as docker,runc,etc) that rely on v1 APIs will be broken. - // a.k.a. "unified hierarchy" - unifModeAll cgUnifiedMountMode = iota + // MountModeUnknown indicates we do not recognize the mount pattern of the cgroup filesystems in /sys/fs/cgroup. + // systemd source calls this mode CGROUP_UNIFIED_UNKNOWN + MountModeUnknown MountMode = iota + // MountModeLegacy indicates both systemd and individual cgroups are using cgroup-v1 hierarchies. There is + // typically one mount point per hierarchy, and no usage of the cgroup-v2 unified hierarchy. + // systemd source calls this mode CGROUP_UNIFIED_NONE + MountModeLegacy MountMode = iota + // MountModeHybrid indicates the systemd controller is using cgroup-v2 unified hierarchy for organizing + // processes, but all other cgroups are using cgroup-v1 legacy hierarchies. + // systemd source calls this CGROUP_UNIFIED_SYSTEMD and also stores the unified_systemd_v232 flag + MountModeHybrid MountMode = iota + // MountModeUnified indicates cgroup-v2 API is in full usage and there are no cgroup-v1 hierarchies mounted. + // Non-updated programs (e.g. container orchestrators such as docker/runc) that rely on cgroup-v1 mounts will break. + // systemd source calls this CGROUP_UNIFIED_ALL + MountModeUnified MountMode = iota ) -func (c cgUnifiedMountMode) String() string { +func (c MountMode) String() string { return [...]string{"unknown", "none", "systemd", "all"}[c] } @@ -105,48 +102,48 @@ const ( // may require restarting systemd-exporter // Equivalent to systemd cgroup-util.c#cg_unified_cached var statfsFunc = unix.Statfs -func cgUnifiedCached() (cgUnifiedMountMode, error) { - // if cgroupUnified != unifModeUnknown { +func cgUnifiedCached() (MountMode, error) { + // if cgroupUnified != MountModeUnknown { // return cgroupUnified, nil // } var fs unix.Statfs_t err := statfsFunc("/sys/fs/cgroup/", &fs) if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/)") + return MountModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup)") } switch fs.Type { case cgroup2SuperMagic: log.Debugf("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy") - return unifModeAll, nil + return MountModeUnified, nil case tmpFsMagic: err := statfsFunc("/sys/fs/cgroup/unified/", &fs) // Ignore err, we expect path to be missing on v232 if err == nil && fs.Type == cgroup2SuperMagic { log.Debugf("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller") - return unifModeSystemd, nil + return MountModeHybrid, nil } err = statfsFunc("/sys/fs/cgroup/systemd/", &fs) if err != nil { - return unifModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") + return MountModeUnknown, errors.Wrapf(err, "failed statfs(/sys/fs/cgroup/systemd)") } switch fs.Type { case cgroup2SuperMagic: log.Debugf("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)") - return unifModeSystemd, nil + return MountModeHybrid, nil case cgroupSuperMagic: log.Debugf("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy") - return unifModeNone, nil + return MountModeLegacy, nil default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) + return MountModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup/systemd)", fs.Type) } default: - return unifModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) + return MountModeUnknown, errors.Errorf("unknown magic number %x for fstype returned by statfs(/sys/fs/cgroup)", fs.Type) } } @@ -157,7 +154,7 @@ func (fs FS) cgGetPath(controller string, subpath string, suffix string) (string // relevant systemd source code in cgroup-util.[h|c] specifically cg_get_path // 2. Joins controller name with base path - if fs.cgroupUnified == unifModeUnknown { + if fs.cgroupUnified == MountModeUnknown { return "", errors.Errorf("Cannot determine path with unknown mounting hierarchy") } @@ -167,9 +164,9 @@ func (fs FS) cgGetPath(controller string, subpath string, suffix string) (string joined := "" switch fs.cgroupUnified { - case unifModeNone, unifModeSystemd: + case MountModeLegacy, MountModeHybrid: joined = fs.path(dn, subpath, suffix) - case unifModeAll: + case MountModeUnified: joined = fs.path(subpath, suffix) default: return "", errors.Errorf("unknown cgroup mount mode (e.g. unified mode) %d", fs.cgroupUnified) diff --git a/cgroup/cgroup_test.go b/cgroup/cgroup_test.go index 5836356..9c76d4a 100644 --- a/cgroup/cgroup_test.go +++ b/cgroup/cgroup_test.go @@ -105,17 +105,17 @@ func TestCgUnifiedCached(t *testing.T) { tables := []struct { name string statFn func(string,*unix.Statfs_t) error - expectedMode cgUnifiedMountMode + expectedMode MountMode errExpected bool }{ - {"NoCgroupMount", noCgroupMount, unifModeUnknown, true}, - {"UnknownCgroupMountType", unknownCgroupMount, unifModeUnknown, true}, - {"LegacyMount", legacyMount, unifModeNone, false}, - {"HybridMount, v232", hybridMountSystemdV232, unifModeSystemd, false}, - {"HybridMount, v233+", hybridMountSystemdV233, unifModeSystemd, false}, - {"MissingSystemdFolder", missingSystemdFolder, unifModeUnknown, true}, - {"UnknownSystemdFolderType", unknownSystemdFolderMountType, unifModeUnknown, true}, - {"UnifiedMount", unifiedMount, unifModeAll, false}, + {"NoCgroupMount", noCgroupMount, MountModeUnknown, true}, + {"UnknownCgroupMountType", unknownCgroupMount, MountModeUnknown, true}, + {"LegacyMount", legacyMount, MountModeLegacy, false}, + {"HybridMount, v232", hybridMountSystemdV232, MountModeHybrid, false}, + {"HybridMount, v233+", hybridMountSystemdV233, MountModeHybrid, false}, + {"MissingSystemdFolder", missingSystemdFolder, MountModeUnknown, true}, + {"UnknownSystemdFolderType", unknownSystemdFolderMountType, MountModeUnknown, true}, + {"UnifiedMount", unifiedMount, MountModeUnified, false}, } for _, table := range tables { @@ -131,27 +131,24 @@ func TestCgUnifiedCached(t *testing.T) { t.Errorf("%s: expected mode %s but got mode %s", table.name, table.expectedMode, mode) } } - - - } func TestNewFS(t *testing.T) { - if _, err := newFS("foobar", unifModeUnknown); err == nil { - t.Error("newFS should have failed with non-existing path") + if _, err := NewFS("foobar", MountModeUnknown); err == nil { + t.Error("NewFS should have failed with non-existing path") } - if _, err := newFS("cgroups_test.go", unifModeUnknown); err == nil { - t.Error("want newFS to fail if mount point is not a dir") + if _, err := NewFS("cgroups_test.go", MountModeUnknown); err == nil { + t.Error("want NewFS to fail if mount point is not a dir") } - if _, err := newFS(testFixturesHybrid, unifModeUnknown); err != nil { - t.Error("want newFS to succeed if mount point exists") + if _, err := NewFS(testFixturesHybrid, MountModeUnknown); err != nil { + t.Error("want NewFS to succeed if mount point exists") } } func getHybridFixtures(t *testing.T) FS { - fs, err := newFS(testFixturesHybrid, unifModeSystemd) + fs, err := NewFS(testFixturesHybrid, MountModeHybrid) if err != nil { t.Fatal("Unable to create hybrid text fixtures") } @@ -161,11 +158,11 @@ func getHybridFixtures(t *testing.T) FS { func TestCgSubpath(t *testing.T) { fs := getHybridFixtures(t) - fs.cgroupUnified = unifModeUnknown + fs.cgroupUnified = MountModeUnknown if _, err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all"); err == nil { t.Error("should not be able to determine path with unknown mount mode") } - fs.cgroupUnified = unifModeSystemd + fs.cgroupUnified = MountModeHybrid path, err := fs.cgGetPath("cpu", "/system.slice", "cpuacct.usage_all") if err != nil { t.Error("should be able to determine path with systemd mount mode")