Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize network metrics collection #3302

Merged
merged 1 commit into from
Jun 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions container/common/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,3 +437,20 @@ func (m deviceIdentifierMap) Find(major, minor uint64, namer DeviceNamer) string
m[d] = s
return s
}

// RemoveNetMetrics is used to remove any network metrics from the given MetricSet.
// It returns the original set as is if remove is false, or if there are no metrics
// to remove.
func RemoveNetMetrics(metrics container.MetricSet, remove bool) container.MetricSet {
if !remove {
return metrics
}

// Check if there is anything we can remove, to avoid useless copying.
if !metrics.HasAny(container.AllNetworkMetrics) {
return metrics
}

// A copy of all metrics except for network ones.
return metrics.Difference(container.AllNetworkMetrics)
}
54 changes: 54 additions & 0 deletions container/common/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@ package common

import (
"errors"
"fmt"
"math"
"os"
"path/filepath"
"reflect"
"testing"

"github.com/stretchr/testify/assert"

"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"
v2 "github.com/google/cadvisor/info/v2"
)
Expand Down Expand Up @@ -193,3 +196,54 @@ func TestGetSpecCgroupV2Max(t *testing.T) {

assert.EqualValues(t, spec.Processes.Limit, max)
}

func TestRemoveNetMetrics(t *testing.T) {
for _, ts := range []struct {
desc string
in, out container.MetricSet
}{
{
desc: "nil set",
in: nil,
},
{
desc: "empty set",
in: container.MetricSet{},
},
{
desc: "nothing to remove",
in: container.MetricSet{container.MemoryUsageMetrics: struct{}{}, container.PerfMetrics: struct{}{}},
},
{
desc: "also nothing to remove",
in: container.AllMetrics.Difference(container.AllNetworkMetrics),
},
{
desc: "remove net from all",
in: container.AllMetrics,
out: container.AllMetrics.Difference(container.AllNetworkMetrics),
},
{
desc: "remove net from some",
in: container.MetricSet{container.MemoryUsageMetrics: struct{}{}, container.NetworkTcpUsageMetrics: struct{}{}},
out: container.MetricSet{container.MemoryUsageMetrics: struct{}{}},
},
} {
for _, remove := range []bool{true, false} {
ts, remove := ts, remove
desc := fmt.Sprintf("%s, remove: %v", ts.desc, remove)
t.Run(desc, func(t *testing.T) {
out := RemoveNetMetrics(ts.in, remove)
if !remove || ts.out == nil {
// Compare the actual underlying pointers. Can't use assert.Same
// because it checks for pointer type, and these are maps.
if reflect.ValueOf(ts.in) != reflect.ValueOf(out) {
t.Errorf("expected original map %p, got %p", ts.in, out)
}
} else {
assert.Equal(t, ts.out, out)
}
})
}
}
}
32 changes: 11 additions & 21 deletions container/containerd/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,22 @@ func newContainerdContainerHandler(
Aliases: []string{id, name},
}

libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootfs, int(taskPid), includedMetrics)
// Containers that don't have their own network -- this includes
// containers running in Kubernetes pods that use the network of the
// infrastructure container -- does not need their stats to be
// reported. This stops metrics being reported multiple times for each
// container in a pod.
metrics := common.RemoveNetMetrics(includedMetrics, cntr.Labels["io.cri-containerd.kind"] != "sandbox")

libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootfs, int(taskPid), metrics)

handler := &containerdContainerHandler{
machineInfoFactory: machineInfoFactory,
cgroupPaths: cgroupPaths,
fsInfo: fsInfo,
envs: make(map[string]string),
labels: cntr.Labels,
includedMetrics: includedMetrics,
includedMetrics: metrics,
reference: containerReference,
libcontainerHandler: libcontainerHandler,
}
Expand Down Expand Up @@ -164,22 +171,12 @@ func (h *containerdContainerHandler) ContainerReference() (info.ContainerReferen
return h.reference, nil
}

func (h *containerdContainerHandler) needNet() bool {
// Since containerd does not handle networking ideally we need to return based
// on includedMetrics list. Here the assumption is the presence of cri-containerd
// label
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
//TODO change it to exported cri-containerd constants
return h.labels["io.cri-containerd.kind"] == "sandbox"
}
return false
}

func (h *containerdContainerHandler) GetSpec() (info.ContainerSpec, error) {
// TODO: Since we dont collect disk usage stats for containerd, we set hasFilesystem
// to false. Revisit when we support disk usage stats for containerd
hasFilesystem := false
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics)
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem)
spec.Labels = h.labels
spec.Envs = h.envs
spec.Image = h.image
Expand All @@ -204,13 +201,6 @@ func (h *containerdContainerHandler) GetStats() (*info.ContainerStats, error) {
if err != nil {
return stats, err
}
// Clean up stats for containers that don't have their own network - this
// includes containers running in Kubernetes pods that use the network of the
// infrastructure container. This stops metrics being reported multiple times
// for each container in a pod.
if !h.needNet() {
stats.Network = info.NetworkStats{}
}

// Get filesystem stats.
err = h.getFsStats(stats)
Expand Down
30 changes: 13 additions & 17 deletions container/crio/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,15 @@ func newCrioContainerHandler(
Namespace: CrioNamespace,
}

libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootFs, cInfo.Pid, includedMetrics)
// Find out if we need network metrics reported for this container.
// Containers that don't have their own network -- this includes
// containers running in Kubernetes pods that use the network of the
// infrastructure container -- does not need their stats to be
// reported. This stops metrics being reported multiple times for each
// container in a pod.
metrics := common.RemoveNetMetrics(includedMetrics, cInfo.Labels["io.kubernetes.container.name"] != "POD")

libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootFs, cInfo.Pid, metrics)

// TODO: extract object mother method
handler := &crioContainerHandler{
Expand All @@ -161,7 +169,7 @@ func newCrioContainerHandler(
rootfsStorageDir: rootfsStorageDir,
envs: make(map[string]string),
labels: cInfo.Labels,
includedMetrics: includedMetrics,
includedMetrics: metrics,
reference: containerReference,
libcontainerHandler: libcontainerHandler,
cgroupManager: cgroupManager,
Expand Down Expand Up @@ -210,16 +218,10 @@ func (h *crioContainerHandler) ContainerReference() (info.ContainerReference, er
return h.reference, nil
}

func (h *crioContainerHandler) needNet() bool {
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
return h.labels["io.kubernetes.container.name"] == "POD"
}
return false
}

func (h *crioContainerHandler) GetSpec() (info.ContainerSpec, error) {
hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics)
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics)
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem)

spec.Labels = h.labels
spec.Envs = h.envs
Expand Down Expand Up @@ -306,13 +308,7 @@ func (h *crioContainerHandler) GetStats() (*info.ContainerStats, error) {
return stats, err
}

if !h.needNet() {
// Clean up stats for containers that don't have their own network - this
// includes containers running in Kubernetes pods that use the network of the
// infrastructure container. This stops metrics being reported multiple times
// for each container in a pod.
stats.Network = info.NetworkStats{}
} else if len(stats.Network.Interfaces) == 0 {
if h.includedMetrics.Has(container.NetworkUsageMetrics) && len(stats.Network.Interfaces) == 0 {
// No network related information indicates that the pid of the
// container is not longer valid and we need to ask crio to
// provide the pid of another container from that pod
Expand Down
29 changes: 7 additions & 22 deletions container/docker/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ import (
"github.com/google/cadvisor/zfs"
"github.com/opencontainers/runc/libcontainer/cgroups"

dockercontainer "github.com/docker/docker/api/types/container"
docker "github.com/docker/docker/client"
"golang.org/x/net/context"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -72,9 +71,6 @@ type dockerContainerHandler struct {
// Image name used for this container.
image string

// The network mode of the container
networkMode dockercontainer.NetworkMode

// Filesystem handler.
fsHandler common.FsHandler

Expand Down Expand Up @@ -188,6 +184,9 @@ func newDockerContainerHandler(
return nil, fmt.Errorf("failed to inspect container %q: %v", id, err)
}

// Do not report network metrics for containers that share netns with another container.
metrics := common.RemoveNetMetrics(includedMetrics, ctnr.HostConfig.NetworkMode.IsContainer())

// TODO: extract object mother method
handler := &dockerContainerHandler{
machineInfoFactory: machineInfoFactory,
Expand All @@ -198,7 +197,7 @@ func newDockerContainerHandler(
rootfsStorageDir: rootfsStorageDir,
envs: make(map[string]string),
labels: ctnr.Config.Labels,
includedMetrics: includedMetrics,
includedMetrics: metrics,
zfsParent: zfsParent,
}
// Timestamp returned by Docker is in time.RFC3339Nano format.
Expand All @@ -207,7 +206,7 @@ func newDockerContainerHandler(
// This should not happen, report the error just in case
return nil, fmt.Errorf("failed to parse the create timestamp %q for container %q: %v", ctnr.Created, id, err)
}
handler.libcontainerHandler = containerlibcontainer.NewHandler(cgroupManager, rootFs, ctnr.State.Pid, includedMetrics)
handler.libcontainerHandler = containerlibcontainer.NewHandler(cgroupManager, rootFs, ctnr.State.Pid, metrics)

// Add the name and bare ID as aliases of the container.
handler.reference = info.ContainerReference{
Expand All @@ -217,7 +216,6 @@ func newDockerContainerHandler(
Namespace: DockerNamespace,
}
handler.image = ctnr.Config.Image
handler.networkMode = ctnr.HostConfig.NetworkMode
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why was this removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is because the only user of h.networkMode was func (h *dockerContainerHandler) needNet() bool.

That method is now gone, and so the field is unused.

// Only adds restartcount label if it's greater than 0
if ctnr.RestartCount > 0 {
handler.labels["restartcount"] = strconv.Itoa(ctnr.RestartCount)
Expand Down Expand Up @@ -344,16 +342,10 @@ func (h *dockerContainerHandler) ContainerReference() (info.ContainerReference,
return h.reference, nil
}

func (h *dockerContainerHandler) needNet() bool {
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
return !h.networkMode.IsContainer()
}
return false
}

func (h *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics)
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
hasNetwork := h.includedMetrics.Has(container.NetworkUsageMetrics)
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNetwork, hasFilesystem)

spec.Labels = h.labels
spec.Envs = h.envs
Expand Down Expand Up @@ -462,13 +454,6 @@ func (h *dockerContainerHandler) GetStats() (*info.ContainerStats, error) {
if err != nil {
return stats, err
}
// Clean up stats for containers that don't have their own network - this
// includes containers running in Kubernetes pods that use the network of the
// infrastructure container. This stops metrics being reported multiple times
// for each container in a pod.
if !h.needNet() {
stats.Network = info.NetworkStats{}
}

// Get filesystem stats.
err = h.getFsStats(stats)
Expand Down
17 changes: 17 additions & 0 deletions container/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ var AllMetrics = MetricSet{
OOMMetrics: struct{}{},
}

// AllNetworkMetrics represents all network metrics that cAdvisor supports.
var AllNetworkMetrics = MetricSet{
NetworkUsageMetrics: struct{}{},
NetworkTcpUsageMetrics: struct{}{},
NetworkAdvancedTcpUsageMetrics: struct{}{},
NetworkUdpUsageMetrics: struct{}{},
}

func (mk MetricKind) String() string {
return string(mk)
}
Expand All @@ -104,6 +112,15 @@ func (ms MetricSet) Has(mk MetricKind) bool {
return exists
}

func (ms MetricSet) HasAny(ms1 MetricSet) bool {
for m := range ms1 {
if _, ok := ms[m]; ok {
return true
}
}
return false
}

func (ms MetricSet) add(mk MetricKind) {
ms[mk] = struct{}{}
}
Expand Down