diff --git a/.chloggen/filter-terminated-pods.yaml b/.chloggen/filter-terminated-pods.yaml
new file mode 100755
index 000000000000..8b6a840765ea
--- /dev/null
+++ b/.chloggen/filter-terminated-pods.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: bug_fix
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: awscontainerinsightreceiver
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Filter terminated pods from node request metrics
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [27262]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
index 3edd62433d5a..bf464069a349 100644
--- a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
+++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
@@ -265,10 +265,12 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
 			p.logger.Warn(fmt.Sprintf("podKey is unavailable, refresh pod store for pod %s", pod.Name))
 			continue
 		}
-		tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
-		cpuRequest += tmpCPUReq
-		tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
-		memRequest += tmpMemReq
+		if pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed {
+			tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
+			cpuRequest += tmpCPUReq
+			tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
+			memRequest += tmpMemReq
+		}
 		if pod.Status.Phase == corev1.PodRunning {
 			podCount++
 		}
diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
index 89c91bdcb3d9..34a2d94c283f 100644
--- a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
+++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
@@ -624,6 +624,63 @@ func TestPodStore_decorateNode(t *testing.T) {
 	assert.Equal(t, int(1), metric.GetField("node_number_of_running_pods").(int))
 }
 
+func TestPodStore_decorateNode_multiplePodStates(t *testing.T) {
+	podStore := getPodStore()
+	defer func() { require.NoError(t, podStore.Shutdown()) }()
+
+	tags := map[string]string{ci.MetricType: ci.TypeNode}
+	fields := map[string]any{
+		ci.MetricName(ci.TypeNode, ci.CPUTotal):      float64(100),
+		ci.MetricName(ci.TypeNode, ci.CPULimit):      uint64(4000),
+		ci.MetricName(ci.TypeNode, ci.MemWorkingset): float64(100 * 1024 * 1024),
+		ci.MetricName(ci.TypeNode, ci.MemLimit):      uint64(400 * 1024 * 1024),
+	}
+	metric := generateMetric(fields, tags)
+
+	// terminated pods should not contribute to requests
+	failedPod := getBaseTestPodInfo()
+	failedPod.Status.Phase = corev1.PodFailed
+	succeededPod := getBaseTestPodInfo()
+	succeededPod.Status.Phase = corev1.PodSucceeded
+	podList := []corev1.Pod{*failedPod, *succeededPod}
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+
+	assert.Equal(t, uint64(0), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, uint64(4000), metric.GetField("node_cpu_limit").(uint64))
+	assert.Equal(t, float64(0), metric.GetField("node_cpu_reserved_capacity").(float64))
+	assert.Equal(t, float64(100), metric.GetField("node_cpu_usage_total").(float64))
+
+	assert.Equal(t, uint64(0), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, uint64(400*1024*1024), metric.GetField("node_memory_limit").(uint64))
+	assert.Equal(t, float64(0), metric.GetField("node_memory_reserved_capacity").(float64))
+	assert.Equal(t, float64(100*1024*1024), metric.GetField("node_memory_working_set").(float64))
+
+	// non-terminated pods should contribute to requests
+	pendingPod := getBaseTestPodInfo()
+	pendingPod.Status.Phase = corev1.PodPending
+	podList = append(podList, *pendingPod)
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+	assert.Equal(t, uint64(10), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, float64(0.25), metric.GetField("node_cpu_reserved_capacity").(float64))
+
+	assert.Equal(t, uint64(50*1024*1024), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, float64(12.5), metric.GetField("node_memory_reserved_capacity").(float64))
+
+	runningPod := getBaseTestPodInfo()
+	runningPod.Status.Phase = corev1.PodRunning
+	podList = append(podList, *runningPod)
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+
+	assert.Equal(t, uint64(20), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, float64(0.5), metric.GetField("node_cpu_reserved_capacity").(float64))
+
+	assert.Equal(t, uint64(100*1024*1024), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, float64(25), metric.GetField("node_memory_reserved_capacity").(float64))
+}
+
 func TestPodStore_Decorate(t *testing.T) {
 	// not the metrics for decoration
 	tags := map[string]string{}