open-telemetry · TylerHelmuth · Dec 8, 2023 · Oct 2, 2023 · Oct 2, 2023 · Oct 4, 2023
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: bug_fix
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: awscontainerinsightreceiver
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Exclude terminated (Succeeded or Failed) pods from node CPU and memory request metrics
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [27262]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
@@ -265,10 +265,12 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
 			p.logger.Warn(fmt.Sprintf("podKey is unavailable, refresh pod store for pod %s", pod.Name))
 			continue
 		}
-		tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
-		cpuRequest += tmpCPUReq
-		tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
-		memRequest += tmpMemReq
+		if pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed {
+			tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
+			cpuRequest += tmpCPUReq
+			tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
+			memRequest += tmpMemReq
+		}
 		if pod.Status.Phase == corev1.PodRunning {
 			podCount++
 		}

@@ -624,6 +624,63 @@ func TestPodStore_decorateNode(t *testing.T) {
 	assert.Equal(t, int(1), metric.GetField("node_number_of_running_pods").(int))
 }
 
+// TestPodStore_decorateNode_multiplePodStates verifies that node-level request
+// metrics only account for pods that have not terminated: pods in the Failed or
+// Succeeded phase must contribute nothing to node_cpu_request /
+// node_memory_request (and the derived reserved-capacity fields), while Pending
+// and Running pods must be counted.
+func TestPodStore_decorateNode_multiplePodStates(t *testing.T) {
+	podStore := getPodStore()
+	// Wrap the shutdown in a closure: arguments of a deferred call are evaluated
+	// when the defer statement executes, so `defer require.NoError(t, podStore.Shutdown())`
+	// would shut the store down immediately and only defer the assertion.
+	defer func() { require.NoError(t, podStore.Shutdown()) }()
+
+	// Node metric with a CPU limit of 4000 and a memory limit of 400 MiB; the
+	// reserved-capacity assertions below are relative to these limits.
+	tags := map[string]string{ci.MetricType: ci.TypeNode}
+	fields := map[string]any{
+		ci.MetricName(ci.TypeNode, ci.CPUTotal):      float64(100),
+		ci.MetricName(ci.TypeNode, ci.CPULimit):      uint64(4000),
+		ci.MetricName(ci.TypeNode, ci.MemWorkingset): float64(100 * 1024 * 1024),
+		ci.MetricName(ci.TypeNode, ci.MemLimit):      uint64(400 * 1024 * 1024),
+	}
+	metric := generateMetric(fields, tags)
+
+	// terminated pods should not contribute to requests
+	failedPod := getBaseTestPodInfo()
+	failedPod.Status.Phase = corev1.PodFailed
+	succeededPod := getBaseTestPodInfo()
+	succeededPod.Status.Phase = corev1.PodSucceeded
+	podList := []corev1.Pod{*failedPod, *succeededPod}
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+
+	// Requests and reserved capacity stay at zero; the pre-populated limit and
+	// usage fields must be left untouched.
+	assert.Equal(t, uint64(0), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, uint64(4000), metric.GetField("node_cpu_limit").(uint64))
+	assert.Equal(t, float64(0), metric.GetField("node_cpu_reserved_capacity").(float64))
+	assert.Equal(t, float64(100), metric.GetField("node_cpu_usage_total").(float64))
+
+	assert.Equal(t, uint64(0), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, uint64(400*1024*1024), metric.GetField("node_memory_limit").(uint64))
+	assert.Equal(t, float64(0), metric.GetField("node_memory_reserved_capacity").(float64))
+	assert.Equal(t, float64(100*1024*1024), metric.GetField("node_memory_working_set").(float64))
+
+	// non-terminated pods should contribute to requests
+	// A single pending pod is expected to add 10 to the CPU request and 50 MiB
+	// to the memory request (presumably the requests of the base test pod).
+	pendingPod := getBaseTestPodInfo()
+	pendingPod.Status.Phase = corev1.PodPending
+	podList = append(podList, *pendingPod)
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+	assert.Equal(t, uint64(10), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, float64(0.25), metric.GetField("node_cpu_reserved_capacity").(float64))
+
+	assert.Equal(t, uint64(50*1024*1024), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, float64(12.5), metric.GetField("node_memory_reserved_capacity").(float64))
+
+	// Adding a running pod on top of the pending one doubles the expected
+	// requests; the two terminated pods are still ignored.
+	runningPod := getBaseTestPodInfo()
+	runningPod.Status.Phase = corev1.PodRunning
+	podList = append(podList, *runningPod)
+	podStore.refreshInternal(time.Now(), podList)
+	podStore.decorateNode(metric)
+
+	assert.Equal(t, uint64(20), metric.GetField("node_cpu_request").(uint64))
+	assert.Equal(t, float64(0.5), metric.GetField("node_cpu_reserved_capacity").(float64))
+
+	assert.Equal(t, uint64(100*1024*1024), metric.GetField("node_memory_request").(uint64))
+	assert.Equal(t, float64(25), metric.GetField("node_memory_reserved_capacity").(float64))
+}
+
 func TestPodStore_Decorate(t *testing.T) {
 	// not the metrics for decoration
 	tags := map[string]string{}