Skip to content

Commit

Permalink
[receiver/awscontainerinsight] Filter terminated pods from node reque…
Browse files Browse the repository at this point in the history
…st metrics. (#27299)

**Description:** The `node_<cpu|memory>_request` metrics and metrics
derived from them (`node_<cpu|memory>_reserved_capacity`) differ from
the output of `kubectl describe node <node_name>`. This is because
kubectl [filters out terminated
pods](https://github.com/kubernetes/kubectl/blob/302f330c8712e717ee45bbeff27e1d3008da9f00/pkg/describe/describe.go#L3624).
See linked issue for more details.

Adds a filter for terminated (succeeded/failed state) pods. 

**Link to tracking Issue:**
#27262

**Testing:** Added unit test to validate pod state filtering. Built and
deployed changes to cluster. Deployed `cpu-test` pod.


![image](https://github.com/amazon-contributing/opentelemetry-collector-contrib/assets/84729962/b557be2d-e14e-428a-895a-761f7724d9bd)


The gap is when the change was deployed. The metric drops after the
deployment due to the filter. The metric can be seen spiking up while
the `cpu-test` pod is running (~19:15) and then returns to the previous
request size after it has terminated.

**Documentation:** N/A
  • Loading branch information
jefchien authored Dec 8, 2023
1 parent 9c12d1e commit 5449419
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 4 deletions.
27 changes: 27 additions & 0 deletions .chloggen/filter-terminated-pods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: awscontainerinsightreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Filter terminated pods from node request metrics

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [27262]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
10 changes: 6 additions & 4 deletions receiver/awscontainerinsightreceiver/internal/stores/podstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,12 @@ func (p *PodStore) refreshInternal(now time.Time, podList []corev1.Pod) {
p.logger.Warn(fmt.Sprintf("podKey is unavailable, refresh pod store for pod %s", pod.Name))
continue
}
tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
cpuRequest += tmpCPUReq
tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
memRequest += tmpMemReq
if pod.Status.Phase != corev1.PodSucceeded && pod.Status.Phase != corev1.PodFailed {
tmpCPUReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getCPUCapacity(), cpuKey, getRequestForContainer)
cpuRequest += tmpCPUReq
tmpMemReq, _ := getResourceSettingForPod(&pod, p.nodeInfo.getMemCapacity(), memoryKey, getRequestForContainer)
memRequest += tmpMemReq
}
if pod.Status.Phase == corev1.PodRunning {
podCount++
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,63 @@ func TestPodStore_decorateNode(t *testing.T) {
assert.Equal(t, int(1), metric.GetField("node_number_of_running_pods").(int))
}

func TestPodStore_decorateNode_multiplePodStates(t *testing.T) {
podStore := getPodStore()
defer require.NoError(t, podStore.Shutdown())

tags := map[string]string{ci.MetricType: ci.TypeNode}
fields := map[string]any{
ci.MetricName(ci.TypeNode, ci.CPUTotal): float64(100),
ci.MetricName(ci.TypeNode, ci.CPULimit): uint64(4000),
ci.MetricName(ci.TypeNode, ci.MemWorkingset): float64(100 * 1024 * 1024),
ci.MetricName(ci.TypeNode, ci.MemLimit): uint64(400 * 1024 * 1024),
}
metric := generateMetric(fields, tags)

// terminated pods should not contribute to requests
failedPod := getBaseTestPodInfo()
failedPod.Status.Phase = corev1.PodFailed
succeededPod := getBaseTestPodInfo()
succeededPod.Status.Phase = corev1.PodSucceeded
podList := []corev1.Pod{*failedPod, *succeededPod}
podStore.refreshInternal(time.Now(), podList)
podStore.decorateNode(metric)

assert.Equal(t, uint64(0), metric.GetField("node_cpu_request").(uint64))
assert.Equal(t, uint64(4000), metric.GetField("node_cpu_limit").(uint64))
assert.Equal(t, float64(0), metric.GetField("node_cpu_reserved_capacity").(float64))
assert.Equal(t, float64(100), metric.GetField("node_cpu_usage_total").(float64))

assert.Equal(t, uint64(0), metric.GetField("node_memory_request").(uint64))
assert.Equal(t, uint64(400*1024*1024), metric.GetField("node_memory_limit").(uint64))
assert.Equal(t, float64(0), metric.GetField("node_memory_reserved_capacity").(float64))
assert.Equal(t, float64(100*1024*1024), metric.GetField("node_memory_working_set").(float64))

// non-terminated pods should contribute to requests
pendingPod := getBaseTestPodInfo()
pendingPod.Status.Phase = corev1.PodPending
podList = append(podList, *pendingPod)
podStore.refreshInternal(time.Now(), podList)
podStore.decorateNode(metric)
assert.Equal(t, uint64(10), metric.GetField("node_cpu_request").(uint64))
assert.Equal(t, float64(0.25), metric.GetField("node_cpu_reserved_capacity").(float64))

assert.Equal(t, uint64(50*1024*1024), metric.GetField("node_memory_request").(uint64))
assert.Equal(t, float64(12.5), metric.GetField("node_memory_reserved_capacity").(float64))

runningPod := getBaseTestPodInfo()
runningPod.Status.Phase = corev1.PodRunning
podList = append(podList, *runningPod)
podStore.refreshInternal(time.Now(), podList)
podStore.decorateNode(metric)

assert.Equal(t, uint64(20), metric.GetField("node_cpu_request").(uint64))
assert.Equal(t, float64(0.5), metric.GetField("node_cpu_reserved_capacity").(float64))

assert.Equal(t, uint64(100*1024*1024), metric.GetField("node_memory_request").(uint64))
assert.Equal(t, float64(25), metric.GetField("node_memory_reserved_capacity").(float64))
}

func TestPodStore_Decorate(t *testing.T) {
// not the metrics for decoration
tags := map[string]string{}
Expand Down

0 comments on commit 5449419

Please sign in to comment.