From 274af39a5e22781e1e5fb1c73814985d0b7e7c6b Mon Sep 17 00:00:00 2001 From: Daniel Nelson Date: Thu, 15 Nov 2018 15:43:47 -0800 Subject: [PATCH] Update kubernetes input docs (#4990) --- plugins/inputs/kubernetes/README.md | 360 +++++++++------------------- 1 file changed, 115 insertions(+), 245 deletions(-) diff --git a/plugins/inputs/kubernetes/README.md b/plugins/inputs/kubernetes/README.md index 099cf152650df..37d713d18f1b0 100644 --- a/plugins/inputs/kubernetes/README.md +++ b/plugins/inputs/kubernetes/README.md @@ -1,7 +1,5 @@ # Kubernetes Input Plugin -**This plugin is experimental and may cause high cardinality issues with moderate to large Kubernetes deployments** - This input plugin talks to the kubelet api using the `/stats/summary` endpoint to gather metrics about the running pods and containers for a single host. It is assumed that this plugin is running as part of a `daemonset` within a kubernetes installation. This means that telegraf is running on every node within the cluster. Therefore, you should configure this plugin to talk to its locally running kubelet. To find the ip address of the host you are running on you can issue a command like the following: @@ -10,256 +8,128 @@ $ curl -s $API_URL/api/v1/namespaces/$POD_NAMESPACE/pods/$HOSTNAME --header "Aut ``` In this case we used the downward API to pass in the `$POD_NAMESPACE` and `$HOSTNAME` is the hostname of the pod which is set by the kubernetes API. 
-## Summary Data +#### Series Cardinality Warning -```json -{ - "node": { - "nodeName": "node1", - "systemContainers": [ - { - "name": "kubelet", - "startTime": "2016-08-25T18:46:52Z", - "cpu": { - "time": "2016-09-27T16:57:31Z", - "usageNanoCores": 56652446, - "usageCoreNanoSeconds": 101437561712262 - }, - "memory": { - "time": "2016-09-27T16:57:31Z", - "usageBytes": 62529536, - "workingSetBytes": 62349312, - "rssBytes": 47509504, - "pageFaults": 4769397409, - "majorPageFaults": 13 - }, - "rootfs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800 - }, - "logs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800 - }, - "userDefinedMetrics": null - }, - { - "name": "bar", - "startTime": "2016-08-25T18:46:52Z", - "cpu": { - "time": "2016-09-27T16:57:31Z", - "usageNanoCores": 56652446, - "usageCoreNanoSeconds": 101437561712262 - }, - "memory": { - "time": "2016-09-27T16:57:31Z", - "usageBytes": 62529536, - "workingSetBytes": 62349312, - "rssBytes": 47509504, - "pageFaults": 4769397409, - "majorPageFaults": 13 - }, - "rootfs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800 - }, - "logs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800 - }, - "userDefinedMetrics": null - } - ], - "startTime": "2016-08-25T18:46:52Z", - "cpu": { - "time": "2016-09-27T16:57:41Z", - "usageNanoCores": 576996212, - "usageCoreNanoSeconds": 774129887054161 - }, - "memory": { - "time": "2016-09-27T16:57:41Z", - "availableBytes": 10726387712, - "usageBytes": 12313182208, - "workingSetBytes": 5081538560, - "rssBytes": 35586048, - "pageFaults": 351742, - "majorPageFaults": 1236 - }, - "network": { - "time": "2016-09-27T16:57:41Z", - "rxBytes": 213281337459, - "rxErrors": 0, - "txBytes": 292869995684, - "txErrors": 0 - }, - "fs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800, - "usedBytes": 16754286592 - }, - "runtime": { - "imageFs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800, - 
"usedBytes": 5809371475 - } - } - }, - "pods": [ - { - "podRef": { - "name": "foopod", - "namespace": "foons", - "uid": "6d305b06-8419-11e6-825c-42010af000ae" - }, - "startTime": "2016-09-26T18:45:42Z", - "containers": [ - { - "name": "foocontainer", - "startTime": "2016-09-26T18:46:43Z", - "cpu": { - "time": "2016-09-27T16:57:32Z", - "usageNanoCores": 846503, - "usageCoreNanoSeconds": 56507553554 - }, - "memory": { - "time": "2016-09-27T16:57:32Z", - "usageBytes": 30789632, - "workingSetBytes": 30789632, - "rssBytes": 30695424, - "pageFaults": 10761, - "majorPageFaults": 0 - }, - "rootfs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800, - "usedBytes": 57344 - }, - "logs": { - "availableBytes": 84379979776, - "capacityBytes": 105553100800, - "usedBytes": 24576 - }, - "userDefinedMetrics": null - } - ], - "network": { - "time": "2016-09-27T16:57:34Z", - "rxBytes": 70749124, - "rxErrors": 0, - "txBytes": 47813506, - "txErrors": 0 - }, - "volume": [ - { - "availableBytes": 7903948800, - "capacityBytes": 7903961088, - "usedBytes": 12288, - "name": "volume1" - }, - { - "availableBytes": 7903956992, - "capacityBytes": 7903961088, - "usedBytes": 4096, - "name": "volume2" - }, - { - "availableBytes": 7903948800, - "capacityBytes": 7903961088, - "usedBytes": 12288, - "name": "volume3" - }, - { - "availableBytes": 7903952896, - "capacityBytes": 7903961088, - "usedBytes": 8192, - "name": "volume4" - } - ] - } - ] - } - ``` +This plugin may produce a high number of series which, when not controlled +for, will cause high load on your database. Use the following techniques to +avoid cardinality issues: - ### Daemonset YAML +- Use [metric filtering][] options to exclude unneeded measurements and tags. +- Write to a database with an appropriate [retention policy][]. +- Limit series cardinality in your database using the + [max-series-per-database][] and [max-values-per-tag][] settings. +- Consider using the [Time Series Index][tsi]. 
+- Monitor your database's [series cardinality][]. +- Consult the [InfluxDB documentation][influx-docs] for the most up-to-date techniques. -```yaml -apiVersion: extensions/v1beta1 -kind: DaemonSet -metadata: - name: telegraf - namespace: telegraf -spec: - template: - metadata: - labels: - app: telegraf - spec: - serviceAccount: telegraf - containers: - - name: telegraf - image: quay.io/org/image:latest - imagePullPolicy: IfNotPresent - env: - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: "HOST_PROC" - value: "/rootfs/proc" - - name: "HOST_SYS" - value: "/rootfs/sys" - volumeMounts: - - name: sysro - mountPath: /rootfs/sys - readOnly: true - - name: procro - mountPath: /rootfs/proc - readOnly: true - - name: varrunutmpro - mountPath: /var/run/utmp - readOnly: true - - name: logger-redis-creds - mountPath: /var/run/secrets/deis/redis/creds - volumes: - - name: sysro - hostPath: - path: /sys - - name: procro - hostPath: - path: /proc - - name: varrunutmpro - hostPath: - path: /var/run/utmp -``` +### Configuration -### Line Protocol +```toml +[[inputs.kubernetes]] + ## URL for the kubelet + url = "http://127.0.0.1:10255" -#### kubernetes_pod_container -``` -kubernetes_pod_container,host=ip-10-0-0-0.ec2.internal, -container_name=deis-controller,namespace=deis, -node_name=ip-10-0-0-0.ec2.internal, pod_name=deis-controller-3058870187-xazsr, cpu_usage_core_nanoseconds=2432835i,cpu_usage_nanocores=0i, -logsfs_avaialble_bytes=121128271872i,logsfs_capacity_bytes=153567944704i, -logsfs_used_bytes=20787200i,memory_major_page_faults=0i, -memory_page_faults=175i,memory_rss_bytes=0i, -memory_usage_bytes=0i,memory_working_set_bytes=0i, -rootfs_available_bytes=121128271872i,rootfs_capacity_bytes=153567944704i, -rootfs_used_bytes=1110016i 1476477530000000000 - ``` + ## Use bearer token for authorization + # bearer_token = "/path/to/bearer/token" -#### kubernetes_pod_volume -``` 
-kubernetes_pod_volume,host=ip-10-0-0-0.ec2.internal,name=default-token-f7wts, -namespace=kube-system,node_name=ip-10-0-0-0.ec2.internal, -pod_name=kubernetes-dashboard-v1.1.1-t4x4t, available_bytes=8415240192i, -capacity_bytes=8415252480i,used_bytes=12288i 1476477530000000000 + ## Set response_timeout (default 5 seconds) + # response_timeout = "5s" + + ## Optional TLS Config + # tls_ca = "/path/to/cafile" + # tls_cert = "/path/to/certfile" + # tls_key = "/path/to/keyfile" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false ``` -#### kubernetes_pod_network +### DaemonSet + +For recommendations on running Telegraf as a DaemonSet see [Monitoring Kubernetes +Architecture][k8s-telegraf] or view the [Helm charts][tick-charts]. + +### Metrics + +- kubernetes_node + - tags: + - node_name + - fields: + - cpu_usage_nanocores + - cpu_usage_core_nanoseconds + - memory_available_bytes + - memory_usage_bytes + - memory_working_set_bytes + - memory_rss_bytes + - memory_page_faults + - memory_major_page_faults + - network_rx_bytes + - network_rx_errors + - network_tx_bytes + - network_tx_errors + - fs_available_bytes + - fs_capacity_bytes + - fs_used_bytes + - runtime_image_fs_available_bytes + - runtime_image_fs_capacity_bytes + - runtime_image_fs_used_bytes + +- kubernetes_pod_container + - tags: + - container_name + - namespace + - node_name + - pod_name + - fields: + - cpu_usage_nanocores + - cpu_usage_core_nanoseconds + - memory_usage_bytes + - memory_working_set_bytes + - memory_rss_bytes + - memory_page_faults + - memory_major_page_faults + - rootfs_available_bytes + - rootfs_capacity_bytes + - rootfs_used_bytes + - logsfs_avaialble_bytes + - logsfs_capacity_bytes + - logsfs_used_bytes + +- kubernetes_pod_volume + - tags: + - volume_name + - namespace + - node_name + - pod_name + - fields: + - available_bytes + - capacity_bytes + - used_bytes + +- kubernetes_pod_network + - tags: + - namespace + - node_name + - pod_name + - fields: + - rx_bytes + - 
rx_errors + - tx_bytes + - tx_errors + +### Example Output + ``` -kubernetes_pod_network,host=ip-10-0-0-0.ec2.internal,namespace=deis, -node_name=ip-10-0-0-0.ec2.internal,pod_name=deis-controller-3058870187-xazsr, -rx_bytes=120671099i,rx_errors=0i, -tx_bytes=102451983i,tx_errors=0i 1476477530000000000 +kubernetes_pod_container,host=ip-10-0-0-0.ec2.internal,container_name=deis-controller,namespace=deis,node_name=ip-10-0-0-0.ec2.internal,pod_name=deis-controller-3058870187-xazsr cpu_usage_core_nanoseconds=2432835i,cpu_usage_nanocores=0i,logsfs_avaialble_bytes=121128271872i,logsfs_capacity_bytes=153567944704i,logsfs_used_bytes=20787200i,memory_major_page_faults=0i,memory_page_faults=175i,memory_rss_bytes=0i,memory_usage_bytes=0i,memory_working_set_bytes=0i,rootfs_available_bytes=121128271872i,rootfs_capacity_bytes=153567944704i,rootfs_used_bytes=1110016i 1476477530000000000 +kubernetes_pod_volume,host=ip-10-0-0-0.ec2.internal,volume_name=default-token-f7wts,namespace=kube-system,node_name=ip-10-0-0-0.ec2.internal,pod_name=kubernetes-dashboard-v1.1.1-t4x4t available_bytes=8415240192i,capacity_bytes=8415252480i,used_bytes=12288i 1476477530000000000 +kubernetes_pod_network,host=ip-10-0-0-0.ec2.internal,namespace=deis,node_name=ip-10-0-0-0.ec2.internal,pod_name=deis-controller-3058870187-xazsr rx_bytes=120671099i,rx_errors=0i,tx_bytes=102451983i,tx_errors=0i 1476477530000000000 ``` + +[metric filtering]: https://github.com/influxdata/telegraf/blob/master/docs/CONFIGURATION.md#metric-filtering +[retention policy]: https://docs.influxdata.com/influxdb/latest/guides/downsampling_and_retention/ +[max-series-per-database]: https://docs.influxdata.com/influxdb/latest/administration/config/#max-series-per-database-1000000 +[max-values-per-tag]: https://docs.influxdata.com/influxdb/latest/administration/config/#max-values-per-tag-100000 +[tsi]: https://docs.influxdata.com/influxdb/latest/concepts/time-series-index/ +[series cardinality]: 
https://docs.influxdata.com/influxdb/latest/query_language/spec/#show-cardinality +[influx-docs]: https://docs.influxdata.com/influxdb/latest/ +[k8s-telegraf]: https://www.influxdata.com/blog/monitoring-kubernetes-architecture/ +[tick-charts]: https://github.com/influxdata/tick-charts