diff --git a/host/all-journald-logs.yaml b/host/all-journald-logs.yaml index 108aabe..87fd668 100644 --- a/host/all-journald-logs.yaml +++ b/host/all-journald-logs.yaml @@ -5,23 +5,6 @@ metadata: name: all-journald-logs spec: hostCollectors: - # Systemd Service Configurations for CRI, Kubelet - - run: - collectorName: "systemctl-cat-journald" - command: "systemctl" - args: ["cat", "systemd-journald"] - - run: - collectorName: "systemctl-cat-docker" - command: "systemctl" - args: ["cat", "docker"] - - run: - collectorName: "systemctl-cat-containerd" - command: "systemctl" - args: ["cat", "containerd"] - - run: - collectorName: "systemctl-cat-kubelet" - command: "systemctl" - args: ["cat", "kubelet"] # Logs for CRI, Kubelet, Kernel - run: collectorName: "journalctl-containerd-all" diff --git a/host/cri.yaml b/host/cri.yaml deleted file mode 100644 index e4fda11..0000000 --- a/host/cri.yaml +++ /dev/null @@ -1,70 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: networking-issues-non-airgap -spec: - hostCollectors: - - diskUsage: - collectorName: root - path: / - - diskUsage: - collectorName: tmp - path: /tmp - - diskUsage: - collectorName: var-lib-kubelet - path: /var/lib/kubelet - - diskUsage: - collectorName: var-lib-docker - path: /var/lib/docker - - diskUsage: - collectorName: var-lib-containerd - path: /var/lib/containerd - - run: - collectorName: "docker-info" - command: "docker" - args: ["info"] - - run: - collectorName: "crictl-info" - command: "crictl" - args: ["info"] - - run: - collectorName: "crictl-ps" - command: "crictl" - args: ["ps", "-a"] - - run: - collectorName: "docker-ps" - command: "docker" - args: ["ps", "-a"] - - run: - collectorName: "docker-system-df" - command: "docker" - args: ["system", "df", "-v"] - - run: - collectorName: "systemctl-docker-status" - command: "systemctl" - args: ["status", "docker"] - - run: - collectorName: "systemctl-kubelet-status" - command: "systemctl" - args: ["status", "kubelet"] - - run: - collectorName: "systemctl-containerd-status" - command: "systemctl" - args: ["status", "containerd"] - # Logs for CRI, Kubelet, Kernel - - run: - collectorName: "journalctl-containerd" - command: "journalctl" - args: ["-u", "containerd", "--no-pager", "-S", "7 days ago"] - - run: - collectorName: "journalctl-kubelet" - command: "journalctl" - args: ["-u", "kubelet", "--no-pager", "-S", "7 days ago"] - - run: - collectorName: "journalctl-docker" - command: "journalctl" - args: ["-u", "docker", "--no-pager", "-S", "7 days ago"] - - run: - collectorName: "journalctl-dmesg" - command: "journalctl" - args: ["--dmesg", "--no-pager", "-S", "7 days ago"] diff --git a/host/cluster-down.yaml b/host/default.yaml similarity index 77% rename from host/cluster-down.yaml rename to host/default.yaml index 2b5c10b..d0a8a8f 100644 --- a/host/cluster-down.yaml +++ b/host/default.yaml @@ -2,9 +2,9 @@ apiVersion: troubleshoot.sh/v1beta2 kind: SupportBundle metadata: - name: cluster-down + name: default spec: - uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/host/cluster-down.yaml + uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/host/default.yaml hostCollectors: # System Info Collectors - blockDevices: {} @@ -14,6 +14,7 @@ spec: - ipv4Interfaces: {} - memory: {} - time: {} + - ipv4Interfaces: {} # Certificate Info for ETCD and K8s API - certificate: collectorName: k8s-api-keypair @@ -85,7 +86,7 @@ spec: - run: collectorName: "iostat" command: "iostat" - args: [] + args: ["-x"] - run: 
collectorName: "sestatus" command: "sestatus" @@ -138,6 +139,10 @@ spec: collectorName: "netstat-ports" command: "netstat" args: ["-t", "-u", "-l", "-p", "-n"] + - run: + collectorName: "netstat-route-table" + command: "netstat" + args: ["-r", "-n"] - run: collectorName: "sysctl" command: "sysctl" @@ -282,6 +287,85 @@ spec: - copy: collectorName: "kurl-logs" path: /var/log/kurl/* + - run: + collectorName: "kubeadm.conf" + command: "cat" + args: ["/opt/replicated/kubeadm.conf"] + - run: + collectorName: "kubeadm-init-raw.yaml" + command: "cat" + args: ["/opt/replicated/kubeadm-init-raw.yaml"] + - run: + collectorName: "kubeadm-flags.env" + command: "cat" + args: ["/var/lib/kubelet/kubeadm-flags.env"] + - run: + collectorName: "kurl-host-preflights" + command: "tail" + args: ["-n", "+1", "/var/lib/kurl/host-preflights/*"] + - run: + collectorName: "kubeadm-kustomize-patches" + command: "sh" + args: ["-c", "find /var/lib/kurl/kustomize -type f -exec tail -n +1 {} +;"] + - run: + collectorName: "tmp-kubeadm.conf" + command: "cat" + args: ["/var/lib/kubelet/tmp-kubeadm.conf"] + - http: + collectorName: curl-api-replicated-com + get: + url: https://api.replicated.com/healthz + - http: + collectorName: curl-get-replicated-com + get: + url: https://get.replicated.com/healthz + - http: + collectorName: curl-registry-replicated-com + get: + url: https://registry.replicated.com/healthz + - http: + collectorName: curl-proxy-replicated-com + get: + url: https://proxy.replicated.com/healthz + - http: + collectorName: curl-k8s-kurl-sh + get: + url: https://k8s.kurl.sh/healthz + - http: + collectorName: curl-replicated-app + get: + url: https://replicated.app/healthz + # System Info Collectors + - run: + collectorName: "du-root" + command: "sh" + args: ["-c", "du -Shax / --exclude /proc | sort -rh | head -20"] + - run: + collectorName: "mount" + command: "mount" + args: ["-l"] + - run: + collectorName: "vmstat" + command: "vmstat" + args: ["-w"] + - run: + collectorName: "ps-high-load" + command: "sh" + args: ["-c", "ps -eo s,user,cmd | grep ^[RD] | sort | uniq -c | sort -nbr | head -20"] + - filesystemPerformance: + collectorName: filesystem-latency-two-minute-benchmark + timeout: 2m + directory: /var/lib/etcd + fileSize: 22Mi + operationSizeBytes: 2300 + datasync: true + enableBackgroundIOPS: true + backgroundIOPSWarmupSeconds: 10 + backgroundWriteIOPS: 300 + backgroundWriteIOPSJobs: 6 + backgroundReadIOPS: 50 + backgroundReadIOPSJobs: 1 + exclude: true hostAnalyzers: - certificate: collectorName: k8s-api-keypair @@ -473,6 +557,87 @@ spec: message: curl -k https://localhost:6443/healthz returned HTTP CODE response 200. - warn: message: "Unexpected response. HTTP CODE response is not 200. Please, run `curl -ki https://localhost:6443/healthz` to check further information." 
+ - http: + checkName: curl-api-replicated-com + collectorName: curl-api-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://api.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://api.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-get-replicated-com + collectorName: curl-get-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://get.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://get.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-registry-replicated-com + collectorName: curl-registry-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://registry.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://registry.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-proxy-replicated-com + collectorName: curl-proxy-replicated-com + outcomes: + - warn: + when: "error" + message: Error connecting to https://proxy.replicated.com/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://proxy.replicated.com/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-k8s-kurl-sh + collectorName: curl-k8s-kurl-sh + outcomes: + - warn: + when: "error" + message: Error connecting to https://k8s.kurl.sh/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://k8s.kurl.sh/healthz + - warn: + message: "Unexpected response" + - http: + checkName: curl-replicated-app + collectorName: curl-replicated-app + outcomes: + - warn: + when: "error" + message: Error connecting to https://replicated.app/healthz + - pass: + when: "statusCode == 200" + message: Connected to https://replicated.app/healthz + - warn: + message: "Unexpected response" + - filesystemPerformance: + collectorName: filesystem-latency-two-minute-benchmark + outcomes: + - pass: + when: "p99 < 10ms" + message: "Write latency is ok (p99 target < 10ms)" + - warn: + message: "Write latency is high. 
p99 target >= 10ms)" + exclude: true analyzers: - textAnalyze: checkName: Hostname Mismatch diff --git a/host/kubeadm-bootstrap.yaml b/host/kubeadm-bootstrap.yaml deleted file mode 100644 index 0a08dfa..0000000 --- a/host/kubeadm-bootstrap.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: kubeadm-bootstrap -spec: - hostCollectors: - - run: - collectorName: "kubeadm.conf" - command: "cat" - args: ["/opt/replicated/kubeadm.conf"] - - run: - collectorName: "kubeadm-init-raw.yaml" - command: "cat" - args: ["/opt/replicated/kubeadm-init-raw.yaml"] - - run: - collectorName: "kubeadm-flags.env" - command: "cat" - args: ["/var/lib/kubelet/kubeadm-flags.env"] - - run: - collectorName: "kurl-host-preflights" - command: "tail" - args: ["-n", "+1", "/var/lib/kurl/host-preflights/*"] - - run: - collectorName: "kubeadm-kustomize-patches" - command: "sh" - args: ["-c", "find /var/lib/kurl/kustomize -type f -exec tail -n +1 {} +;"] - - run: - collectorName: "tmp-kubeadm.conf" - command: "cat" - args: ["/var/lib/kubelet/tmp-kubeadm.conf"] diff --git a/host/networking-issues.yaml b/host/networking-issues.yaml deleted file mode 100644 index 6b7559f..0000000 --- a/host/networking-issues.yaml +++ /dev/null @@ -1,167 +0,0 @@ -# Spec to identify issues with networking on the host in a non-airgapped deployment -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: networking-issues-non-airgap -spec: - hostCollectors: - - ipv4Interfaces: {} - - certificate: - collectorName: k8s-api-keypair - certificatePath: /etc/kubernetes/pki/apiserver.crt - keyPath: /etc/kubernetes/pki/apiserver.key - - certificate: - collectorName: etcd-keypair - certificatePath: /etc/kubernetes/pki/etcd/server.crt - keyPath: /etc/kubernetes/pki/etcd/server.key - - http: - collectorName: curl-api-replicated-com - get: - url: https://api.replicated.com/healthz - - http: - collectorName: curl-get-replicated-com - get: - url: https://get.replicated.com/healthz - - http: - collectorName: curl-registry-replicated-com - get: - url: https://registry.replicated.com/healthz - - http: - collectorName: curl-proxy-replicated-com - get: - url: https://proxy.replicated.com/healthz - - http: - collectorName: curl-k8s-kurl-sh - get: - url: https://k8s.kurl.sh/healthz - - http: - collectorName: curl-replicated-app - get: - url: https://replicated.app/healthz - - run: - collectorName: "sysctl" - command: "sysctl" - args: ["-a"] - - run: - collectorName: "iptables" - command: "iptables" - args: ["-L", "-v"] - - run: - collectorName: "netstat-route-table" - command: "netstat" - args: ["-r", "-n"] - - run: - collectorName: "netstat-ports" - command: "netstat" - args: ["-t", "-u", "-l", "-p", "-n"] - - run: - collectorName: "systemctl-firewalld-status" - command: "systemctl" - args: ["status", "firewalld"] - - run: - collectorName: "journalctl-dmesg" - command: "journalctl" - args: ["--dmesg", "--no-pager", "-S", "7 days ago"] - hostAnalyzers: - - certificate: - collectorName: k8s-api-keypair - outcomes: - - fail: - when: "key-pair-missing" - message: Certificate key pair not found in /etc/kubernetes/pki/apiserver.* - - fail: - when: "key-pair-switched" - message: Cert and key pair are switched - - fail: - when: "key-pair-encrypted" - message: Private key is encrypted - - fail: - when: "key-pair-mismatch" - message: Cert and key do not match - - fail: - when: "key-pair-invalid" - message: Certificate key pair is invalid - - pass: - when: "key-pair-valid" - message: Certificate key 
pair is valid - - certificate: - collectorName: etcd-keypair - outcomes: - - fail: - when: "key-pair-missing" - message: Certificate key pair not found in /etc/kubernetes/pki/etcd/server.* - - fail: - when: "key-pair-switched" - message: Cert and key pair are switched - - fail: - when: "key-pair-encrypted" - message: Private key is encrypted - - fail: - when: "key-pair-mismatch" - message: Cert and key do not match - - fail: - when: "key-pair-invalid" - message: Certificate key pair is invalid - - pass: - when: "key-pair-valid" - message: Certificate key pair is valid - - http: - checkName: curl-api-replicated-com - collectorName: curl-api-replicated-com - outcomes: - - warn: - when: "error" - message: Error connecting to https://api.replicated.com/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://api.replicated.com/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-registry-replicated-com - collectorName: curl-registry-replicated-com - outcomes: - - warn: - when: "error" - message: Error connecting to https://registry.replicated.com/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://registry.replicated.com/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-proxy-replicated-com - collectorName: curl-proxy-replicated-com - outcomes: - - warn: - when: "error" - message: Error connecting to https://proxy.replicated.com/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://proxy.replicated.com/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-k8s-kurl-sh - collectorName: curl-k8s-kurl-sh - outcomes: - - warn: - when: "error" - message: Error connecting to https://k8s.kurl.sh/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://k8s.kurl.sh/healthz - - warn: - message: "Unexpected response" - - http: - checkName: curl-replicated-app - collectorName: curl-replicated-app - outcomes: - - warn: - when: "error" - message: Error connecting to https://replicated.app/healthz - - pass: - when: "statusCode == 200" - message: Connected to https://replicated.app/healthz - - warn: - message: "Unexpected response" diff --git a/host/resource-contention.yaml b/host/resource-contention.yaml deleted file mode 100644 index 10d9bf4..0000000 --- a/host/resource-contention.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Spec to gather additional information about cpu, memory, and disk on the system to identify potential resource contention and performance issues -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: resource-contention -spec: - hostCollectors: - # System Info Collectors - - blockDevices: {} - - cpu: {} - - hostOS: {} - - hostServices: {} - - ipv4Interfaces: {} - - memory: {} - - time: {} - - run: - collectorName: "uptime" - command: "uptime" - args: [] - - run: - collectorName: "free" - command: "free" - args: ["-m"] - - run: - collectorName: "top" - command: "top" - args: ["-b", "-n", "1"] - - run: - collectorName: "uname" - command: "uname" - args: ["-a"] - - run: - collectorName: "df" - command: "df" - args: ["-h"] - - run: - collectorName: "du-root" - command: "sh" - args: ["-c", "du -Shax / --exclude /proc | sort -rh | head -20"] - - run: - collectorName: "mount" - command: "mount" - args: ["-l"] - - run: - collectorName: "iostat" - command: "iostat" - args: ["-x"] - - run: - collectorName: "vmstat" - command: "vmstat" - args: ["-w"] - - run: - collectorName: "iostat" - command: "iostat" - args: 
["-x"] - - run: - collectorName: "ps-high-load" - command: "sh" - args: ["-c", "ps -eo s,user,cmd | grep ^[RD] | sort | uniq -c | sort -nbr | head -20"] - - run: - collectorName: "journalctl-dmesg" - command: "journalctl" - args: ["--dmesg", "--no-pager", "-S", "7 days ago"] - - filesystemPerformance: - collectorName: filesystem-latency-two-minute-benchmark - timeout: 2m - directory: /var/lib/etcd - fileSize: 22Mi - operationSizeBytes: 2300 - datasync: true - enableBackgroundIOPS: true - backgroundIOPSWarmupSeconds: 10 - backgroundWriteIOPS: 300 - backgroundWriteIOPSJobs: 6 - backgroundReadIOPS: 50 - backgroundReadIOPSJobs: 1 - exclude: true - hostAnalyzers: - - cpu: - checkName: "Number of CPUs" - outcomes: - - warn: - when: "count < 4" - message: At least 4 CPU cores are recommended for kURL https://kurl.sh/docs/install-with-kurl/system-requirements - - pass: - message: This server has at least 4 CPU cores - - memory: - checkName: "Amount of Memory" - outcomes: - - warn: - when: "< 8G" - message: At least 8G of memory is recommended for kURL https://kurl.sh/docs/install-with-kurl/system-requirements - - pass: - message: The system has at least 8G of memory - - filesystemPerformance: - collectorName: filesystem-latency-two-minute-benchmark - outcomes: - - pass: - when: "p99 < 10ms" - message: "Write latency is ok (p99 target < 10ms)" - - warn: - message: "Write latency is high. p99 target >= 10ms)" - exclude: true diff --git a/in-cluster/cant-generate-bundle-with-kots.yaml b/in-cluster/cant-generate-bundle-with-kots.yaml deleted file mode 100644 index 60eda50..0000000 --- a/in-cluster/cant-generate-bundle-with-kots.yaml +++ /dev/null @@ -1,230 +0,0 @@ -# This is a more minimal spec compared to the default used to pull relevant info in determining bundle generation could be hanging when done via KOTS -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: default -spec: - collectors: - - clusterInfo: {} - - clusterResources: {} - - exec: # this is removable when we don't need to support kots <= 1.87 - args: - - "-U" - - kotsadm - collectorName: kotsadm-postgres-db - command: - - pg_dump - containerName: kotsadm-postgres - name: kots/admin_console - selector: - - app=kotsadm-postgres - timeout: 10s - - exec: - collectorName: kotsadm-rqlite-db - name: kots/admin_console - selector: - - app=kotsadm-rqlite - command: - - sh - - -c - - | - wget -qO- kotsadm:${RQLITE_PASSWORD}@localhost:4001/db/backup?fmt=sql - timeout: 10s - - exec: - args: - - "http://localhost:3030/goroutines" - collectorName: kotsadm-goroutines - command: - - curl - containerName: kotsadm - name: kots/admin_console - selector: - - app=kotsadm - timeout: 10s - - exec: - args: - - "http://localhost:3030/goroutines" - collectorName: kotsadm-operator-goroutines - command: - - curl - containerName: kotsadm-operator - name: kots/admin_console - selector: - - app=kotsadm-operator - timeout: 10s - - logs: - collectorName: kotsadm-postgres-db - name: kots/admin_console - selector: - - app=kotsadm-postgres - - logs: - collectorName: kotsadm-api - name: kots/admin_console - selector: - - app=kotsadm-api - - logs: - collectorName: kotsadm-operator - name: kots/admin_console - selector: - - app=kotsadm-operator - - logs: - collectorName: kotsadm - name: kots/admin_console - selector: - - app=kotsadm - - logs: - collectorName: kurl-proxy-kotsadm - name: kots/admin_console - selector: - - app=kurl-proxy-kotsadm - - logs: - collectorName: kotsadm-dex - name: kots/admin_console - selector: - - app=kotsadm-dex - - logs: - 
collectorName: kotsadm-fs-minio - name: kots/admin_console - selector: - - app=kotsadm-fs-minio - - logs: - collectorName: kotsadm-s3-ops - name: kots/admin_console - selector: - - app=kotsadm-s3-ops - - logs: - collectorName: registry - name: kots/kurl - selector: - - app=registry - namespace: kurl - - logs: - collectorName: ekc-operator - name: kots/kurl - selector: - - app=ekc-operator - namespace: kurl - - secret: - collectorName: kotsadm-replicated-registry - name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors - includeValue: false - key: .dockerconfigjson - - exec: - collectorName: weave-status - command: - - /home/weave/weave - args: - - --local - - status - containerName: weave - exclude: "" - name: kots/kurl/weave - namespace: kube-system - selector: - - name=weave-net - timeout: 10s - - exec: - collectorName: weave-report - command: - - /home/weave/weave - args: - - --local - - report - containerName: weave - exclude: "" - name: kots/kurl/weave - namespace: kube-system - selector: - - name=weave-net - timeout: 10s - - logs: - collectorName: weave-net - selector: - - name=weave-net - namespace: kube-system - name: kots/kurl/weave - - configMap: - collectorName: kurl-current-config - name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - - configMap: - collectorName: kurl-last-config - name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - analyzers: - - containerRuntime: - outcomes: - - fail: - when: "== gvisor" - message: The Admin Console does not support using the gvisor runtime - - pass: - message: A supported container runtime is present on all nodes - - clusterPodStatuses: - outcomes: - - fail: - when: "!= Healthy" - message: "Status: {{ .Status.Reason }}" - - statefulsetStatus: {} - - deploymentStatus: {} - - jobStatus: {} - - replicasetStatus: {} - - weaveReport: - reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt - - textAnalyze: - checkName: Weave Status - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt - outcomes: - - fail: - message: Weave is not ready - - pass: - message: Weave is ready - regex: 'Status: ready' - - textAnalyze: - checkName: Weave Report - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt - outcomes: - - fail: - message: Weave is not ready - - pass: - message: Weave is ready - regex: '"Ready": true' - - textAnalyze: - checkName: Weave IP Allocation - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt - outcomes: - - fail: - message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this. - - pass: - message: Weave is ready, there are no IP allocation issues. 
- regex: '"IP Allocation was seeded by different peers": false' - - textAnalyze: - checkName: Inter-pod Networking - exclude: "" - ignoreIfNoFiles: true - fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt - outcomes: - - fail: - when: "OK = false" - message: Some nodes have pod communication issues - - pass: - message: Goldpinger can communicate properly - regexGroups: '"OK": ?(?P\w+)' - - nodeResources: - checkName: Node status check - outcomes: - - fail: - when: "nodeCondition(Ready) == False" - message: "Not all nodes are online." - - fail: - when: "nodeCondition(Ready) == Unknown" - message: "Not all nodes are online." - - pass: - message: "All nodes are online." diff --git a/in-cluster/default-kurl.yaml b/in-cluster/default-kurl.yaml deleted file mode 100644 index fb76e35..0000000 --- a/in-cluster/default-kurl.yaml +++ /dev/null @@ -1,398 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: default -spec: - collectors: - - copyFromHost: - collectorName: "copy apiserver audit logs" - image: alpine - hostPath: "/var/log/apiserver/" - name: "logs" - extractArchive: true - - copyFromHost: - collectorName: "copy kURL logs" - image: alpine - hostPath: "/var/log/kurl/" - name: "logs" - extractArchive: true - - clusterInfo: {} - - clusterResources: {} - - ceph: {} - - longhorn: {} - - exec: # this is removable when we don't need to support kots <= 1.87 - args: - - "-U" - - kotsadm - collectorName: kotsadm-postgres-db - command: - - pg_dump - containerName: kotsadm-postgres - name: kots/admin_console - selector: - - app=kotsadm-postgres - timeout: 10s - - exec: - collectorName: kotsadm-rqlite-db - name: kots/admin_console - selector: - - app=kotsadm-rqlite - command: - - sh - - -c - - | - wget -qO- kotsadm:${RQLITE_PASSWORD}@localhost:4001/db/backup?fmt=sql - timeout: 10s - - exec: - args: - - "http://localhost:3030/goroutines" - collectorName: kotsadm-goroutines - command: - - curl - containerName: kotsadm - name: kots/admin_console - selector: - - app=kotsadm - timeout: 10s - - exec: - args: - - "http://localhost:3030/goroutines" - collectorName: kotsadm-operator-goroutines - command: - - curl - containerName: kotsadm-operator - name: kots/admin_console - selector: - - app=kotsadm-operator - timeout: 10s - - logs: - collectorName: kurl-control-plane - name: kots/kurl/control-plane - selector: - - tier=control-plane - - logs: - collectorName: kotsadm-postgres-db - name: kots/admin_console - selector: - - app=kotsadm-postgres - - logs: - collectorName: kotsadm-api - name: kots/admin_console - selector: - - app=kotsadm-api - - logs: - collectorName: kotsadm-operator - name: kots/admin_console - selector: - - app=kotsadm-operator - - logs: - collectorName: kotsadm - name: kots/admin_console - selector: - - app=kotsadm - - logs: - collectorName: kurl-proxy-kotsadm - name: kots/admin_console - selector: - - app=kurl-proxy-kotsadm - - logs: - collectorName: kotsadm-dex - name: kots/admin_console - selector: - - app=kotsadm-dex - - logs: - collectorName: kotsadm-fs-minio - name: kots/admin_console - selector: - - app=kotsadm-fs-minio - - logs: - collectorName: kotsadm-s3-ops - name: kots/admin_console - selector: - - app=kotsadm-s3-ops - - logs: - collectorName: registry - name: kots/kurl - selector: - - app=registry - namespace: kurl - - logs: - collectorName: ekc-operator - name: kots/kurl - selector: - - app=ekc-operator - namespace: kurl - - secret: - collectorName: kotsadm-replicated-registry - name: kotsadm-replicated-registry # NOTE: this 
will not live under the kots/ directory like other collectors - includeValue: false - key: .dockerconfigjson - - logs: - collectorName: rook-ceph-logs - namespace: rook-ceph - name: kots/rook - - exec: - collectorName: weave-status - command: - - /home/weave/weave - args: - - --local - - status - containerName: weave - exclude: "" - name: kots/kurl/weave - namespace: kube-system - selector: - - name=weave-net - timeout: 10s - - exec: - collectorName: weave-report - command: - - /home/weave/weave - args: - - --local - - report - containerName: weave - exclude: "" - name: kots/kurl/weave - namespace: kube-system - selector: - - name=weave-net - timeout: 10s - - logs: - collectorName: weave-net - selector: - - name=weave-net - namespace: kube-system - name: kots/kurl/weave - - logs: - collectorName: minio - selector: - - app=minio - namespace: minio - name: kots/kurl/minio - - exec: - args: - - "http://goldpinger.kurl.svc.cluster.local:80/check_all" - collectorName: goldpinger-statistics - command: - - curl - containerName: kotsadm - name: kots/goldpinger - selector: - - app=kotsadm - timeout: 10s - - copyFromHost: - collectorName: kurl-host-preflights - name: kots/kurl/host-preflights - hostPath: /var/lib/kurl/host-preflights - extractArchive: true - image: alpine - imagePullPolicy: IfNotPresent - timeout: 1m - - configMap: - collectorName: coredns - name: coredns - namespace: kube-system - includeAllData: true - - configMap: - collectorName: kube-proxy - name: kube-proxy - namespace: kube-system - includeAllData: true - - configMap: - collectorName: kubeadm-config - name: kubeadm-config - namespace: kube-system - includeAllData: true - - configMap: - collectorName: kubelet-config - name: kubelet-config - namespace: kube-system - includeAllData: true - - configMap: - collectorName: kurl-config - name: kurl-config - namespace: kube-system - includeAllData: true - - configMap: - collectorName: weave-net - name: weave-net - namespace: kube-system - includeAllData: true - - configMap: - collectorName: ekco-config - name: ekco-config - namespace: kurl - includeAllData: true - - configMap: - collectorName: kurl-current-config - name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - - configMap: - collectorName: kurl-last-config - name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - - collectd: - collectorName: collectd - hostPath: /var/lib/collectd/rrd - image: alpine - imagePullPolicy: IfNotPresent - timeout: 5m - - logs: - collectorName: projectcontour-logs - namespace: projectcontour - name: projectcontour/logs - - http: - collectorName: replicated.app-health-check - get: - url: https://replicated.app/healthz - analyzers: - - containerRuntime: - outcomes: - - fail: - when: "== gvisor" - message: The Admin Console does not support using the gvisor runtime - - pass: - message: A supported container runtime is present on all nodes - - cephStatus: {} - - longhorn: {} - - clusterPodStatuses: - outcomes: - - fail: - when: "!= Healthy" - message: "Status: {{ .Status.Reason }}" - - statefulsetStatus: {} - - deploymentStatus: {} - - jobStatus: {} - - replicasetStatus: {} - - weaveReport: - reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt - - textAnalyze: - checkName: Weave Status - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt - outcomes: - - 
fail: - message: Weave is not ready - - pass: - message: Weave is ready - regex: 'Status: ready' - - textAnalyze: - checkName: Weave Report - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt - outcomes: - - fail: - message: Weave is not ready - - pass: - message: Weave is ready - regex: '"Ready": true' - - textAnalyze: - checkName: Weave IP Allocation - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt - outcomes: - - fail: - message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this. - - pass: - message: Weave is ready, there are no IP allocation issues. - regex: '"IP Allocation was seeded by different peers": false' - - textAnalyze: - checkName: Inter-pod Networking - exclude: "" - ignoreIfNoFiles: true - fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt - outcomes: - - fail: - when: "OK = false" - message: Some nodes have pod communication issues - - pass: - message: Goldpinger can communicate properly - regexGroups: '"OK": ?(?P\w+)' - - nodeResources: - checkName: Node status check - outcomes: - - fail: - when: "nodeCondition(Ready) == False" - message: "Not all nodes are online." - - fail: - when: "nodeCondition(Ready) == Unknown" - message: "Not all nodes are online." - - pass: - message: "All nodes are online." - - clusterPodStatuses: - checkName: contour pods unhealthy - namespaces: - - projectcontour - outcomes: - - fail: - when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready. - message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue. - - textAnalyze: - checkName: longhorn multipath conflict - exclude: "" - ignoreIfNoFiless: true - fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log - outcomes: - - fail: - when: "true" - uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/" - message: "Longhorn volumes may be in use by system multipath." - - pass: - when: "false" - message: "No block-device conflicts detected" - regex: '.*is apparently in use by the system;.*' - - textAnalyze: - checkName: Minio disk full - fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log - regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*' - outcomes: - - fail: - when: "true" - message: "Minio Disk Full" - - pass: - when: "false" - message: "Minio Disk Ok" - - textAnalyze: - checkName: Known issue with Rook < 1.4 - exclude: "" - ignoreIfNoFiles: true - fileName: /ceph/status.json - regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"' - outcomes: - - fail: - when: "true" - message: "If you have been removing and adding nodes then, you might want ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099" - - pass: - when: "false" - message: "You are not using a Rook versions < 1.4 and/or your Ceph status is OK" - - textAnalyze: - checkName: Rook rbd filesystem consistency - fileName: /kots/rook/rook-ceph-agent-*.log - regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.' 
- outcomes: - - fail: - when: "true" - message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention" - - pass: - when: "false" - message: "Rook filesystem consistency ok" - - jsonCompare: - checkName: https://replicated.app host health check - fileName: replicated.app-health-check.json - path: "response.status" - value: "200" - outcomes: - - fail: - when: "false" - message: https://replicated.app is unhealthy. License and software update checks from replicated will fail. If this is locked down environment, please check your proxy settings. - uri: https://kurl.sh/docs/install-with-kurl/proxy-installs - - pass: - when: "true" - message: https://replicated.app host is healthy diff --git a/in-cluster/default.yaml b/in-cluster/default.yaml index db4c52b..4df817d 100644 --- a/in-cluster/default.yaml +++ b/in-cluster/default.yaml @@ -3,14 +3,44 @@ kind: SupportBundle metadata: name: default spec: + uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml collectors: + - runPod: + name: ekco-resources + namespace: kurl + podSpec: + containers: + - name: inspect-ekco-pods + image: adamancini/netshoot + command: ["sh", "-c", "--"] + args: + [ + "kubectl get pod -n kurl --selector app=ekc-operator --field-selector status.phase=Running -o json | jq -r .items[]", + ] + restartPolicy: Never + dnsPolicy: ClusterFirst + serviceAccount: ekco + - copyFromHost: + collectorName: "copy apiserver audit logs" + image: alpine + hostPath: "/var/log/apiserver/" + name: "logs" + extractArchive: true + - copyFromHost: + collectorName: "copy kURL logs" + image: alpine + hostPath: "/var/log/kurl/" + name: "logs" + extractArchive: true - clusterInfo: {} - clusterResources: {} + - ceph: {} + - longhorn: {} - exec: # this is removable when we don't need to support kots <= 1.87 args: - "-U" - kotsadm - collectorName: kotsadm-postgres-db + collectorName: kotsadm-postgres-db-dump command: - pg_dump containerName: kotsadm-postgres @@ -51,6 +81,11 @@ spec: selector: - app=kotsadm-operator timeout: 10s + - logs: + collectorName: kurl-control-plane + name: kots/kurl/control-plane + selector: + - tier=control-plane - logs: collectorName: kotsadm-postgres-db name: kots/admin_console @@ -91,15 +126,142 @@ spec: name: kots/admin_console selector: - app=kotsadm-s3-ops + - logs: + collectorName: registry + name: kots/kurl + selector: + - app=registry + namespace: kurl + - logs: + collectorName: ekc-operator + name: kots/kurl + selector: + - app=ekc-operator + namespace: kurl + - secret: + collectorName: kotsadm-replicated-registry + name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors + includeValue: false + key: .dockerconfigjson - logs: collectorName: rook-ceph-logs namespace: rook-ceph name: kots/rook + - exec: + collectorName: weave-status + command: + - /home/weave/weave + args: + - --local + - status + containerName: weave + exclude: "" + name: kots/kurl/weave + namespace: kube-system + selector: + - name=weave-net + timeout: 10s + - exec: + collectorName: weave-report + command: + - /home/weave/weave + args: + - --local + - report + containerName: weave + exclude: "" + name: kots/kurl/weave + namespace: kube-system + selector: + - name=weave-net + timeout: 10s - logs: collectorName: rqlite-logs name: kots/rqlite/logs selector: - app=kotsadm-rqlite + - logs: + collectorName: weave-net + selector: + - name=weave-net + namespace: kube-system + name: kots/kurl/weave + - logs: + 
collectorName: minio + selector: + - app=minio + namespace: minio + name: kots/kurl/minio + - exec: + args: + - "http://goldpinger.kurl.svc.cluster.local:80/check_all" + collectorName: goldpinger-statistics + command: + - curl + containerName: kotsadm + name: kots/goldpinger + selector: + - app=kotsadm + timeout: 10s + - copyFromHost: + collectorName: kurl-host-preflights + name: kots/kurl/host-preflights + hostPath: /var/lib/kurl/host-preflights + extractArchive: true + image: alpine + imagePullPolicy: IfNotPresent + timeout: 1m + - configMap: + collectorName: coredns + name: coredns + namespace: kube-system + includeAllData: true + - configMap: + collectorName: kube-proxy + name: kube-proxy + namespace: kube-system + includeAllData: true + - configMap: + collectorName: kubeadm-config + name: kubeadm-config + namespace: kube-system + includeAllData: true + - configMap: + collectorName: kubelet-config + name: kubelet-config + namespace: kube-system + includeAllData: true + - configMap: + collectorName: kurl-config + name: kurl-config + namespace: kube-system + includeAllData: true + - configMap: + collectorName: weave-net + name: weave-net + namespace: kube-system + includeAllData: true + - configMap: + collectorName: ekco-config + name: ekco-config + namespace: kurl + includeAllData: true + - configMap: + collectorName: kurl-current-config + name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors + namespace: kurl + includeAllData: true + - configMap: + collectorName: kurl-last-config + name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors + namespace: kurl + includeAllData: true + - collectd: + collectorName: collectd + hostPath: /var/lib/collectd/rrd + image: alpine + imagePullPolicy: IfNotPresent + timeout: 5m - logs: collectorName: projectcontour-logs namespace: projectcontour @@ -113,16 +275,38 @@ spec: image: busybox:1 command: ["wget"] args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/status?pretty", "-O-"] - - secret: - collectorName: kotsadm-replicated-registry - name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors - includeValue: false - key: .dockerconfigjson - http: collectorName: replicated.app-health-check get: url: https://replicated.app/healthz analyzers: + - deploymentStatus: + checkName: Check EKCO is operational + name: ekc-operator + namespace: kurl + outcomes: + - fail: + when: absent + message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script + - fail: + when: "< 1" + message: EKCO does not have any Ready pods + - pass: + message: EKCO is installed and running + - textAnalyze: + checkName: Check installed EKCO version for critical fixes + fileName: ekco-resources/ekco-resources.log + regexGroups: '"image": "replicated/ekco:v(?P\d+)\.(?P\d+)\.(?P\d+)"' + outcomes: + - warn: + when: "Minor < 4" + message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version. + - warn: + when: "Minor < 19" + message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version. 
+ - pass: + when: "Minor > 20" + message: EKCO version is recent - containerRuntime: outcomes: - fail: @@ -130,6 +314,8 @@ spec: message: The Admin Console does not support using the gvisor runtime - pass: message: A supported container runtime is present on all nodes + - cephStatus: {} + - longhorn: {} - clusterPodStatuses: outcomes: - fail: @@ -139,6 +325,53 @@ spec: - deploymentStatus: {} - jobStatus: {} - replicasetStatus: {} + - weaveReport: + reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt + - textAnalyze: + checkName: Weave Status + exclude: "" + ignoreIfNoFiles: true + fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt + outcomes: + - fail: + message: Weave is not ready + - pass: + message: Weave is ready + regex: 'Status: ready' + - textAnalyze: + checkName: Weave Report + exclude: "" + ignoreIfNoFiles: true + fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt + outcomes: + - fail: + message: Weave is not ready + - pass: + message: Weave is ready + regex: '"Ready": true' + - textAnalyze: + checkName: Weave IP Allocation + exclude: "" + ignoreIfNoFiles: true + fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt + outcomes: + - fail: + message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this. + - pass: + message: Weave is ready, there are no IP allocation issues. + regex: '"IP Allocation was seeded by different peers": false' + - textAnalyze: + checkName: Inter-pod Networking + exclude: "" + ignoreIfNoFiles: true + fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt + outcomes: + - fail: + when: "OK = false" + message: Some nodes have pod communication issues + - pass: + message: Goldpinger can communicate properly + regexGroups: '"OK": ?(?P\w+)' - nodeResources: checkName: Node status check outcomes: @@ -158,6 +391,55 @@ spec: - fail: when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready. message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue. + - textAnalyze: + checkName: longhorn multipath conflict + exclude: "" + ignoreIfNoFiless: true + fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log + outcomes: + - fail: + when: "true" + uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/" + message: "Longhorn volumes may be in use by system multipath." 
+ - pass: + when: "false" + message: "No block-device conflicts detected" + regex: '.*is apparently in use by the system;.*' + - textAnalyze: + checkName: Minio disk full + fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log + regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*' + outcomes: + - fail: + when: "true" + message: "Minio Disk Full" + - pass: + when: "false" + message: "Minio Disk Ok" + - textAnalyze: + checkName: Known issue with Rook < 1.4 + exclude: "" + ignoreIfNoFiles: true + fileName: /ceph/status.json + regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"' + outcomes: + - fail: + when: "true" + message: "If you have been removing and adding nodes then, you might want ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099" + - pass: + when: "false" + message: "You are not using a Rook versions < 1.4 and/or your Ceph status is OK" + - textAnalyze: + checkName: Rook rbd filesystem consistency + fileName: /kots/rook/rook-ceph-agent-*.log + regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.' + outcomes: + - fail: + when: "true" + message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention" + - pass: + when: "false" + message: "Rook filesystem consistency ok" - jsonCompare: checkName: https://replicated.app host health check fileName: replicated.app-health-check.json diff --git a/in-cluster/ekco.yaml b/in-cluster/ekco.yaml deleted file mode 100644 index 6467f2f..0000000 --- a/in-cluster/ekco.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: ekco -spec: - uri: https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/ekco.yaml - collectors: - - runPod: - name: ekco-resources - namespace: kurl - podSpec: - containers: - - name: inspect-ekco-pods - image: adamancini/netshoot - command: ["sh", "-c", "--"] - args: - [ - "kubectl get pod -n kurl --selector app=ekc-operator --field-selector status.phase=Running -o json | jq -r .items[]", - ] - restartPolicy: Never - dnsPolicy: ClusterFirst - serviceAccount: ekco - analyzers: - - deploymentStatus: - checkName: Check EKCO is operational - name: ekc-operator - namespace: kurl - outcomes: - - fail: - when: absent - message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script - - fail: - when: "< 1" - message: EKCO does not have any Ready pods - - pass: - message: EKCO is installed and running - - textAnalyze: - checkName: Check installed EKCO version for critical fixes - fileName: ekco-resources/ekco-resources.log - regexGroups: '"image": "replicated/ekco:v(?P\d+)\.(?P\d+)\.(?P\d+)"' - outcomes: - - warn: - when: "Minor < 4" - message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version. - - warn: - when: "Minor < 19" - message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version. 
- - pass: - when: "Minor > 20" - message: EKCO version is recent diff --git a/in-cluster/networking-issues-kurl.yaml b/in-cluster/networking-issues-kurl.yaml deleted file mode 100644 index b9c03a3..0000000 --- a/in-cluster/networking-issues-kurl.yaml +++ /dev/null @@ -1,167 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: networking-issues-kurl -spec: - collectors: - - copyFromHost: - collectorName: "copy kURL logs" - image: busybox:1 - hostPath: "/var/log/kurl/" - name: "logs" - extractArchive: true - - logs: - collectorName: kurl-control-plane - name: kots/kurl/control-plane - selector: - - tier=control-plane - - logs: - collectorName: kurl-proxy-kotsadm - name: kots/admin_console - selector: - - app=kurl-proxy-kotsadm - - logs: - collectorName: ekc-operator - name: kots/kurl - selector: - - app=ekc-operator - namespace: kurl - - exec: - collectorName: weave-status - command: - - /home/weave/weave - args: - - --local - - status - containerName: weave - exclude: "" - name: kots/kurl/weave - namespace: kube-system - selector: - - name=weave-net - timeout: 10s - - exec: - collectorName: weave-report - command: - - /home/weave/weave - args: - - --local - - report - containerName: weave - exclude: "" - name: kots/kurl/weave - namespace: kube-system - selector: - - name=weave-net - timeout: 10s - - logs: - collectorName: weave-net - selector: - - name=weave-net - namespace: kube-system - name: kots/kurl/weave - - exec: - args: - - "http://goldpinger.kurl.svc.cluster.local:80/check_all" - collectorName: goldpinger-statistics - command: - - curl - containerName: kotsadm - name: kots/goldpinger - selector: - - app=kotsadm - timeout: 10s - - copyFromHost: - collectorName: kurl-host-preflights - name: kots/kurl/host-preflights - hostPath: /var/lib/kurl/host-preflights - extractArchive: true - image: alpine - imagePullPolicy: IfNotPresent - timeout: 1m - - configMap: - collectorName: kurl-current-config - name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - - configMap: - collectorName: kurl-last-config - name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - analyzers: - - clusterPodStatuses: - outcomes: - - fail: - when: "!= Healthy" - message: "Status: {{ .Status.Reason }}" - - statefulsetStatus: {} - - deploymentStatus: {} - - jobStatus: {} - - replicasetStatus: {} - - weaveReport: - reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt - - textAnalyze: - checkName: Weave Status - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt - outcomes: - - fail: - message: Weave is not ready - - pass: - message: Weave is ready - regex: 'Status: ready' - - textAnalyze: - checkName: Weave Report - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt - outcomes: - - fail: - message: Weave is not ready - - pass: - message: Weave is ready - regex: '"Ready": true' - - textAnalyze: - checkName: Weave IP Allocation - exclude: "" - ignoreIfNoFiles: true - fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt - outcomes: - - fail: - message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this. - - pass: - message: Weave is ready, there are no IP allocation issues. 
- regex: '"IP Allocation was seeded by different peers": false' - - textAnalyze: - checkName: Inter-pod Networking - exclude: "" - ignoreIfNoFiles: true - fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt - outcomes: - - fail: - when: "OK = false" - message: Some nodes have pod communication issues - - pass: - message: Goldpinger can communicate properly - regexGroups: '"OK": ?(?P\w+)' - - nodeResources: - checkName: Node status check - outcomes: - - fail: - when: "nodeCondition(Ready) == False" - message: "Not all nodes are online." - - fail: - when: "nodeCondition(Ready) == Unknown" - message: "Not all nodes are online." - - pass: - message: "All nodes are online." - - clusterPodStatuses: - checkName: contour pods unhealthy - namespaces: - - projectcontour - outcomes: - - fail: - when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready. - message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue. diff --git a/in-cluster/no-exec.yaml b/in-cluster/no-exec.yaml deleted file mode 100644 index accbb00..0000000 --- a/in-cluster/no-exec.yaml +++ /dev/null @@ -1,114 +0,0 @@ -apiVersion: troubleshoot.sh/v1beta2 -kind: SupportBundle -metadata: - name: default -spec: - collectors: - - clusterInfo: {} - - clusterResources: {} - - logs: - collectorName: kotsadm-postgres-db - name: kots/admin_console - selector: - - app=kotsadm-postgres - - logs: - collectorName: kotsadm-api - name: kots/admin_console - selector: - - app=kotsadm-api - - logs: - collectorName: kotsadm-operator - name: kots/admin_console - selector: - - app=kotsadm-operator - - logs: - collectorName: kotsadm - name: kots/admin_console - selector: - - app=kotsadm - - logs: - collectorName: kurl-proxy-kotsadm - name: kots/admin_console - selector: - - app=kurl-proxy-kotsadm - - logs: - collectorName: kotsadm-dex - name: kots/admin_console - selector: - - app=kotsadm-dex - - logs: - collectorName: kotsadm-fs-minio - name: kots/admin_console - selector: - - app=kotsadm-fs-minio - - logs: - collectorName: kotsadm-s3-ops - name: kots/admin_console - selector: - - app=kotsadm-s3-ops - - logs: - collectorName: registry - name: kots/kurl - selector: - - app=registry - namespace: kurl - - logs: - collectorName: ekc-operator - name: kots/kurl - selector: - - app=ekc-operator - namespace: kurl - - secret: - collectorName: kotsadm-replicated-registry - name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors - includeValue: false - key: .dockerconfigjson - - logs: - collectorName: rook-ceph-logs - namespace: rook-ceph - name: kots/rook - - logs: - collectorName: weave-net - selector: - - name=weave-net - namespace: kube-system - name: kots/kurl/weave - - configMap: - collectorName: kurl-current-config - name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - - configMap: - collectorName: kurl-last-config - name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors - namespace: kurl - includeAllData: true - analyzers: - - clusterPodStatuses: - outcomes: - - fail: - when: "!= Healthy" - message: "Status: {{ .Status.Reason }}" - - statefulsetStatus: {} - - deploymentStatus: {} - - jobStatus: {} - - replicasetStatus: {} - - nodeResources: - checkName: Node status check - outcomes: - - fail: - 
when: "nodeCondition(Ready) == False" - message: "Not all nodes are online." - - fail: - when: "nodeCondition(Ready) == Unknown" - message: "Not all nodes are online." - - pass: - message: "All nodes are online." - - clusterPodStatuses: - checkName: contour pods unhealthy - namespaces: - - projectcontour - outcomes: - - fail: - when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready. - message: A Contour pod, {{ .Name }}, is unhealthy with a status of {{ .Status.Reason }}. Restarting the pod may fix the issue.