From e487a11d60809baa9cc6c9a782efc74946229311 Mon Sep 17 00:00:00 2001
From: Andrew Durbin
Date: Fri, 9 Aug 2024 16:27:29 -0600
Subject: [PATCH] Edge Node Clusters: nodes, volumes, replicas, upgrades

Node Info: role, age, ip
Pod Info: status, restarts, ip
Volume and Replica Info: rebuild progress, health
Upgrade Status: cluster wide and node components

Signed-off-by: Andrew Durbin
---
 proto/info/edge_node_cluster.proto | 267 +++++++++++++++++++++++++++++
 proto/info/error.proto             |  62 +++++++
 proto/info/info.proto              |  55 +-----
 proto/profile/local_profile.proto  |   1 +
 4 files changed, 333 insertions(+), 52 deletions(-)
 create mode 100644 proto/info/edge_node_cluster.proto
 create mode 100644 proto/info/error.proto

diff --git a/proto/info/edge_node_cluster.proto b/proto/info/edge_node_cluster.proto
new file mode 100644
index 00000000..62a97e74
--- /dev/null
+++ b/proto/info/edge_node_cluster.proto
@@ -0,0 +1,267 @@
+// Copyright(c) 2024 Zededa, Inc.
+// All rights reserved.
+
+syntax = "proto3";
+
+package org.lfedge.eve.info;
+
+import "info/error.proto";
+
+option go_package = "github.com/lf-edge/eve-api/go/info";
+option java_package = "org.lfedge.eve.info";
+
+import "google/protobuf/timestamp.proto";
+
+// KubeNodeConditionType follows Kubernetes Node Conditions
+// Refer here: https://pkg.go.dev/k8s.io/api/core/v1#NodeConditionType
+enum KubeNodeConditionType {
+  KUBE_NODE_CONDITION_TYPE_READY = 0; // NOTE(review): zero is the proto3 default, so an unset type reads as READY
+  KUBE_NODE_CONDITION_TYPE_MEMORY_PRESSURE = 1;
+  KUBE_NODE_CONDITION_TYPE_DISK_PRESSURE = 2;
+  KUBE_NODE_CONDITION_TYPE_PID_PRESSURE = 3;
+  KUBE_NODE_CONDITION_TYPE_NETWORK_UNAVAIL = 4;
+}
+
+message KubeNodeCondition { // a node condition type together with its boolean state
+  // Type of the condition
+  KubeNodeConditionType type = 1;
+
+  // Condition state
+  bool set = 2;
+}
+
+message KubeNodeInfo { // details of one node in the cluster
+  // Name of the node, will match device name
+  string name = 1;
+
+  // Status of the node, reported as a list of conditions
+  repeated KubeNodeCondition conditions = 2;
+
+  // Whether the role of the node is server
+  bool role_server = 3;
+
+  // Creation Time of the node in the cluster
+  google.protobuf.Timestamp creation_timestamp = 4;
+
+  // Version of the API Server running on the node
+  string api_server_version = 5;
+
+  // Internal IP address of the node
+  string internal_ip = 6;
+
+  bool schedulable = 7; // NOTE(review): undocumented; presumably whether the node accepts new pods - confirm
+}
+
+// StorageHealthStatus is a higher level tracking status to show redundancy/failure-zone level
+// and rebuild progress.
+enum StorageHealthStatus {
+  STORAGE_HEALTH_STATUS_UNKNOWN = 0;
+  STORAGE_HEALTH_STATUS_HEALTHY = 1;
+  STORAGE_HEALTH_STATUS_DEGRADED_2_REPLICA_AVAILABLE_REPLICATING = 2;
+  STORAGE_HEALTH_STATUS_DEGRADED_2_REPLICA_AVAILABLE_NOT_REPLICATING = 3;
+  STORAGE_HEALTH_STATUS_DEGRADED_1_REPLICA_AVAILABLE_REPLICATING = 4;
+  STORAGE_HEALTH_STATUS_DEGRADED_1_REPLICA_AVAILABLE_NOT_REPLICATING = 5;
+  STORAGE_HEALTH_STATUS_FAILED = 6;
+}
+
+// StorageVolumeState is the kubernetes 'state' field of a replicated csi-driver volume.
+// Refer to: https://github.com/longhorn/longhorn-manager/blob/v1.6.2/k8s/pkg/apis/longhorn/v1beta1/volume.go#L14
+enum StorageVolumeState {
+  STORAGE_VOLUME_STATE_UNKNOWN = 0;
+  STORAGE_VOLUME_STATE_CREATING = 1;
+  STORAGE_VOLUME_STATE_ATTACHED = 2;
+  STORAGE_VOLUME_STATE_DETACHED = 3;
+  STORAGE_VOLUME_STATE_ATTACHING = 4;
+  STORAGE_VOLUME_STATE_DETACHING = 5;
+  STORAGE_VOLUME_STATE_DELETING = 6;
+}
+
+// StorageVolumeRobustness is the 'robustness' of a replicated csi-driver volume.
+// Refer to: https://github.com/longhorn/longhorn-manager/blob/v1.6.2/k8s/pkg/apis/longhorn/v1beta1/volume.go#L25
+enum StorageVolumeRobustness {
+  STORAGE_VOLUME_ROBUSTNESS_UNKNOWN = 0;
+  STORAGE_VOLUME_ROBUSTNESS_HEALTHY = 1;
+  STORAGE_VOLUME_ROBUSTNESS_DEGRADED = 2;
+  STORAGE_VOLUME_ROBUSTNESS_FAULTED = 3;
+}
+
+// StorageVolumePVCStatus is the kubernetes 'phase' of a PVC. Listed as status in cli.
+// Refer to: https://kubernetes.io/docs/concepts/storage/persistent-volumes#phase
+enum StorageVolumePVCStatus {
+  STORAGE_VOLUME_PVC_STATUS_UNKNOWN = 0;
+  STORAGE_VOLUME_PVC_STATUS_BOUND = 1;
+  STORAGE_VOLUME_PVC_STATUS_PENDING = 2; // Accepted but not yet scheduled
+  STORAGE_VOLUME_PVC_STATUS_AVAILABLE = 3;
+  STORAGE_VOLUME_PVC_STATUS_RELEASED = 4;
+  STORAGE_VOLUME_PVC_STATUS_FAILED = 5;
+}
+
+// StorageVolumeReplicaStatus is a higher level status which combines replica and engine
+// status to show a simplified view of a replica rebuild state.
+enum StorageVolumeReplicaStatus {
+  STORAGE_VOLUME_REPLICA_STATUS_UNKNOWN = 0;
+  STORAGE_VOLUME_REPLICA_STATUS_REBUILDING = 1;
+  STORAGE_VOLUME_REPLICA_STATUS_ONLINE = 2;
+  STORAGE_VOLUME_REPLICA_STATUS_FAILED = 3; // Replacement/Rebuilt replica not yet scheduled.
+}
+
+message KubePodNameSpaceInfo { // pod counts for one namespace
+  // Name of the namespace
+  string name = 1;
+
+  // Number of pods in the namespace
+  uint32 pod_count = 2;
+
+  // Number of pods in the namespace that are running
+  uint32 pod_running_count = 3;
+
+  // Number of pods in the namespace that are pending
+  uint32 pod_pending_count = 4;
+
+  // Number of pods in the namespace that are failed
+  uint32 pod_failed_count = 5;
+
+  // Number of pods in the namespace that are succeeded
+  uint32 pod_succeeded_count = 6;
+}
+
+enum KubePodStatus { // status values reported for a pod
+  KUBE_POD_STATUS_UNKNOWN = 0;
+  KUBE_POD_STATUS_PENDING = 1;
+  KUBE_POD_STATUS_RUNNING = 2;
+  KUBE_POD_STATUS_SUCCEEDED = 3;
+  KUBE_POD_STATUS_CONTAINER_CREATING = 4;
+  KUBE_POD_STATUS_CRASHLOOP_BACKOFF = 5;
+  KUBE_POD_STATUS_ERROR = 6;
+  KUBE_POD_STATUS_EVICTED = 7;
+  KUBE_POD_STATUS_FAILED = 8;
+}
+
+message KubeEVEAppPodInfo { // pod details of one EVE application
+  // Name of the EVE application
+  string name = 1;
+
+  // Application Status
+  KubePodStatus status = 2;
+
+  // Restart count of the application
+  uint32 restart_count = 3;
+
+  // Restart time of the application, as an absolute timestamp (not an elapsed "seconds ago" value)
+  google.protobuf.Timestamp restart_time = 4;
+
+  // Creation Time of the application
+  google.protobuf.Timestamp creation_timestamp = 5;
+
+  // IP address of the application, on cni0 interface
+  string ip_address = 6;
+
+  // Node name on which the application is running
+  string node_name = 7;
+}
+
+message KubeVolumeReplicaInfo { // one replica of a volume
+  // Name of the volume replica
+  string name = 1;
+
+  // Node replica resides on, will match node name
+  string owner_node = 2;
+
+  // Rebuild progress of the volume replica, as a percentage
+  uint32 rebuild_progress_percentage = 3;
+
+  // Replica status
+  StorageVolumeReplicaStatus status = 4;
+}
+
+message KubeVolumeInfo { // one volume and its replicas
+  // Name of the volume
+  string name = 1;
+
+  // Status of the volume
+  StorageVolumeState state = 2;
+
+  // Robustness of the volume
+  StorageVolumeRobustness robustness = 3;
+
+  // Creation Time of the volume in the cluster
+  google.protobuf.Timestamp creation_timestamp = 4;
+
+  // Provisioned size of the volume in bytes
+  uint64 provisioned_bytes = 5;
+
+  // Allocated size of the volume in bytes
+  uint64 allocated_bytes = 6;
+
+  // PV/PVC status of the volume
+  StorageVolumePVCStatus pvc_status = 7;
+
+  // Replicas of the volume
+  repeated KubeVolumeReplicaInfo replica = 8;
+}
+
+message KubeStorageInfo { // cluster storage summary
+  // Overall status of Longhorn
+  StorageHealthStatus health = 1;
+
+  // Time of the most recent health status transition
+  google.protobuf.Timestamp transition_time = 2;
+
+  // Status of all the volumes in Longhorn
+  repeated KubeVolumeInfo volumes = 3;
+}
+
+// KubeCompUpgradeStatus will track status of each
+// KubeComp which will upgrade serially in a cluster.
+enum KubeCompUpgradeStatus {
+  KUBE_COMP_UPGRADE_STATUS_UNKNOWN = 0;
+  KUBE_COMP_UPGRADE_STATUS_DOWNLOAD = 1;
+  KUBE_COMP_UPGRADE_STATUS_DOWNLOAD_FAILED = 2;
+  KUBE_COMP_UPGRADE_STATUS_IN_PROGRESS = 3;
+  KUBE_COMP_UPGRADE_STATUS_FAILED = 4;
+  KUBE_COMP_UPGRADE_STATUS_COMPLETED = 5;
+}
+
+// KubeComp is a component installed in eve after usb install.
+// These are provided to show more detail on cluster upgrade progress.
+enum KubeComp {
+  KUBE_COMP_UNKNOWN = 0;
+  KUBE_COMP_CONTAINERD = 1; // every node will publish
+  KUBE_COMP_K3S = 2; // every node will publish
+  KUBE_COMP_MULTUS = 3; // only the first node to upgrade eve-os will publish this and the components below
+  KUBE_COMP_KUBEVIRT = 4;
+  KUBE_COMP_CDI = 5;
+  KUBE_COMP_LONGHORN = 6;
+}
+
+message KubeClusterUpgradeStatus { // where the cluster upgrade currently stands
+  // Node currently in an upgrade; empty when no node is upgrading
+  string current_node = 1;
+
+  // Component currently under upgrade; KUBE_COMP_UNKNOWN when no upgrades are in progress
+  KubeComp component = 2;
+
+  // Status of the component currently under upgrade;
+  // KUBE_COMP_UPGRADE_STATUS_UNKNOWN when no upgrades are in progress
+  KubeCompUpgradeStatus status = 3;
+
+  // Error info in case of failure
+  ErrorInfo error = 4;
+}
+
+message KubeClusterInfo { // top-level cluster report
+  // List of cluster nodes
+  repeated KubeNodeInfo nodes = 1;
+
+  // Per-namespace pod summaries
+  repeated KubePodNameSpaceInfo pod_name_spaces = 2;
+
+  // List of EVE applications
+  repeated KubeEVEAppPodInfo eve_apps = 3;
+
+  // Cluster storage info
+  KubeStorageInfo storage = 4;
+
+  // The current status of the cluster upgrade
+  KubeClusterUpgradeStatus upgrade = 5;
+}
diff --git a/proto/info/error.proto b/proto/info/error.proto
new file mode 100644
index 00000000..478a8dae
--- /dev/null
+++ b/proto/info/error.proto
@@ -0,0 +1,62 @@
+// Copyright(c) 2024 Zededa, Inc.
+// All rights reserved.
+
+syntax = "proto3";
+
+package org.lfedge.eve.info;
+option go_package = "github.com/lf-edge/eve-api/go/info";
+option java_package = "org.lfedge.eve.info";
+
+import "google/protobuf/timestamp.proto";
+
+// Entity contains the entity type
+enum Entity {
+  // Invalid Device Entity
+  ENTITY_UNSPECIFIED = 0;
+  // Base OS entity
+  ENTITY_BASE_OS = 1;
+  // System Adapter Entity
+  ENTITY_SYSTEM_ADAPTER = 2;
+  // Vault Entity
+  ENTITY_VAULT = 3;
+  // Attestation Entity
+  ENTITY_ATTESTATION = 4;
+  // App Instance Entity
+  ENTITY_APP_INSTANCE = 5;
+  // Port Entity
+  ENTITY_PORT = 6;
+  // Network Entity
+  ENTITY_NETWORK = 7;
+  // Network Instance Entity
+  ENTITY_NETWORK_INSTANCE = 8;
+  // ContentTree Entity
+  ENTITY_CONTENT_TREE = 9;
+  // Blob Entity
+  ENTITY_CONTENT_BLOB = 10;
+  // VOLUME Entity
+  ENTITY_VOLUME = 11;
+}
+
+// Severity tells the severity type
+enum Severity {
+  SEVERITY_UNSPECIFIED = 0; // severity unspecified
+  SEVERITY_NOTICE = 1; // severity notice
+  SEVERITY_WARNING = 2; // severity warning
+  SEVERITY_ERROR = 3; // severity error
+}
+
+// DeviceEntity contains the device entity details
+message DeviceEntity {
+  Entity entity = 1; // entity type
+  string entity_id = 2; // entity uuid
+  string entity_name = 3; // entity name
+}
+
+// Errors in response to the application of configuration
+message ErrorInfo {
+  string description = 1; // description of the error
+  google.protobuf.Timestamp timestamp = 2; // Timestamp at which error had occurred
+  Severity severity = 3; // Severity of the error
+  repeated DeviceEntity entities = 4; // objects referenced by the description or retry_condition
+  string retry_condition = 5; // condition to retry
+}
diff --git a/proto/info/info.proto b/proto/info/info.proto
index 2a0672e5..e4419ec5 100644
--- a/proto/info/info.proto
+++ b/proto/info/info.proto
@@ -12,6 +12,8 @@ import "evecommon/devmodelcommon.proto";
 import "evecommon/evecommon.proto";
 import "info/patch_envelope.proto";
 import "info/ntpsources.proto";
+import "info/error.proto"; +import "info/edge_node_cluster.proto"; // Deprecated: see deprecatedMetricItem below enum DepMetricItemType { @@ -233,58 +235,6 @@ message ZInfoSW { string imageName = 9; // Name of the disk image } -// Errors in response to the application of configuration -message ErrorInfo { - string description = 1; - google.protobuf.Timestamp timestamp = 2; // Timestamp at which error had occurred - Severity severity = 3; // Severity of the error - repeated DeviceEntity entities = 4; // objects referenced by the description or retry_condition - string retry_condition = 5; // condition to retry -} - -// DeviceEntity contains the device entity details -message DeviceEntity { - Entity entity = 1; // entity type - string entity_id = 2; // entity uuid - string entity_name = 3; // entity name -} - -// Entity contains the entity type -enum Entity { - // Invalid Device Entity - ENTITY_UNSPECIFIED = 0; - // Base OS entity - ENTITY_BASE_OS = 1; - // System Adapter Entity - ENTITY_SYSTEM_ADAPTER = 2; - // Vault Entity - ENTITY_VAULT = 3; - // Attestation Entity - ENTITY_ATTESTATION = 4; - // App Instance Entity - ENTITY_APP_INSTANCE = 5; - // Port Entity - ENTITY_PORT = 6; - // Network Entity - ENTITY_NETWORK = 7; - // Network Instance Entity - ENTITY_NETWORK_INSTANCE = 8; - // ContentTree Entity - ENTITY_CONTENT_TREE = 9; - // Blob Entity - ENTITY_CONTENT_BLOB = 10; - // VOLUME Entity - ENTITY_VOLUME = 11; -} - -// Severity tells the severity type -enum Severity { - SEVERITY_UNSPECIFIED = 0; // severity unspecified - SEVERITY_NOTICE = 1; // severity notice - SEVERITY_WARNING = 2; // severity warning - SEVERITY_ERROR = 3; // severity error -} - enum HwSecurityModuleStatus { UNKNOWN = 0; //HSM Status is not known NOTFOUND = 1; //No HSM found @@ -1233,6 +1183,7 @@ message ZInfoMsg { // 21 reserved ZInfoClusterNode cluster_node = 22; ZInfoNTPSources ntp_sources = 23; + KubeClusterInfo cluster_info = 24; } google.protobuf.Timestamp atTimeStamp = 6; } diff --git 
a/proto/profile/local_profile.proto b/proto/profile/local_profile.proto index a937f6fe..76b71d59 100644 --- a/proto/profile/local_profile.proto +++ b/proto/profile/local_profile.proto @@ -4,6 +4,7 @@ syntax = "proto3"; import "info/info.proto"; +import "info/error.proto"; import "metrics/metrics.proto"; import "google/protobuf/timestamp.proto";