From bf4746bbf505a77d23f4d57a7769b235153522c3 Mon Sep 17 00:00:00 2001 From: shreddedbacon Date: Fri, 30 Sep 2022 09:29:02 +1000 Subject: [PATCH 01/10] chore: split harbor out a bit, and continue on errors in a single namespace to prevent blocking --- internal/harbor/harbor.go | 55 +++ internal/harbor/harbor_credentialrotation.go | 137 ++++++++ internal/harbor/harbor_helpers.go | 150 ++++++++ ...gration_test.go => harbor_helpers_test.go} | 0 internal/harbor/harborintegration.go | 322 ------------------ 5 files changed, 342 insertions(+), 322 deletions(-) create mode 100644 internal/harbor/harbor.go create mode 100644 internal/harbor/harbor_credentialrotation.go create mode 100644 internal/harbor/harbor_helpers.go rename internal/harbor/{harborintegration_test.go => harbor_helpers_test.go} (100%) delete mode 100644 internal/harbor/harborintegration.go diff --git a/internal/harbor/harbor.go b/internal/harbor/harbor.go new file mode 100644 index 00000000..9c8618bf --- /dev/null +++ b/internal/harbor/harbor.go @@ -0,0 +1,55 @@ +package harbor + +import ( + "time" + + "github.com/go-logr/logr" + harborclientv3 "github.com/mittwald/goharbor-client/v3/apiv2" + + harborclientv5 "github.com/mittwald/goharbor-client/v5/apiv2" + "github.com/mittwald/goharbor-client/v5/apiv2/pkg/config" + + ctrl "sigs.k8s.io/controller-runtime" +) + +// Harbor defines a harbor struct +type Harbor struct { + URL string + Hostname string + API string + Username string + Password string + Log logr.Logger + ClientV3 *harborclientv3.RESTClient + ClientV5 *harborclientv5.RESTClient + DeleteDisabled bool + WebhookAddition bool + RobotPrefix string + ExpiryInterval time.Duration + RotateInterval time.Duration + RobotAccountExpiry time.Duration + ControllerNamespace string + NamespacePrefix string + RandomNamespacePrefix bool + WebhookURL string + WebhookEventTypes []string + LagoonTargetName string + Config *config.Options +} + +// New create a new harbor connection. +func New(harbor Harbor) (*Harbor, error) { + harbor.Log = ctrl.Log.WithName("controllers").WithName("HarborIntegration") + c, err := harborclientv3.NewRESTClientForHost(harbor.API, harbor.Username, harbor.Password) + if err != nil { + return nil, err + } + harbor.ClientV3 = c + harbor.Config = &config.Options{} + c2, err := harborclientv5.NewRESTClientForHost(harbor.API, harbor.Username, harbor.Password, harbor.Config) + if err != nil { + return nil, err + } + harbor.ClientV5 = c2 + return &harbor, nil +} diff --git a/internal/harbor/harbor_credentialrotation.go b/internal/harbor/harbor_credentialrotation.go new file mode 100644 index 00000000..18e8cc05 --- /dev/null +++ b/internal/harbor/harbor_credentialrotation.go @@ -0,0 +1,137 @@ +package harbor + +import ( + "fmt" + "sort" + + "context" + "time" + + lagoonv1beta1 "github.com/uselagoon/remote-controller/apis/lagoon/v1beta1" + "github.com/uselagoon/remote-controller/internal/helpers" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// RotateRobotCredentials will attempt to recreate any robot account credentials that need to be rotated. 
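+// It lists every namespace carrying the lagoon.sh/environmentType label and, for each one,
+// gets or creates the Harbor project and robot account credentials as required, then updates the
+// lagoon-internal-registry-secret in that namespace. Errors encountered for a single namespace
+// generally result in that namespace being skipped (continue) so that one failure does not block
+// credential rotation for the remaining namespaces.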
+func (h *Harbor) RotateRobotCredentials(ctx context.Context, cl client.Client) { + opLog := ctrl.Log.WithName("handlers").WithName("RotateRobotCredentials") + namespaces := &corev1.NamespaceList{} + labelRequirements, _ := labels.NewRequirement("lagoon.sh/environmentType", selection.Exists, nil) + // @TODO: do this later so we can only run robot credentials for specific controllers + // labelRequirements2, _ := labels.NewRequirement("lagoon.sh/controller", selection.Equals, []string{h.ControllerNamespace}) + listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ + client.MatchingLabelsSelector{ + Selector: labels.NewSelector().Add(*labelRequirements), + // @TODO: do this later so we can only run robot credentials for specific controllers + // Selector: labels.NewSelector().Add(*labelRequirements).Add(*labelRequirements2), + }, + }) + if err := cl.List(ctx, namespaces, listOption); err != nil { + opLog.Error(err, fmt.Sprintf("Unable to list namespaces created by Lagoon, there may be none or something went wrong")) + return + } + // go over every namespace that has a lagoon.sh label + // and attempt to create and update the robot account credentials as requred. + for _, ns := range namespaces.Items { + if ns.Status.Phase == corev1.NamespaceTerminating { + // if the namespace is terminating, don't try to renew the robot credentials + opLog.Info(fmt.Sprintf("Namespace %s is being terminated, aborting robot credentials check", ns.ObjectMeta.Name)) + continue + } + opLog.Info(fmt.Sprintf("Checking if %s needs robot credentials rotated", ns.ObjectMeta.Name)) + // check for running builds! + lagoonBuilds := &lagoonv1beta1.LagoonBuildList{} + listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ + client.InNamespace(ns.ObjectMeta.Name), + client.MatchingLabels(map[string]string{ + "lagoon.sh/controller": h.ControllerNamespace, // created by this controller + }), + }) + if err := cl.List(context.Background(), lagoonBuilds, listOption); err != nil { + opLog.Error(err, fmt.Sprintf("Unable to list Lagoon build pods, there may be none or something went wrong")) + continue + } + runningBuilds := false + sort.Slice(lagoonBuilds.Items, func(i, j int) bool { + return lagoonBuilds.Items[i].ObjectMeta.CreationTimestamp.After(lagoonBuilds.Items[j].ObjectMeta.CreationTimestamp.Time) + }) + // if there are any builds pending or running, don't try and refresh the credentials as this + // could break the build + if len(lagoonBuilds.Items) > 0 { + if helpers.ContainsString( + helpers.BuildRunningPendingStatus, + lagoonBuilds.Items[0].Labels["lagoon.sh/buildStatus"], + ) { + runningBuilds = true + } + } + if !runningBuilds { + // only continue if there isn't any running builds + robotCreds := &helpers.RegistryCredentials{} + curVer, err := h.GetHarborVersion(ctx) + if err != nil { + // @TODO: resource unknown + opLog.Error(err, "error checking harbor version") + continue + } + if h.UseV2Functions(curVer) { + hProject, err := h.CreateProjectV2(ctx, ns.Labels["lagoon.sh/project"]) + if err != nil { + // @TODO: resource unknown + opLog.Error(err, "error getting or creating project") + break + } + time.Sleep(1 * time.Second) // wait 1 seconds + robotCreds, err = h.CreateOrRefreshRobotV2(ctx, + cl, + hProject, + ns.Labels["lagoon.sh/environment"], + ns.ObjectMeta.Name, + h.RobotAccountExpiry) + if err != nil { + opLog.Error(err, "error getting or creating robot account") + continue + } + } else { + hProject, err := h.CreateProject(ctx, ns.Labels["lagoon.sh/project"]) + if err != nil { 
+ // @TODO: resource unknown + opLog.Error(err, "error getting or creating project") + continue + } + time.Sleep(1 * time.Second) // wait 1 seconds + robotCreds, err = h.CreateOrRefreshRobot(ctx, + cl, + hProject, + ns.Labels["lagoon.sh/environment"], + ns.ObjectMeta.Name, + time.Now().Add(h.RobotAccountExpiry).Unix()) + if err != nil { + opLog.Error(err, "error getting or creating robot account") + continue + } + } + time.Sleep(1 * time.Second) // wait 1 seconds + if robotCreds != nil { + // if we have robotcredentials to create, do that here + if err := UpsertHarborSecret(ctx, + cl, + ns.ObjectMeta.Name, + "lagoon-internal-registry-secret", //secret name in kubernetes + h.Hostname, + robotCreds); err != nil { + opLog.Error(err, "error creating or updating robot account credentials") + continue + } + opLog.Info(fmt.Sprintf("Robot credentials rotated for %s", ns.ObjectMeta.Name)) + } + } else { + opLog.Info(fmt.Sprintf("There are running or pending builds in %s, skipping", ns.ObjectMeta.Name)) + } + } +} diff --git a/internal/harbor/harbor_helpers.go b/internal/harbor/harbor_helpers.go new file mode 100644 index 00000000..3d054f07 --- /dev/null +++ b/internal/harbor/harbor_helpers.go @@ -0,0 +1,150 @@ +package harbor + +import ( + "fmt" + "strings" + + "context" + "encoding/base64" + "encoding/json" + "time" + + "github.com/uselagoon/remote-controller/internal/helpers" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/coreos/go-semver/semver" +) + +type robotAccountCredential struct { + Name string `json:"name"` + CreatedAt int64 `json:"created_at"` + Token string `json:"token"` +} + +// GetHarborVersion returns the version of harbor. +func (h *Harbor) GetHarborVersion(ctx context.Context) (string, error) { + harborVersion, err := h.ClientV5.GetSystemInfo(ctx) + if err != nil { + return "", err + } + // harbor versions are returned as `v2.1.2-abcdef`, this returns just the `2.1.2` of the version + // `[1:] strips the v` + version := strings.Split(*harborVersion.HarborVersion, "-")[0][1:] + return version, nil +} + +// UseV2Functions . +func (h *Harbor) UseV2Functions(version string) bool { + currentVersion := semver.New(version) + harborV2 := semver.New("2.2.0") + // invert the result + return !currentVersion.LessThan(*harborV2) +} + +// addPrefix adds the robot account prefix to robot accounts +// @TODO: Harbor 2.2.0 changes this behavior, see note below in `matchRobotAccount` +func (h *Harbor) addPrefix(str string) string { + return h.RobotPrefix + str +} + +// matchRobotAccount will check if the robotaccount exists or not +func (h *Harbor) matchRobotAccount(robotName string, + projectName string, + environmentName string, +) bool { + // pre global-robot-accounts (2.2.0+) + if robotName == h.addPrefix(fmt.Sprintf("%s-%s", environmentName, helpers.HashString(h.LagoonTargetName)[0:8])) { + return true + } + return false +} + +// matchRobotAccountV2 will check if the robotaccount exists or not +func (h *Harbor) matchRobotAccountV2(robotName string, + projectName string, + environmentName string, +) bool { + if robotName == h.addPrefixV2(projectName, environmentName) { + return true + } + return false +} + +// already expired? 
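+// shouldRotate reports whether credentials created at creationTime are older than the given
+// rotation interval; if the creation time cannot be parsed it returns true so rotation still occurs.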
+func (h *Harbor) shouldRotate(creationTime string, interval time.Duration) bool { + created, err := time.Parse(time.RFC3339Nano, creationTime) + if err != nil { + h.Log.Error(err, "error parsing time") + return true + } + return created.UTC().Add(interval).Before(time.Now().UTC()) +} + +// expiresSoon checks if the robot account will expire soon +func (h *Harbor) expiresSoon(expiresAt int64, duration time.Duration) bool { + now := time.Now().UTC().Add(duration) + expiry := time.Unix(expiresAt, 0) + return expiry.Before(now) +} + +// makeHarborSecret creates the secret definition. +func makeHarborSecret(credentials robotAccountCredential) helpers.RegistryCredentials { + return helpers.RegistryCredentials{ + Username: credentials.Name, + Password: credentials.Token, + Auth: base64.StdEncoding.EncodeToString( + []byte( + fmt.Sprintf("%s:%s", credentials.Name, credentials.Token), + ), + )} +} + +// UpsertHarborSecret will create or update the secret in kubernetes. +func UpsertHarborSecret(ctx context.Context, cl client.Client, ns, name, baseURL string, registryCreds *helpers.RegistryCredentials) error { + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: name, + }, + Type: corev1.SecretTypeDockerConfigJson, + } + dcj := &helpers.Auths{ + Registries: make(map[string]helpers.RegistryCredentials), + } + err := cl.Get(ctx, types.NamespacedName{ + Namespace: ns, + Name: name, + }, secret) + if err != nil { + // if the secret doesn't exist + // create it + dcj.Registries[baseURL] = *registryCreds + dcjBytes, _ := json.Marshal(dcj) + secret.Data = map[string][]byte{ + corev1.DockerConfigJsonKey: []byte(dcjBytes), + } + err := cl.Create(ctx, secret) + if err != nil { + return fmt.Errorf("could not create secret %s/%s: %s", secret.ObjectMeta.Namespace, secret.ObjectMeta.Name, err.Error()) + } + return nil + } + // if the secret exists + // update the secret with the new credentials + json.Unmarshal([]byte(secret.Data[corev1.DockerConfigJsonKey]), &dcj) + // add or update the credential + dcj.Registries[baseURL] = *registryCreds + dcjBytes, _ := json.Marshal(dcj) + secret.Data = map[string][]byte{ + corev1.DockerConfigJsonKey: []byte(dcjBytes), + } + err = cl.Update(ctx, secret) + if err != nil { + return fmt.Errorf("could not update secret: %s/%s", secret.ObjectMeta.Namespace, secret.ObjectMeta.Name) + } + return nil +} diff --git a/internal/harbor/harborintegration_test.go b/internal/harbor/harbor_helpers_test.go similarity index 100% rename from internal/harbor/harborintegration_test.go rename to internal/harbor/harbor_helpers_test.go diff --git a/internal/harbor/harborintegration.go b/internal/harbor/harborintegration.go deleted file mode 100644 index fb94d99c..00000000 --- a/internal/harbor/harborintegration.go +++ /dev/null @@ -1,322 +0,0 @@ -package harbor - -import ( - "fmt" - "sort" - "strings" - - "context" - "encoding/base64" - "encoding/json" - "time" - - "github.com/go-logr/logr" - harborclientv3 "github.com/mittwald/goharbor-client/v3/apiv2" - - harborclientv5 "github.com/mittwald/goharbor-client/v5/apiv2" - "github.com/mittwald/goharbor-client/v5/apiv2/pkg/config" - - lagoonv1beta1 "github.com/uselagoon/remote-controller/apis/lagoon/v1beta1" - "github.com/uselagoon/remote-controller/internal/helpers" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/selection" - "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" - 
"sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/coreos/go-semver/semver" -) - -type robotAccountCredential struct { - Name string `json:"name"` - CreatedAt int64 `json:"created_at"` - Token string `json:"token"` -} - -// Harbor defines a harbor struct -type Harbor struct { - URL string - Hostname string - API string - Username string - Password string - Log logr.Logger - ClientV3 *harborclientv3.RESTClient - ClientV5 *harborclientv5.RESTClient - DeleteDisabled bool - WebhookAddition bool - RobotPrefix string - ExpiryInterval time.Duration - RotateInterval time.Duration - RobotAccountExpiry time.Duration - ControllerNamespace string - NamespacePrefix string - RandomNamespacePrefix bool - WebhookURL string - WebhookEventTypes []string - LagoonTargetName string - Config *config.Options -} - -// NewHarbor create a new harbor connection. -func NewHarbor(harbor Harbor) (*Harbor, error) { - harbor.Log = ctrl.Log.WithName("controllers").WithName("HarborIntegration") - c, err := harborclientv3.NewRESTClientForHost(harbor.API, harbor.Username, harbor.Password) - if err != nil { - return nil, err - } - harbor.ClientV3 = c - harbor.Config = &config.Options{} - c2, err := harborclientv5.NewRESTClientForHost(harbor.API, harbor.Username, harbor.Password, harbor.Config) - if err != nil { - return nil, err - } - harbor.ClientV5 = c2 - return &harbor, nil -} - -// GetHarborVersion returns the version of harbor. -func (h *Harbor) GetHarborVersion(ctx context.Context) (string, error) { - harborVersion, err := h.ClientV5.GetSystemInfo(ctx) - if err != nil { - return "", err - } - // harbor versions are returned as `v2.1.2-abcdef`, this returns just the `2.1.2` of the version - // `[1:] strips the v` - version := strings.Split(*harborVersion.HarborVersion, "-")[0][1:] - return version, nil -} - -// UseV2Functions . -func (h *Harbor) UseV2Functions(version string) bool { - currentVersion := semver.New(version) - harborV2 := semver.New("2.2.0") - // invert the result - return !currentVersion.LessThan(*harborV2) -} - -// RotateRobotCredentials will attempt to recreate any robot account credentials that need to be rotated. -func (h *Harbor) RotateRobotCredentials(ctx context.Context, cl client.Client) { - opLog := ctrl.Log.WithName("handlers").WithName("RotateRobotCredentials") - namespaces := &corev1.NamespaceList{} - labelRequirements, _ := labels.NewRequirement("lagoon.sh/environmentType", selection.Exists, nil) - // @TODO: do this later so we can only run robot credentials for specific controllers - // labelRequirements2, _ := labels.NewRequirement("lagoon.sh/controller", selection.Equals, []string{h.ControllerNamespace}) - listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ - client.MatchingLabelsSelector{ - Selector: labels.NewSelector().Add(*labelRequirements), - // @TODO: do this later so we can only run robot credentials for specific controllers - // Selector: labels.NewSelector().Add(*labelRequirements).Add(*labelRequirements2), - }, - }) - if err := cl.List(ctx, namespaces, listOption); err != nil { - opLog.Error(err, fmt.Sprintf("Unable to list namespaces created by Lagoon, there may be none or something went wrong")) - return - } - // go over every namespace that has a lagoon.sh label - // and attempt to create and update the robot account credentials as requred. 
- for _, ns := range namespaces.Items { - if ns.Status.Phase == corev1.NamespaceTerminating { - // if the namespace is terminating, don't try to renew the robot credentials - opLog.Info(fmt.Sprintf("Namespace %s is being terminated, aborting robot credentials check", ns.ObjectMeta.Name)) - return - } - opLog.Info(fmt.Sprintf("Checking if %s needs robot credentials rotated", ns.ObjectMeta.Name)) - // check for running builds! - lagoonBuilds := &lagoonv1beta1.LagoonBuildList{} - listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ - client.InNamespace(ns.ObjectMeta.Name), - client.MatchingLabels(map[string]string{ - "lagoon.sh/controller": h.ControllerNamespace, // created by this controller - }), - }) - if err := cl.List(context.Background(), lagoonBuilds, listOption); err != nil { - opLog.Error(err, fmt.Sprintf("Unable to list Lagoon build pods, there may be none or something went wrong")) - return - } - runningBuilds := false - sort.Slice(lagoonBuilds.Items, func(i, j int) bool { - return lagoonBuilds.Items[i].ObjectMeta.CreationTimestamp.After(lagoonBuilds.Items[j].ObjectMeta.CreationTimestamp.Time) - }) - // if there are any builds pending or running, don't try and refresh the credentials as this - // could break the build - if len(lagoonBuilds.Items) > 0 { - if helpers.ContainsString( - helpers.BuildRunningPendingStatus, - lagoonBuilds.Items[0].Labels["lagoon.sh/buildStatus"], - ) { - runningBuilds = true - } - } - if !runningBuilds { - // only continue if there isn't any running builds - robotCreds := &helpers.RegistryCredentials{} - curVer, err := h.GetHarborVersion(ctx) - if err != nil { - // @TODO: resource unknown - opLog.Error(err, "error checking harbor version") - break - } - if h.UseV2Functions(curVer) { - hProject, err := h.CreateProjectV2(ctx, ns.Labels["lagoon.sh/project"]) - if err != nil { - // @TODO: resource unknown - opLog.Error(err, "error getting or creating project") - break - } - time.Sleep(1 * time.Second) // wait 1 seconds - robotCreds, err = h.CreateOrRefreshRobotV2(ctx, - cl, - hProject, - ns.Labels["lagoon.sh/environment"], - ns.ObjectMeta.Name, - h.RobotAccountExpiry) - if err != nil { - opLog.Error(err, "error getting or creating robot account") - break - } - } else { - hProject, err := h.CreateProject(ctx, ns.Labels["lagoon.sh/project"]) - if err != nil { - // @TODO: resource unknown - opLog.Error(err, "error getting or creating project") - break - } - time.Sleep(1 * time.Second) // wait 1 seconds - robotCreds, err = h.CreateOrRefreshRobot(ctx, - cl, - hProject, - ns.Labels["lagoon.sh/environment"], - ns.ObjectMeta.Name, - time.Now().Add(h.RobotAccountExpiry).Unix()) - if err != nil { - opLog.Error(err, "error getting or creating robot account") - break - } - } - time.Sleep(1 * time.Second) // wait 1 seconds - if robotCreds != nil { - // if we have robotcredentials to create, do that here - if err := UpsertHarborSecret(ctx, - cl, - ns.ObjectMeta.Name, - "lagoon-internal-registry-secret", //secret name in kubernetes - h.Hostname, - robotCreds); err != nil { - opLog.Error(err, "error creating or updating robot account credentials") - break - } - opLog.Info(fmt.Sprintf("Robot credentials rotated for %s", ns.ObjectMeta.Name)) - } - } else { - opLog.Info(fmt.Sprintf("There are running or pending builds in %s, skipping", ns.ObjectMeta.Name)) - } - } -} - -// addPrefix adds the robot account prefix to robot accounts -// @TODO: Harbor 2.2.0 changes this behavior, see note below in `matchRobotAccount` -func (h *Harbor) addPrefix(str string) string 
{ - return h.RobotPrefix + str -} - -// matchRobotAccount will check if the robotaccount exists or not -func (h *Harbor) matchRobotAccount(robotName string, - projectName string, - environmentName string, -) bool { - // pre global-robot-accounts (2.2.0+) - if robotName == h.addPrefix(fmt.Sprintf("%s-%s", environmentName, helpers.HashString(h.LagoonTargetName)[0:8])) { - return true - } - return false -} - -// matchRobotAccountV2 will check if the robotaccount exists or not -func (h *Harbor) matchRobotAccountV2(robotName string, - projectName string, - environmentName string, -) bool { - if robotName == h.addPrefixV2(projectName, environmentName) { - return true - } - return false -} - -// already expired? -func (h *Harbor) shouldRotate(creationTime string, interval time.Duration) bool { - created, err := time.Parse(time.RFC3339Nano, creationTime) - if err != nil { - h.Log.Error(err, "error parsing time") - return true - } - return created.UTC().Add(interval).Before(time.Now().UTC()) -} - -// expiresSoon checks if the robot account will expire soon -func (h *Harbor) expiresSoon(expiresAt int64, duration time.Duration) bool { - now := time.Now().UTC().Add(duration) - expiry := time.Unix(expiresAt, 0) - return expiry.Before(now) -} - -// makeHarborSecret creates the secret definition. -func makeHarborSecret(credentials robotAccountCredential) helpers.RegistryCredentials { - return helpers.RegistryCredentials{ - Username: credentials.Name, - Password: credentials.Token, - Auth: base64.StdEncoding.EncodeToString( - []byte( - fmt.Sprintf("%s:%s", credentials.Name, credentials.Token), - ), - )} -} - -// UpsertHarborSecret will create or update the secret in kubernetes. -func UpsertHarborSecret(ctx context.Context, cl client.Client, ns, name, baseURL string, registryCreds *helpers.RegistryCredentials) error { - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: ns, - Name: name, - }, - Type: corev1.SecretTypeDockerConfigJson, - } - dcj := &helpers.Auths{ - Registries: make(map[string]helpers.RegistryCredentials), - } - err := cl.Get(ctx, types.NamespacedName{ - Namespace: ns, - Name: name, - }, secret) - if err != nil { - // if the secret doesn't exist - // create it - dcj.Registries[baseURL] = *registryCreds - dcjBytes, _ := json.Marshal(dcj) - secret.Data = map[string][]byte{ - corev1.DockerConfigJsonKey: []byte(dcjBytes), - } - err := cl.Create(ctx, secret) - if err != nil { - return fmt.Errorf("could not create secret %s/%s: %s", secret.ObjectMeta.Namespace, secret.ObjectMeta.Name, err.Error()) - } - return nil - } - // if the secret exists - // update the secret with the new credentials - json.Unmarshal([]byte(secret.Data[corev1.DockerConfigJsonKey]), &dcj) - // add or update the credential - dcj.Registries[baseURL] = *registryCreds - dcjBytes, _ := json.Marshal(dcj) - secret.Data = map[string][]byte{ - corev1.DockerConfigJsonKey: []byte(dcjBytes), - } - err = cl.Update(ctx, secret) - if err != nil { - return fmt.Errorf("could not update secret: %s/%s", secret.ObjectMeta.Namespace, secret.ObjectMeta.Name) - } - return nil -} From f7cf4cfaceed0dd63227f5a6d4886e36b626efcd Mon Sep 17 00:00:00 2001 From: shreddedbacon Date: Fri, 30 Sep 2022 09:29:33 +1000 Subject: [PATCH 02/10] fix: continue on any single namespace error to prevent blocking --- internal/utilities/pruner/build_pruner.go | 4 ++-- internal/utilities/pruner/task_pruner.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/utilities/pruner/build_pruner.go 
b/internal/utilities/pruner/build_pruner.go index 974e90ab..5568077c 100644 --- a/internal/utilities/pruner/build_pruner.go +++ b/internal/utilities/pruner/build_pruner.go @@ -33,7 +33,7 @@ func (p *Pruner) LagoonBuildPruner() { if ns.Status.Phase == corev1.NamespaceTerminating { // if the namespace is terminating, don't try to renew the robot credentials opLog.Info(fmt.Sprintf("Namespace %s is being terminated, aborting build pruner", ns.ObjectMeta.Name)) - return + continue } opLog.Info(fmt.Sprintf("Checking LagoonBuilds in namespace %s", ns.ObjectMeta.Name)) lagoonBuilds := &lagoonv1beta1.LagoonBuildList{} @@ -45,7 +45,7 @@ func (p *Pruner) LagoonBuildPruner() { }) if err := p.Client.List(context.Background(), lagoonBuilds, listOption); err != nil { opLog.Error(err, fmt.Sprintf("Unable to list LagoonBuild resources, there may be none or something went wrong")) - return + continue } // sort the build pods by creation timestamp sort.Slice(lagoonBuilds.Items, func(i, j int) bool { diff --git a/internal/utilities/pruner/task_pruner.go b/internal/utilities/pruner/task_pruner.go index eca4d994..0dea5280 100644 --- a/internal/utilities/pruner/task_pruner.go +++ b/internal/utilities/pruner/task_pruner.go @@ -33,7 +33,7 @@ func (p *Pruner) LagoonTaskPruner() { if ns.Status.Phase == corev1.NamespaceTerminating { // if the namespace is terminating, don't try to renew the robot credentials opLog.Info(fmt.Sprintf("Namespace %s is being terminated, aborting task pruner", ns.ObjectMeta.Name)) - return + continue } opLog.Info(fmt.Sprintf("Checking LagoonTasks in namespace %s", ns.ObjectMeta.Name)) lagoonTasks := &lagoonv1beta1.LagoonTaskList{} @@ -45,7 +45,7 @@ func (p *Pruner) LagoonTaskPruner() { }) if err := p.Client.List(context.Background(), lagoonTasks, listOption); err != nil { opLog.Error(err, fmt.Sprintf("Unable to list LagoonTask resources, there may be none or something went wrong")) - return + continue } // sort the build pods by creation timestamp sort.Slice(lagoonTasks.Items, func(i, j int) bool { From 9b335ea1d5d4186c18a8cd5274154affb9034f1f Mon Sep 17 00:00:00 2001 From: shreddedbacon Date: Fri, 30 Sep 2022 09:30:02 +1000 Subject: [PATCH 03/10] refactor: rename harbor func to new instead of newharbor --- controllers/v1beta1/build_helpers.go | 2 +- internal/utilities/deletions/process.go | 2 +- main.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/controllers/v1beta1/build_helpers.go b/controllers/v1beta1/build_helpers.go index 8443f4e3..99432a59 100644 --- a/controllers/v1beta1/build_helpers.go +++ b/controllers/v1beta1/build_helpers.go @@ -185,7 +185,7 @@ func (r *LagoonBuildReconciler) getOrCreateNamespace(ctx context.Context, namesp // if local/regional harbor is enabled if r.LFFHarborEnabled { // create the harbor client - lagoonHarbor, err := harbor.NewHarbor(r.Harbor) + lagoonHarbor, err := harbor.New(r.Harbor) if err != nil { return fmt.Errorf("Error creating harbor client, check your harbor configuration. 
Error was: %v", err) } diff --git a/internal/utilities/deletions/process.go b/internal/utilities/deletions/process.go index 844a60ae..cc8ea60f 100644 --- a/internal/utilities/deletions/process.go +++ b/internal/utilities/deletions/process.go @@ -34,7 +34,7 @@ func (d *Deletions) ProcessDeletion(ctx context.Context, opLog logr.Logger, name then delete them */ if d.CleanupHarborRepositoryOnDelete && cleanupHarbor { - lagoonHarbor, err := harbor.NewHarbor(d.Harbor) + lagoonHarbor, err := harbor.New(d.Harbor) if err != nil { return err } diff --git a/main.go b/main.go index 004db992..346e5d5b 100644 --- a/main.go +++ b/main.go @@ -683,7 +683,7 @@ func main() { // use cron to run a task pod cleanup task // this will check any Lagoon task pods and attempt to delete them c.AddFunc(harborCredentialCron, func() { - lagoonHarbor, _ := harbor.NewHarbor(harborConfig) + lagoonHarbor, _ := harbor.New(harborConfig) lagoonHarbor.RotateRobotCredentials(context.Background(), mgr.GetClient()) }) } From d72d4d29d42631502639353212b6d251f1494d7f Mon Sep 17 00:00:00 2001 From: shreddedbacon Date: Tue, 4 Oct 2022 10:25:14 +1100 Subject: [PATCH 04/10] chore: add clearer label for idling and storage calculator --- apis/lagoon/v1beta1/lagoonbuild_types.go | 42 ++++++++++---------- apis/lagoon/v1beta1/zz_generated.deepcopy.go | 4 +- controllers/v1beta1/build_helpers.go | 20 ++++++++-- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/apis/lagoon/v1beta1/lagoonbuild_types.go b/apis/lagoon/v1beta1/lagoonbuild_types.go index 9c0164c3..faaaa290 100644 --- a/apis/lagoon/v1beta1/lagoonbuild_types.go +++ b/apis/lagoon/v1beta1/lagoonbuild_types.go @@ -107,27 +107,27 @@ type Build struct { // Project contains the project information from lagoon. type Project struct { - ID *uint `json:"id,omitempty"` - Name string `json:"name"` - Environment string `json:"environment"` - EnvironmentID *uint `json:"environmentId,omitempty"` - UILink string `json:"uiLink,omitempty"` - GitURL string `json:"gitUrl"` - NamespacePattern string `json:"namespacePattern,omitempty"` - RouterPattern string `json:"routerPattern,omitempty"` - EnvironmentType string `json:"environmentType"` - ProductionEnvironment string `json:"productionEnvironment"` - StandbyEnvironment string `json:"standbyEnvironment"` - DeployTarget string `json:"deployTarget"` - ProjectSecret string `json:"projectSecret"` - SubFolder string `json:"subfolder,omitempty"` - Key []byte `json:"key"` - Monitoring Monitoring `json:"monitoring"` - Variables Variables `json:"variables"` - Registry string `json:"registry,omitempty"` - EnvironmentIdling *int `json:"environmentIdling,omitempty"` - ProjectIdling *int `json:"projectIdling,omitempty"` - StorageCalculatorDisabled *int `json:"storageCalc,omitempty"` + ID *uint `json:"id,omitempty"` + Name string `json:"name"` + Environment string `json:"environment"` + EnvironmentID *uint `json:"environmentId,omitempty"` + UILink string `json:"uiLink,omitempty"` + GitURL string `json:"gitUrl"` + NamespacePattern string `json:"namespacePattern,omitempty"` + RouterPattern string `json:"routerPattern,omitempty"` + EnvironmentType string `json:"environmentType"` + ProductionEnvironment string `json:"productionEnvironment"` + StandbyEnvironment string `json:"standbyEnvironment"` + DeployTarget string `json:"deployTarget"` + ProjectSecret string `json:"projectSecret"` + SubFolder string `json:"subfolder,omitempty"` + Key []byte `json:"key"` + Monitoring Monitoring `json:"monitoring"` + Variables Variables `json:"variables"` + Registry string 
`json:"registry,omitempty"` + EnvironmentIdling *int `json:"environmentIdling,omitempty"` + ProjectIdling *int `json:"projectIdling,omitempty"` + StorageCalculator *int `json:"storageCalc,omitempty"` } // Variables contains the project and environment variables from lagoon. diff --git a/apis/lagoon/v1beta1/zz_generated.deepcopy.go b/apis/lagoon/v1beta1/zz_generated.deepcopy.go index 92db6e66..1652e7d5 100644 --- a/apis/lagoon/v1beta1/zz_generated.deepcopy.go +++ b/apis/lagoon/v1beta1/zz_generated.deepcopy.go @@ -580,8 +580,8 @@ func (in *Project) DeepCopyInto(out *Project) { *out = new(int) **out = **in } - if in.StorageCalculatorDisabled != nil { - in, out := &in.StorageCalculatorDisabled, &out.StorageCalculatorDisabled + if in.StorageCalculator != nil { + in, out := &in.StorageCalculator, &out.StorageCalculator *out = new(int) **out = **in } diff --git a/controllers/v1beta1/build_helpers.go b/controllers/v1beta1/build_helpers.go index 99432a59..aa73eab8 100644 --- a/controllers/v1beta1/build_helpers.go +++ b/controllers/v1beta1/build_helpers.go @@ -114,16 +114,28 @@ func (r *LagoonBuildReconciler) getOrCreateNamespace(ctx context.Context, namesp } // set the auto idling values if they are defined if lagoonBuild.Spec.Project.EnvironmentIdling != nil { + // eventually deprecate 'lagoon.sh/environmentAutoIdle' for 'lagoon.sh/environmentIdlingEnabled' nsLabels["lagoon.sh/environmentAutoIdle"] = fmt.Sprintf("%d", *lagoonBuild.Spec.Project.EnvironmentIdling) + if *lagoonBuild.Spec.Project.ProjectIdling == 1 { + nsLabels["lagoon.sh/environmentIdlingEnabled"] = "true" + } else { + nsLabels["lagoon.sh/environmentIdlingEnabled"] = "false" + } } if lagoonBuild.Spec.Project.ProjectIdling != nil { + // eventually deprecate 'lagoon.sh/projectAutoIdle' for 'lagoon.sh/projectIdlingEnabled' nsLabels["lagoon.sh/projectAutoIdle"] = fmt.Sprintf("%d", *lagoonBuild.Spec.Project.ProjectIdling) + if *lagoonBuild.Spec.Project.ProjectIdling == 1 { + nsLabels["lagoon.sh/projectIdlingEnabled"] = "true" + } else { + nsLabels["lagoon.sh/projectIdlingEnabled"] = "false" + } } - if lagoonBuild.Spec.Project.StorageCalculatorDisabled != nil { - if *lagoonBuild.Spec.Project.StorageCalculatorDisabled != 1 { - nsLabels["lagoon.sh/storageCalculatorDisabled"] = "true" + if lagoonBuild.Spec.Project.StorageCalculator != nil { + if *lagoonBuild.Spec.Project.StorageCalculator == 1 { + nsLabels["lagoon.sh/storageCalculatorEnabled"] = "true" } else { - nsLabels["lagoon.sh/storageCalculatorDisabled"] = "false" + nsLabels["lagoon.sh/storageCalculatorEnabled"] = "false" } } // add the required lagoon labels to the namespace when creating From 9b9be5d6a9aca7de6aeb20730b00ebc2bebc125a Mon Sep 17 00:00:00 2001 From: shreddedbacon Date: Tue, 4 Oct 2022 15:51:17 +1100 Subject: [PATCH 05/10] fix: use the right field name --- apis/lagoon/v1beta1/lagoonbuild_types.go | 2 +- config/crd/bases/crd.lagoon.sh_lagoonbuilds.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/lagoon/v1beta1/lagoonbuild_types.go b/apis/lagoon/v1beta1/lagoonbuild_types.go index faaaa290..ae108b31 100644 --- a/apis/lagoon/v1beta1/lagoonbuild_types.go +++ b/apis/lagoon/v1beta1/lagoonbuild_types.go @@ -127,7 +127,7 @@ type Project struct { Registry string `json:"registry,omitempty"` EnvironmentIdling *int `json:"environmentIdling,omitempty"` ProjectIdling *int `json:"projectIdling,omitempty"` - StorageCalculator *int `json:"storageCalc,omitempty"` + StorageCalculator *int `json:"storageCalculator,omitempty"` } // Variables contains the project 
and environment variables from lagoon. diff --git a/config/crd/bases/crd.lagoon.sh_lagoonbuilds.yaml b/config/crd/bases/crd.lagoon.sh_lagoonbuilds.yaml index 001398d8..0c72b6d8 100644 --- a/config/crd/bases/crd.lagoon.sh_lagoonbuilds.yaml +++ b/config/crd/bases/crd.lagoon.sh_lagoonbuilds.yaml @@ -106,7 +106,7 @@ spec: type: string standbyEnvironment: type: string - storageCalc: + storageCalculator: type: integer subfolder: type: string From f1293870b44a889235afe5da66d6cbaa2156a233 Mon Sep 17 00:00:00 2001 From: Blaize Kaye Date: Fri, 21 Oct 2022 10:17:10 +1300 Subject: [PATCH 06/10] First pass at adding old process pruner --- .../utilities/pruner/old_process_pruner.go | 115 ++++++++++++++++++ internal/utilities/pruner/pruner.go | 19 +-- main.go | 15 +++ 3 files changed, 141 insertions(+), 8 deletions(-) create mode 100644 internal/utilities/pruner/old_process_pruner.go diff --git a/internal/utilities/pruner/old_process_pruner.go b/internal/utilities/pruner/old_process_pruner.go new file mode 100644 index 00000000..62636375 --- /dev/null +++ b/internal/utilities/pruner/old_process_pruner.go @@ -0,0 +1,115 @@ +package pruner + +import ( + "context" + "fmt" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "time" + //lagoonv1beta1 "github.com/uselagoon/remote-controller/apis/lagoon/v1beta1" + //"github.com/uselagoon/remote-controller/internal/helpers" +) + +// LagoonOldProcPruner will identify and remove any long running builds or tasks. +func (p *Pruner) LagoonOldProcPruner() { + opLog := ctrl.Log.WithName("utilities").WithName("LagoonOldProcPruner") + opLog.Info("Beginning marking old build and task pods") + namespaces := &corev1.NamespaceList{} + labelRequirements, _ := labels.NewRequirement("lagoon.sh/environmentType", selection.Exists, nil) + listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ + client.MatchingLabelsSelector{ + Selector: labels.NewSelector().Add(*labelRequirements), + }, + }) + if err := p.Client.List(context.Background(), namespaces, listOption); err != nil { + opLog.Error(err, fmt.Sprintf("Unable to list namespaces created by Lagoon, there may be none or something went wrong")) + return + } + + //now we iterate through each namespace, and look for build/task pods + for _, ns := range namespaces.Items { + if ns.Status.Phase == corev1.NamespaceTerminating { + // if the namespace is terminating, don't search it for long running tasks + opLog.Info(fmt.Sprintf("Namespace %s is being terminated, skipping build/task pruner", ns.ObjectMeta.Name)) + continue + } + + //Now let's look for build and task pods + // NOTE: build and task pods really are the same kind of thing, right - they're simply pods, but tagged differently + // So what we're going to do is just find those that match the various + + podList := corev1.PodList{ + TypeMeta: metav1.TypeMeta{}, + ListMeta: metav1.ListMeta{}, + Items: nil, + } + + listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ + client.InNamespace(ns.ObjectMeta.Name), + client.MatchingLabels(map[string]string{ + "lagoon.sh/controller": p.ControllerNamespace, // created by this controller + }), + }) + if err := p.Client.List(context.Background(), &podList, listOption); err != nil { + opLog.Error(err, fmt.Sprintf("Unable to list pod resources, there may be none or something went wrong")) + continue + } + + 
opLog.Info(fmt.Sprintf("Running task/build recon")) + + hours, err := time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForWorkerPods)) + if err != nil { + opLog.Error(err, + fmt.Sprintf( + "Unable to parse TimeoutForWorkerPods '%v' - cannot run long running task removal process.", + p.TimeoutForWorkerPods, + ), + ) + return + } + removeIfCreatedBefore := time.Now().Add(-time.Hour * hours) + + for _, pod := range podList.Items { + if pod.Status.Phase == corev1.PodRunning { + opLog.Info(fmt.Sprintf("Found a pod named - %v", pod.Name)) + + if pod.CreationTimestamp.Time.Before(removeIfCreatedBefore) || true { + if podType, ok := pod.GetLabels()["lagoon.sh/jobType"]; ok { + switch podType { + case "task": + //podTypeName = "task" + pod.ObjectMeta.Labels["lagoon.sh/cancelTask"] = "true" + pod.ObjectMeta.Annotations["lagoon.sh/cancelReason"] = fmt.Sprintf("Cancelled task due to timeout") + break + case "build": + //podTypeName = "build" + pod.ObjectMeta.Labels["lagoon.sh/cancelBuild"] = "true" + pod.ObjectMeta.Annotations["lagoon.sh/cancelReason"] = fmt.Sprintf("Cancelled build due to timeout") + break + default: + return + } + // this isn't actually a job, so we skip + if err := p.Client.Update(context.Background(), &pod); err != nil { + opLog.Error(err, + fmt.Sprintf( + "Unable to update %s to cancel it.", + pod.Name, + ), + ) + } + opLog.Info("Cancelled pod %v - timeout", pod.Name) + } else { + continue + } + + } + } + } + opLog.Info(fmt.Sprintf("Endint task/build recon")) + } +} diff --git a/internal/utilities/pruner/pruner.go b/internal/utilities/pruner/pruner.go index 08073245..ac663933 100644 --- a/internal/utilities/pruner/pruner.go +++ b/internal/utilities/pruner/pruner.go @@ -22,6 +22,7 @@ type Pruner struct { NamespacePrefix string RandomNamespacePrefix bool DeletionHandler *deletions.Deletions + TimeoutForWorkerPods int EnableDebug bool } @@ -34,15 +35,17 @@ func New( taskPodsToKeep int, controllerNamespace string, deletionHandler *deletions.Deletions, + timeoutForWorkerPods int, enableDebug bool) *Pruner { return &Pruner{ - Client: client, - BuildsToKeep: buildsToKeep, - TasksToKeep: tasksToKeep, - BuildPodsToKeep: buildPodsToKeep, - TaskPodsToKeep: taskPodsToKeep, - ControllerNamespace: controllerNamespace, - DeletionHandler: deletionHandler, - EnableDebug: enableDebug, + Client: client, + BuildsToKeep: buildsToKeep, + TasksToKeep: tasksToKeep, + BuildPodsToKeep: buildPodsToKeep, + TaskPodsToKeep: taskPodsToKeep, + ControllerNamespace: controllerNamespace, + DeletionHandler: deletionHandler, + TimeoutForWorkerPods: timeoutForWorkerPods, + EnableDebug: enableDebug, } } diff --git a/main.go b/main.go index 346e5d5b..46df4192 100644 --- a/main.go +++ b/main.go @@ -146,6 +146,8 @@ func main() { var pvcRetryInterval int var cleanNamespacesEnabled bool var cleanNamespacesCron string + var pruneLongRunningWorkerPods bool + var timeoutForLongRunningWorkerPods int var lffQoSEnabled bool var qosMaxBuilds int @@ -321,6 +323,11 @@ func main() { flag.StringVar(&cleanNamespacesCron, "namespace-cleanup-cron", "30 * * * *", "The cron definition for how often to run the namespace resources cleanup.") + // LongRuning Worker Pod Timeout config + flag.BoolVar(&pruneLongRunningWorkerPods, "enable-longrunning-pod-cleanup", true, + "Tells the controller to remove Task and Build pods that have been running for too long.") + flag.IntVar(&timeoutForLongRunningWorkerPods, "timeout-longrunning-pod-cleanup", 6, "How many hours a build or task pod should run before forcefully closed.") + // QoS configuration 
flag.BoolVar(&lffQoSEnabled, "enable-qos", false, "Flag to enable this controller with QoS for builds.") flag.IntVar(&qosMaxBuilds, "qos-max-builds", 20, "The number of builds that can run at any one time.") @@ -639,6 +646,7 @@ func main() { taskPodsToKeep, controllerNamespace, deletion, + timeoutForLongRunningWorkerPods, enableDebug, ) // if the lagoonbuild cleanup is enabled, add the cronjob for it @@ -696,6 +704,13 @@ func main() { }) } + if pruneLongRunningWorkerPods || true { + setupLog.Info("starting long running task cleanup task") + c.AddFunc("* * * * *", func() { + resourceCleanup.LagoonOldProcPruner() + }) + } + c.Start() setupLog.Info("starting controllers") From 795e3ea5d74277633b2b96c533dbee39fe9ae141 Mon Sep 17 00:00:00 2001 From: Blaize Kaye Date: Fri, 21 Oct 2022 16:49:17 +1300 Subject: [PATCH 07/10] Adds messages, splits process --- .../v1beta1/podmonitor_buildhandlers.go | 8 +- .../v1beta1/podmonitor_taskhandlers.go | 8 +- .../utilities/pruner/old_process_pruner.go | 77 ++++++++++--------- internal/utilities/pruner/pruner.go | 25 +++--- main.go | 27 ++++--- 5 files changed, 84 insertions(+), 61 deletions(-) diff --git a/controllers/v1beta1/podmonitor_buildhandlers.go b/controllers/v1beta1/podmonitor_buildhandlers.go index dc1d266a..09eb8aeb 100644 --- a/controllers/v1beta1/podmonitor_buildhandlers.go +++ b/controllers/v1beta1/podmonitor_buildhandlers.go @@ -452,10 +452,14 @@ func (r *LagoonMonitorReconciler) updateDeploymentWithLogs( allContainerLogs, err = r.collectLogs(ctx, req, jobPod) if err == nil { if cancel { + cancellationMessage := "Build cancelled" + if cancellationDetails, ok := jobPod.GetAnnotations()["lagoon.sh/cancelReason"]; ok { + cancellationMessage = fmt.Sprintf("%v : %v", cancellationMessage, cancellationDetails) + } allContainerLogs = append(allContainerLogs, []byte(fmt.Sprintf(` ======================================== -Build cancelled -========================================`))...) +%v +========================================`, cancellationMessage))...) } } else { allContainerLogs = []byte(fmt.Sprintf(` diff --git a/controllers/v1beta1/podmonitor_taskhandlers.go b/controllers/v1beta1/podmonitor_taskhandlers.go index e5c0274b..0622876f 100644 --- a/controllers/v1beta1/podmonitor_taskhandlers.go +++ b/controllers/v1beta1/podmonitor_taskhandlers.go @@ -307,10 +307,14 @@ func (r *LagoonMonitorReconciler) updateTaskWithLogs( allContainerLogs, err = r.collectLogs(ctx, req, jobPod) if err == nil { if cancel { + cancellationMessage := "Task cancelled" + if cancellationDetails, ok := jobPod.GetAnnotations()["lagoon.sh/cancelReason"]; ok { + cancellationMessage = fmt.Sprintf("%v : %v", cancellationMessage, cancellationDetails) + } allContainerLogs = append(allContainerLogs, []byte(fmt.Sprintf(` ======================================== -Task cancelled -========================================`))...) +%v +========================================`, cancellationMessage))...) } } else { allContainerLogs = []byte(fmt.Sprintf(` diff --git a/internal/utilities/pruner/old_process_pruner.go b/internal/utilities/pruner/old_process_pruner.go index 62636375..a44f4280 100644 --- a/internal/utilities/pruner/old_process_pruner.go +++ b/internal/utilities/pruner/old_process_pruner.go @@ -15,7 +15,7 @@ import ( ) // LagoonOldProcPruner will identify and remove any long running builds or tasks. 
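+// pruneBuilds and pruneTasks control whether long running build pods and task pods
+// respectively are cancelled once they exceed their configured timeout.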
-func (p *Pruner) LagoonOldProcPruner() { +func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { opLog := ctrl.Log.WithName("utilities").WithName("LagoonOldProcPruner") opLog.Info("Beginning marking old build and task pods") namespaces := &corev1.NamespaceList{} @@ -38,10 +38,6 @@ func (p *Pruner) LagoonOldProcPruner() { continue } - //Now let's look for build and task pods - // NOTE: build and task pods really are the same kind of thing, right - they're simply pods, but tagged differently - // So what we're going to do is just find those that match the various - podList := corev1.PodList{ TypeMeta: metav1.TypeMeta{}, ListMeta: metav1.ListMeta{}, @@ -59,57 +55,64 @@ func (p *Pruner) LagoonOldProcPruner() { continue } - opLog.Info(fmt.Sprintf("Running task/build recon")) + hours, err := time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForBuildPods)) + if err != nil { + opLog.Error(err, + fmt.Sprintf( + "Unable to parse TimeoutForBuildPods '%v' - cannot run long running task removal process.", + p.TimeoutForBuildPods, + ), + ) + return + } + + removeBuildIfCreatedBefore := time.Now().Add(-time.Hour * hours) - hours, err := time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForWorkerPods)) + hours, err = time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForTaskPods)) if err != nil { opLog.Error(err, fmt.Sprintf( - "Unable to parse TimeoutForWorkerPods '%v' - cannot run long running task removal process.", - p.TimeoutForWorkerPods, + "Unable to parse TimeoutForTaskPods '%v' - cannot run long running task removal process.", + p.TimeoutForBuildPods, ), ) return } - removeIfCreatedBefore := time.Now().Add(-time.Hour * hours) + removeTaskIfCreatedBefore := time.Now().Add(-time.Hour * hours) for _, pod := range podList.Items { if pod.Status.Phase == corev1.PodRunning { - opLog.Info(fmt.Sprintf("Found a pod named - %v", pod.Name)) - - if pod.CreationTimestamp.Time.Before(removeIfCreatedBefore) || true { - if podType, ok := pod.GetLabels()["lagoon.sh/jobType"]; ok { - switch podType { - case "task": - //podTypeName = "task" + if podType, ok := pod.GetLabels()["lagoon.sh/jobType"]; ok { + switch podType { + case "task": + if pod.CreationTimestamp.Time.Before(removeTaskIfCreatedBefore) && pruneTasks { pod.ObjectMeta.Labels["lagoon.sh/cancelTask"] = "true" pod.ObjectMeta.Annotations["lagoon.sh/cancelReason"] = fmt.Sprintf("Cancelled task due to timeout") - break - case "build": - //podTypeName = "build" + } + break + case "build": + if pod.CreationTimestamp.Time.Before(removeBuildIfCreatedBefore) && pruneBuilds { pod.ObjectMeta.Labels["lagoon.sh/cancelBuild"] = "true" pod.ObjectMeta.Annotations["lagoon.sh/cancelReason"] = fmt.Sprintf("Cancelled build due to timeout") - break - default: - return } - // this isn't actually a job, so we skip - if err := p.Client.Update(context.Background(), &pod); err != nil { - opLog.Error(err, - fmt.Sprintf( - "Unable to update %s to cancel it.", - pod.Name, - ), - ) - } - opLog.Info("Cancelled pod %v - timeout", pod.Name) - } else { - continue + break + default: + return } - + // this isn't actually a job, so we skip + if err := p.Client.Update(context.Background(), &pod); err != nil { + opLog.Error(err, + fmt.Sprintf( + "Unable to update %s to cancel it.", + pod.Name, + ), + ) + } + opLog.Info(fmt.Sprintf("Cancelled pod %v - timeout", pod.Name)) + } else { + continue } } } - opLog.Info(fmt.Sprintf("Endint task/build recon")) } } diff --git a/internal/utilities/pruner/pruner.go b/internal/utilities/pruner/pruner.go index ac663933..058e4c35 100644 --- 
a/internal/utilities/pruner/pruner.go +++ b/internal/utilities/pruner/pruner.go @@ -22,7 +22,8 @@ type Pruner struct { NamespacePrefix string RandomNamespacePrefix bool DeletionHandler *deletions.Deletions - TimeoutForWorkerPods int + TimeoutForBuildPods int + TimeoutForTaskPods int EnableDebug bool } @@ -35,17 +36,19 @@ func New( taskPodsToKeep int, controllerNamespace string, deletionHandler *deletions.Deletions, - timeoutForWorkerPods int, + timeoutForBuildPods int, + timeoutForTaskPods int, enableDebug bool) *Pruner { return &Pruner{ - Client: client, - BuildsToKeep: buildsToKeep, - TasksToKeep: tasksToKeep, - BuildPodsToKeep: buildPodsToKeep, - TaskPodsToKeep: taskPodsToKeep, - ControllerNamespace: controllerNamespace, - DeletionHandler: deletionHandler, - TimeoutForWorkerPods: timeoutForWorkerPods, - EnableDebug: enableDebug, + Client: client, + BuildsToKeep: buildsToKeep, + TasksToKeep: tasksToKeep, + BuildPodsToKeep: buildPodsToKeep, + TaskPodsToKeep: taskPodsToKeep, + ControllerNamespace: controllerNamespace, + DeletionHandler: deletionHandler, + TimeoutForBuildPods: timeoutForBuildPods, + TimeoutForTaskPods: timeoutForTaskPods, + EnableDebug: enableDebug, } } diff --git a/main.go b/main.go index 46df4192..cc85ba57 100644 --- a/main.go +++ b/main.go @@ -146,8 +146,11 @@ func main() { var pvcRetryInterval int var cleanNamespacesEnabled bool var cleanNamespacesCron string - var pruneLongRunningWorkerPods bool - var timeoutForLongRunningWorkerPods int + var pruneLongRunningBuildPods bool + var pruneLongRunningTaskPods bool + var timeoutForLongRunningBuildPods int + var timeoutForLongRunningTaskPods int + var pruneLongRunningPodsCron string var lffQoSEnabled bool var qosMaxBuilds int @@ -324,9 +327,14 @@ func main() { "The cron definition for how often to run the namespace resources cleanup.") // LongRuning Worker Pod Timeout config - flag.BoolVar(&pruneLongRunningWorkerPods, "enable-longrunning-pod-cleanup", true, - "Tells the controller to remove Task and Build pods that have been running for too long.") - flag.IntVar(&timeoutForLongRunningWorkerPods, "timeout-longrunning-pod-cleanup", 6, "How many hours a build or task pod should run before forcefully closed.") + flag.StringVar(&pruneLongRunningPodsCron, "longrunning-pod-cleanup-cron", "30 * * * *", + "The cron definition for how often to run the long running Task/Build cleanup process.") + flag.BoolVar(&pruneLongRunningBuildPods, "enable-longrunning-build-pod-cleanup", true, + "Tells the controller to remove Build pods that have been running for too long.") + flag.BoolVar(&pruneLongRunningTaskPods, "enable-longrunning-task-pod-cleanup", true, + "Tells the controller to remove Task pods that have been running for too long.") + flag.IntVar(&timeoutForLongRunningBuildPods, "timeout-longrunning-build-pod-cleanup", 6, "How many hours a build pod should run before forcefully closed.") + flag.IntVar(&timeoutForLongRunningTaskPods, "timeout-longrunning-task-pod-cleanup", 6, "How many hours a task pod should run before forcefully closed.") // QoS configuration flag.BoolVar(&lffQoSEnabled, "enable-qos", false, "Flag to enable this controller with QoS for builds.") @@ -646,7 +654,8 @@ func main() { taskPodsToKeep, controllerNamespace, deletion, - timeoutForLongRunningWorkerPods, + timeoutForLongRunningBuildPods, + timeoutForLongRunningTaskPods, enableDebug, ) // if the lagoonbuild cleanup is enabled, add the cronjob for it @@ -704,10 +713,10 @@ func main() { }) } - if pruneLongRunningWorkerPods || true { + if pruneLongRunningTaskPods || 
pruneLongRunningBuildPods { setupLog.Info("starting long running task cleanup task") - c.AddFunc("* * * * *", func() { - resourceCleanup.LagoonOldProcPruner() + c.AddFunc(pruneLongRunningPodsCron, func() { + resourceCleanup.LagoonOldProcPruner(pruneLongRunningBuildPods, pruneLongRunningTaskPods) }) } From dbd955628daa05a11c34e296e06596ac110b45b3 Mon Sep 17 00:00:00 2001 From: Blaize Kaye Date: Tue, 25 Oct 2022 13:51:53 +1300 Subject: [PATCH 08/10] Fixes hour timeout and adds further labels --- .../utilities/pruner/old_process_pruner.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/internal/utilities/pruner/old_process_pruner.go b/internal/utilities/pruner/old_process_pruner.go index a44f4280..c4880984 100644 --- a/internal/utilities/pruner/old_process_pruner.go +++ b/internal/utilities/pruner/old_process_pruner.go @@ -44,11 +44,15 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { Items: nil, } + jobTypeLabelRequirements, _ := labels.NewRequirement("lagoon.sh/jobType", selection.Exists, nil) listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ client.InNamespace(ns.ObjectMeta.Name), client.MatchingLabels(map[string]string{ "lagoon.sh/controller": p.ControllerNamespace, // created by this controller }), + client.MatchingLabelsSelector{ + Selector: labels.NewSelector().Add(*jobTypeLabelRequirements), + }, }) if err := p.Client.List(context.Background(), &podList, listOption); err != nil { opLog.Error(err, fmt.Sprintf("Unable to list pod resources, there may be none or something went wrong")) @@ -66,7 +70,7 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { return } - removeBuildIfCreatedBefore := time.Now().Add(-time.Hour * hours) + removeBuildIfCreatedBefore := time.Now().Add(-hours) hours, err = time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForTaskPods)) if err != nil { @@ -78,28 +82,33 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { ) return } - removeTaskIfCreatedBefore := time.Now().Add(-time.Hour * hours) + removeTaskIfCreatedBefore := time.Now().Add(-hours) for _, pod := range podList.Items { if pod.Status.Phase == corev1.PodRunning { if podType, ok := pod.GetLabels()["lagoon.sh/jobType"]; ok { + + updatePod := false switch podType { case "task": if pod.CreationTimestamp.Time.Before(removeTaskIfCreatedBefore) && pruneTasks { + updatePod = true pod.ObjectMeta.Labels["lagoon.sh/cancelTask"] = "true" pod.ObjectMeta.Annotations["lagoon.sh/cancelReason"] = fmt.Sprintf("Cancelled task due to timeout") } - break case "build": if pod.CreationTimestamp.Time.Before(removeBuildIfCreatedBefore) && pruneBuilds { + updatePod = true pod.ObjectMeta.Labels["lagoon.sh/cancelBuild"] = "true" pod.ObjectMeta.Annotations["lagoon.sh/cancelReason"] = fmt.Sprintf("Cancelled build due to timeout") } - break default: return } - // this isn't actually a job, so we skip + if !updatePod { + return + } + if err := p.Client.Update(context.Background(), &pod); err != nil { opLog.Error(err, fmt.Sprintf( From 5d4cf0281eb35100c49703c452f357462ba711ba Mon Sep 17 00:00:00 2001 From: Blaize Kaye Date: Wed, 26 Oct 2022 10:24:52 +1300 Subject: [PATCH 09/10] Adds ns annotation build/task kill override --- .../utilities/pruner/old_process_pruner.go | 80 +++++++++----- .../pruner/old_process_pruner_test.go | 101 ++++++++++++++++++ 2 files changed, 156 insertions(+), 25 deletions(-) create mode 100644 internal/utilities/pruner/old_process_pruner_test.go diff --git 
a/internal/utilities/pruner/old_process_pruner.go b/internal/utilities/pruner/old_process_pruner.go index c4880984..10e599a0 100644 --- a/internal/utilities/pruner/old_process_pruner.go +++ b/internal/utilities/pruner/old_process_pruner.go @@ -2,6 +2,7 @@ package pruner import ( "context" + "errors" "fmt" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -9,6 +10,7 @@ import ( "k8s.io/apimachinery/pkg/selection" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "strconv" "time" //lagoonv1beta1 "github.com/uselagoon/remote-controller/apis/lagoon/v1beta1" //"github.com/uselagoon/remote-controller/internal/helpers" @@ -32,6 +34,7 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { //now we iterate through each namespace, and look for build/task pods for _, ns := range namespaces.Items { + if ns.Status.Phase == corev1.NamespaceTerminating { // if the namespace is terminating, don't search it for long running tasks opLog.Info(fmt.Sprintf("Namespace %s is being terminated, skipping build/task pruner", ns.ObjectMeta.Name)) @@ -44,6 +47,12 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { Items: nil, } + removeBuildIfCreatedBefore, removeTaskIfCreatedBefore, err := calculateRemoveBeforeTimes(p, ns, time.Now()) + if err != nil { + opLog.Error(err, err.Error()) + return + } + jobTypeLabelRequirements, _ := labels.NewRequirement("lagoon.sh/jobType", selection.Exists, nil) listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ client.InNamespace(ns.ObjectMeta.Name), @@ -59,31 +68,6 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { continue } - hours, err := time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForBuildPods)) - if err != nil { - opLog.Error(err, - fmt.Sprintf( - "Unable to parse TimeoutForBuildPods '%v' - cannot run long running task removal process.", - p.TimeoutForBuildPods, - ), - ) - return - } - - removeBuildIfCreatedBefore := time.Now().Add(-hours) - - hours, err = time.ParseDuration(fmt.Sprintf("%vh", p.TimeoutForTaskPods)) - if err != nil { - opLog.Error(err, - fmt.Sprintf( - "Unable to parse TimeoutForTaskPods '%v' - cannot run long running task removal process.", - p.TimeoutForBuildPods, - ), - ) - return - } - removeTaskIfCreatedBefore := time.Now().Add(-hours) - for _, pod := range podList.Items { if pod.Status.Phase == corev1.PodRunning { if podType, ok := pod.GetLabels()["lagoon.sh/jobType"]; ok { @@ -125,3 +109,49 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { } } } + +// calculateRemoveBeforeTimes will return the date/times before which a build and task should be pruned. 
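+// The controller-level TimeoutForBuildPods and TimeoutForTaskPods values (in hours) are used by
+// default, and can be overridden per namespace via the lagoon.sh/buildPodTimeout and
+// lagoon.sh/taskPodTimeout labels.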
+func calculateRemoveBeforeTimes(p *Pruner, ns corev1.Namespace, startTime time.Time) (time.Time, time.Time, error) { + // Here we set the timeout for build and task pods + // these are able to be overridden by a namespace level + // specification, + timeoutForBuildPods := p.TimeoutForBuildPods + if nsTimeoutForBuildPods, ok := ns.GetLabels()["lagoon.sh/buildPodTimeout"]; ok { + insTimeoutForBuildpods, err := strconv.Atoi(nsTimeoutForBuildPods) + if err != nil { + return time.Time{}, time.Time{}, err + } + timeoutForBuildPods = insTimeoutForBuildpods + } + + timeoutForTaskPods := p.TimeoutForTaskPods + if nsTimeoutForTaskPods, ok := ns.GetLabels()["lagoon.sh/taskPodTimeout"]; ok { + insTimeoutForTaskPods, err := strconv.Atoi(nsTimeoutForTaskPods) + if err != nil { + return time.Time{}, time.Time{}, err + } + timeoutForTaskPods = insTimeoutForTaskPods + } + + hours, err := time.ParseDuration(fmt.Sprintf("%vh", timeoutForBuildPods)) + if err != nil { + errorText := fmt.Sprintf( + "Unable to parse TimeoutForBuildPods '%v' - cannot run long running task removal process.", + p.TimeoutForBuildPods, + ) + return time.Time{}, time.Time{}, errors.New(errorText) + } + + removeBuildIfCreatedBefore := startTime.Add(-hours) + + hours, err = time.ParseDuration(fmt.Sprintf("%vh", timeoutForTaskPods)) + if err != nil { + errorText := fmt.Sprintf( + "Unable to parse TimeoutForTaskPods '%v' - cannot run long running task removal process.", + p.TimeoutForBuildPods, + ) + return time.Time{}, time.Time{}, errors.New(errorText) + } + removeTaskIfCreatedBefore := startTime.Add(-hours) + return removeBuildIfCreatedBefore, removeTaskIfCreatedBefore, nil +} diff --git a/internal/utilities/pruner/old_process_pruner_test.go b/internal/utilities/pruner/old_process_pruner_test.go new file mode 100644 index 00000000..749f9ec1 --- /dev/null +++ b/internal/utilities/pruner/old_process_pruner_test.go @@ -0,0 +1,101 @@ +package pruner + +import ( + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "reflect" + "testing" + "time" +) + +func Test_calculateRemoveBeforeTimes(t *testing.T) { + type args struct { + p *Pruner + ns v1.Namespace + startTime time.Time + } + tests := []struct { + name string + args args + buildBeforeTime time.Time + taskBeforeTime time.Time + wantErr bool + }{ + { + name: "Kill immediately (empty test)", + args: args{ + p: &Pruner{ + TimeoutForBuildPods: 0, + TimeoutForTaskPods: 0, + }, + ns: v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + //"lagoon.sh/buildPodTimeout": "1" + }, + //Annotations: nil, + }, + }, + startTime: time.Date(2000, 1, 1, 1, 1, 1, 1, time.Local), + }, + buildBeforeTime: time.Date(2000, 1, 1, 1, 1, 1, 1, time.Local), + taskBeforeTime: time.Date(2000, 1, 1, 1, 1, 1, 1, time.Local), + }, + { + name: "Kill 6 hours earlier - given by pruner settings", + args: args{ + p: &Pruner{ + TimeoutForBuildPods: 6, + TimeoutForTaskPods: 6, + }, + ns: v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + //"lagoon.sh/buildPodTimeout": "1" + }, + //Annotations: nil, + }, + }, + startTime: time.Date(2000, 1, 1, 7, 1, 1, 1, time.Local), + }, + buildBeforeTime: time.Date(2000, 1, 1, 1, 1, 1, 1, time.Local), + taskBeforeTime: time.Date(2000, 1, 1, 1, 1, 1, 1, time.Local), + }, + { + name: "Kill 1 hours earlier - given by ns override", + args: args{ + p: &Pruner{ + TimeoutForBuildPods: 6, + TimeoutForTaskPods: 6, + }, + ns: v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + 
"lagoon.sh/buildPodTimeout": "2", + "lagoon.sh/taskPodTimeout": "1", + }, + //Annotations: nil, + }, + }, + startTime: time.Date(2000, 1, 1, 3, 1, 1, 1, time.Local), + }, + buildBeforeTime: time.Date(2000, 1, 1, 1, 1, 1, 1, time.Local), + taskBeforeTime: time.Date(2000, 1, 1, 2, 1, 1, 1, time.Local), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, got1, err := calculateRemoveBeforeTimes(tt.args.p, tt.args.ns, tt.args.startTime) + if (err != nil) != tt.wantErr { + t.Errorf("calculateRemoveBeforeTimes() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.buildBeforeTime) { + t.Errorf("calculateRemoveBeforeTimes() got = %v, buildBeforeTime %v", got, tt.buildBeforeTime) + } + if !reflect.DeepEqual(got1, tt.taskBeforeTime) { + t.Errorf("calculateRemoveBeforeTimes() got1 = %v, buildBeforeTime %v", got1, tt.taskBeforeTime) + } + }) + } +} From 8798d995dd9c291f32b949f3dfccd7b3951c434e Mon Sep 17 00:00:00 2001 From: Blaize Kaye Date: Wed, 26 Oct 2022 11:05:36 +1300 Subject: [PATCH 10/10] Removed extraneous logging --- internal/utilities/pruner/old_process_pruner.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/utilities/pruner/old_process_pruner.go b/internal/utilities/pruner/old_process_pruner.go index 10e599a0..4b74506d 100644 --- a/internal/utilities/pruner/old_process_pruner.go +++ b/internal/utilities/pruner/old_process_pruner.go @@ -19,7 +19,6 @@ import ( // LagoonOldProcPruner will identify and remove any long running builds or tasks. func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { opLog := ctrl.Log.WithName("utilities").WithName("LagoonOldProcPruner") - opLog.Info("Beginning marking old build and task pods") namespaces := &corev1.NamespaceList{} labelRequirements, _ := labels.NewRequirement("lagoon.sh/environmentType", selection.Exists, nil) listOption := (&client.ListOptions{}).ApplyOptions([]client.ListOption{ @@ -37,7 +36,6 @@ func (p *Pruner) LagoonOldProcPruner(pruneBuilds, pruneTasks bool) { if ns.Status.Phase == corev1.NamespaceTerminating { // if the namespace is terminating, don't search it for long running tasks - opLog.Info(fmt.Sprintf("Namespace %s is being terminated, skipping build/task pruner", ns.ObjectMeta.Name)) continue }