Repo maintenance for windows #8626

Merged
1 change: 1 addition & 0 deletions changelogs/unreleased/8626-Lyndon-Li
@@ -0,0 +1 @@
Fix issue #8419, support repo maintenance job to run on Windows nodes
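
For context on what the changelog line means in practice: the scheduling part of this change comes down to letting the maintenance job's pod tolerate the taint that Windows nodes carry in this setup. The helper below is an illustrative sketch only (addWindowsToleration is not part of this PR; the os=windows:NoSchedule values mirror the toleration added in buildJob later in this diff):

package example

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// addWindowsToleration marks a job's pod as schedulable onto nodes tainted
// os=windows:NoSchedule, mirroring the toleration this PR adds in buildJob.
func addWindowsToleration(job *batchv1.Job) {
	job.Spec.Template.Spec.Tolerations = append(job.Spec.Template.Spec.Tolerations, corev1.Toleration{
		Key:      "os",
		Operator: corev1.TolerationOpEqual,
		Value:    "windows",
		Effect:   corev1.TaintEffectNoSchedule,
	})
}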
49 changes: 28 additions & 21 deletions pkg/cmd/cli/repomantenance/maintenance.go
@@ -26,6 +26,7 @@
"github.com/vmware-tanzu/velero/pkg/util/logging"

repokey "github.com/vmware-tanzu/velero/pkg/repository/keys"
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"
)

@@ -78,17 +79,7 @@
}()

if pruneError != nil {
logger.WithError(pruneError).Error("An error occurred when running repo prune")
terminationLogFile, err := os.Create("/dev/termination-log")
if err != nil {
logger.WithError(err).Error("Failed to create termination log file")
return
}
defer terminationLogFile.Close()

if _, errWrite := terminationLogFile.WriteString(fmt.Sprintf("An error occurred: %v", err)); errWrite != nil {
logger.WithError(errWrite).Error("Failed to write error to termination log file")
}
os.Stdout.WriteString(fmt.Sprintf("%s%v", maintenance.TerminationLogIndicator, pruneError))

}
}

@@ -163,22 +154,38 @@
return err
}

manager, err := initRepoManager(namespace, cli, kubeClient, logger)
if err != nil {
return err
var repo *velerov1api.BackupRepository
retry := 10
for {
repo, err = repository.GetBackupRepository(context.Background(), cli, namespace,
repository.BackupRepositoryKey{
VolumeNamespace: o.RepoName,
BackupLocation: o.BackupStorageLocation,
RepositoryType: o.RepoType,
}, true)
if err == nil {
break

}

retry--
if retry == 0 {
break

}

logger.WithError(err).Warn("Failed to retrieve backup repo, need retry")

time.Sleep(time.Second)

}

// backupRepository
repo, err := repository.GetBackupRepository(context.Background(), cli, namespace,
repository.BackupRepositoryKey{
VolumeNamespace: o.RepoName,
BackupLocation: o.BackupStorageLocation,
RepositoryType: o.RepoType,
}, true)
if err != nil {
return errors.Wrap(err, "failed to get backup repository")
}

manager, err := initRepoManager(namespace, cli, kubeClient, logger)
if err != nil {
return err
}


err = manager.PruneRepo(repo)
if err != nil {
return errors.Wrap(err, "failed to prune repo")
36 changes: 20 additions & 16 deletions pkg/controller/backup_repository_controller_test.go
@@ -131,10 +131,15 @@ func waitMaintenanceJobCompleteFail(client.Client, context.Context, string, stri
}

func waitMaintenanceJobCompleteFunc(now time.Time, result velerov1api.BackupRepositoryMaintenanceResult, message string) func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
completionTimeStamp := &metav1.Time{Time: now.Add(time.Hour)}
if result == velerov1api.BackupRepositoryMaintenanceFailed {
completionTimeStamp = nil
}

return func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
return velerov1api.BackupRepositoryMaintenanceStatus{
StartTimestamp: &metav1.Time{Time: now},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: completionTimeStamp,
Result: result,
Message: message,
}, nil
@@ -316,10 +321,9 @@ func TestRunMaintenanceIfDue(t *testing.T) {
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
},
{
StartTimestamp: &metav1.Time{Time: now},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "fake-maintenance-message",
StartTimestamp: &metav1.Time{Time: now},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "fake-maintenance-message",
},
},
},
@@ -893,7 +897,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) {
{
name: "full history",
backupRepo: backupRepoWithFullHistory,
result: velerov1api.BackupRepositoryMaintenanceFailed,
result: velerov1api.BackupRepositoryMaintenanceSucceeded,
expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{
{
StartTimestamp: &metav1.Time{Time: standardTime.Add(-time.Hour * 22)},
@@ -915,7 +919,7 @@
{
name: "over full history",
backupRepo: backupRepoWithOverFullHistory,
result: velerov1api.BackupRepositoryMaintenanceFailed,
result: velerov1api.BackupRepositoryMaintenanceSucceeded,
expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{
{
StartTimestamp: &metav1.Time{Time: standardTime.Add(-time.Hour * 20)},
@@ -1127,7 +1131,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
},
@@ -1149,7 +1153,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
@@ -1172,7 +1176,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
},
@@ -1194,7 +1198,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
@@ -1223,7 +1227,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
@@ -1237,7 +1241,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
@@ -1257,7 +1261,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
@@ -1339,13 +1343,13 @@ func TestGetLastMaintenanceTimeFromHistory(t *testing.T) {
history: []velerov1api.BackupRepositoryMaintenanceStatus{
{
StartTimestamp: &metav1.Time{Time: now},
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "fake-maintenance-message",
},
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
83 changes: 50 additions & 33 deletions pkg/repository/maintenance/maintenance.go
@@ -22,6 +22,7 @@ import (
"fmt"
"math"
"sort"
"strings"
"time"

"github.com/pkg/errors"
@@ -35,6 +36,7 @@
"sigs.k8s.io/controller-runtime/pkg/client"

velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/util"
"github.com/vmware-tanzu/velero/pkg/util/kube"

appsv1 "k8s.io/api/apps/v1"
@@ -47,6 +49,7 @@
const (
RepositoryNameLabel = "velero.io/repo-name"
GlobalKeyForRepoMaintenanceJobCM = "global"
TerminationLogIndicator = "Repo maintenance error: "
)

type JobConfigs struct {
@@ -147,24 +150,37 @@ func getResultFromJob(cli client.Client, job *batchv1.Job) (string, error) {
}

if len(podList.Items) == 0 {
return "", fmt.Errorf("no pod found for job %s", job.Name)
return "", errors.Errorf("no pod found for job %s", job.Name)
}

// we only have one maintenance pod for the job
pod := podList.Items[0]

statuses := pod.Status.ContainerStatuses
if len(statuses) == 0 {
return "", fmt.Errorf("no container statuses found for job %s", job.Name)
return "", errors.Errorf("no container statuses found for job %s", job.Name)
}

// we only have one maintenance container
terminated := statuses[0].State.Terminated
if terminated == nil {
return "", fmt.Errorf("container for job %s is not terminated", job.Name)
return "", errors.Errorf("container for job %s is not terminated", job.Name)
}

return terminated.Message, nil
if terminated.Message == "" {
return "", nil
}

idx := strings.Index(terminated.Message, TerminationLogIndicator)
Review thread on this line:

Reviewer (Contributor): Why do we need the indicator? Why can't we return the termination message directly?

Author (Contributor): The termination message is written to stdout together with the logrus logs, so we use the indicator to pick the error out from the other log lines.

Author (Contributor): With the TerminationMessageFallbackToLogsOnError behavior, the last 2048 bytes of stdout are retrieved as the termination message, whatever those bytes happen to contain. (A short sketch of this round trip follows the end of this function.)

if idx == -1 {
return "", errors.New("error to locate repo maintenance error indicator from termination message")
}

if idx+len(TerminationLogIndicator) >= len(terminated.Message) {
return "", errors.New("nothing after repo maintenance error indicator in termination message")
}

return terminated.Message[idx+len(TerminationLogIndicator):], nil
}
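
The review thread above explains the need for the indicator: because the container uses TerminationMessageFallbackToLogsOnError, the termination message is just the tail of stdout (per the author, the last 2048 bytes), so the maintenance error shares the string with ordinary logrus output. Below is a minimal, self-contained sketch of the round trip; writeMaintenanceError, extractMaintenanceError, and the sample log text are illustrative, while TerminationLogIndicator mirrors the constant added in this file:

package example

import (
	"errors"
	"fmt"
	"strings"
)

const TerminationLogIndicator = "Repo maintenance error: "

// writeMaintenanceError mimics what the repo-maintenance CLI does on failure:
// the indicator plus the error goes to stdout, which later becomes (part of)
// the pod's termination message.
func writeMaintenanceError(pruneErr error) string {
	return fmt.Sprintf("%s%v", TerminationLogIndicator, pruneErr)
}

// extractMaintenanceError mimics getResultFromJob above: everything up to and
// including the indicator is treated as ordinary log output and discarded.
func extractMaintenanceError(terminationMessage string) (string, error) {
	idx := strings.Index(terminationMessage, TerminationLogIndicator)
	if idx == -1 {
		return "", errors.New("no repo maintenance error indicator in termination message")
	}
	return terminationMessage[idx+len(TerminationLogIndicator):], nil
}

func Example() {
	// A termination message as the kubelet would capture it: normal log lines
	// followed by the indicator line written on prune failure.
	msg := "level=info msg=\"running repo prune\"\n" + writeMaintenanceError(errors.New("backend timeout"))

	out, _ := extractMaintenanceError(msg)
	fmt.Println(out) // prints: backend timeout
}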

// getJobConfig is called to get the Maintenance Job Config for the
@@ -331,7 +347,7 @@ func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1a
if job.Status.Failed > 0 {
if msg, err := getResultFromJob(cli, job); err != nil {
log.WithError(err).Warnf("Failed to get result of maintenance job %s", job.Name)
message = "Repo maintenance failed but result is not retrieveable"
message = fmt.Sprintf("Repo maintenance failed but result is not retrieveable, err: %v", err)
} else {
message = msg
}
@@ -434,6 +450,16 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
return nil, errors.Wrap(err, "failed to parse resource requirements for maintenance job")
}

podLabels := map[string]string{
RepositoryNameLabel: repo.Name,
}

for _, k := range util.ThirdPartyLabels {
if v := veleroutil.GetVeleroServerLabelValue(deployment, k); v != "" {
podLabels[k] = v
}
}

// Set arguments
args := []string{"repo-maintenance"}
args = append(args, fmt.Sprintf("--repo-name=%s", repo.Spec.VolumeNamespace))
@@ -455,10 +481,8 @@
BackoffLimit: new(int32), // Never retry
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Name: "velero-repo-maintenance-pod",
Labels: map[string]string{
RepositoryNameLabel: repo.Name,
},
Name: "velero-repo-maintenance-pod",
Labels: podLabels,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
@@ -468,17 +492,26 @@
Command: []string{
"/velero",
},
Args: args,
ImagePullPolicy: v1.PullIfNotPresent,
Env: envVars,
EnvFrom: envFromSources,
VolumeMounts: volumeMounts,
Resources: resources,
Args: args,
ImagePullPolicy: v1.PullIfNotPresent,
Env: envVars,
EnvFrom: envFromSources,
VolumeMounts: volumeMounts,
Resources: resources,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
RestartPolicy: v1.RestartPolicyNever,
Volumes: volumes,
ServiceAccountName: serviceAccount,
Tolerations: []v1.Toleration{
{
Key: "os",
Operator: "Equal",
Effect: "NoSchedule",
Value: "windows",
},
},
},
},
},
@@ -489,22 +522,6 @@
job.Spec.Template.Spec.Affinity = affinity
}

if tolerations := veleroutil.GetTolerationsFromVeleroServer(deployment); tolerations != nil {
job.Spec.Template.Spec.Tolerations = tolerations
}

if nodeSelector := veleroutil.GetNodeSelectorFromVeleroServer(deployment); nodeSelector != nil {
job.Spec.Template.Spec.NodeSelector = nodeSelector
}

if labels := veleroutil.GetVeleroServerLables(deployment); len(labels) > 0 {
job.Spec.Template.Labels = labels
}

if annotations := veleroutil.GetVeleroServerAnnotations(deployment); len(annotations) > 0 {
job.Spec.Template.Annotations = annotations
}

return job, nil
}

@@ -516,8 +533,8 @@ func composeStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRe

return velerov1api.BackupRepositoryMaintenanceStatus{
Result: result,
StartTimestamp: &metav1.Time{Time: job.CreationTimestamp.Time},
CompleteTimestamp: &metav1.Time{Time: job.Status.CompletionTime.Time},
StartTimestamp: &job.CreationTimestamp,
CompleteTimestamp: job.Status.CompletionTime,
Message: message,
}
}