Skip to content

Commit

Permalink
Merge pull request #8482 from Lyndon-Li/data-mover-exposer-diagnostic
Browse files Browse the repository at this point in the history
Data mover exposer diagnostic
  • Loading branch information
ywk253100 authored Dec 13, 2024
2 parents cd01222 + 34e417b commit 0224d99
Show file tree
Hide file tree
Showing 18 changed files with 1,192 additions and 2 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8482-Lyndon-Li
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix issue #8125: log diagnostic info for data mover exposers when the expose operation times out
6 changes: 6 additions & 0 deletions pkg/controller/data_download_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"strings"
"time"

"github.com/pkg/errors"
Expand Down Expand Up @@ -684,6 +685,11 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler
return
}

diags := strings.Split(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)), "\n")
for _, diag := range diags {
log.Warnf("[Diagnose DD expose]%s", diag)
}

r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd))

log.Info("Dataupload has been cleaned up")
Expand Down
4 changes: 4 additions & 0 deletions pkg/controller/data_download_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,10 @@ func (dt *ddResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
return nil
}

// DiagnoseExpose is a test stub: it ignores both arguments and always
// returns an empty diagnostic string.
func (dt *ddResumeTestHelper) DiagnoseExpose(_ context.Context, _ corev1.ObjectReference) string {
	const noDiagnostics = ""
	return noDiagnostics
}

// RebindVolume is a test stub: it ignores all of its arguments and always
// reports success by returning a nil error.
func (dt *ddResumeTestHelper) RebindVolume(_ context.Context, _ corev1.ObjectReference, _, _ string, _ time.Duration) error {
	return nil
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/controller/data_upload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controller
import (
"context"
"fmt"
"strings"
"time"

snapshotter "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/typed/volumesnapshot/v1"
Expand Down Expand Up @@ -751,6 +752,11 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov
volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot
}

diags := strings.Split(ep.DiagnoseExpose(ctx, getOwnerObject(du)), "\n")
for _, diag := range diags {
log.Warnf("[Diagnose DU expose]%s", diag)
}

ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace)

log.Info("Dataupload has been cleaned up")
Expand Down
8 changes: 8 additions & 0 deletions pkg/controller/data_upload_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,10 @@ func (f *fakeSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev
return f.peekErr
}

// DiagnoseExpose is a fake implementation: it ignores both arguments and
// always returns an empty diagnostic string.
func (f *fakeSnapshotExposer) DiagnoseExpose(_ context.Context, _ corev1.ObjectReference) string {
	const noDiagnostics = ""
	return noDiagnostics
}

// CleanUp is a fake implementation that intentionally does nothing; all
// arguments are ignored.
func (f *fakeSnapshotExposer) CleanUp(_ context.Context, _ corev1.ObjectReference, _, _ string) {
	// Deliberately empty.
}

Expand Down Expand Up @@ -1043,6 +1047,10 @@ func (dt *duResumeTestHelper) PeekExposed(context.Context, corev1.ObjectReferenc
return nil
}

// DiagnoseExpose is a test stub: it ignores both arguments and always
// returns an empty diagnostic string.
func (dt *duResumeTestHelper) DiagnoseExpose(_ context.Context, _ corev1.ObjectReference) string {
	const noDiagnostics = ""
	return noDiagnostics
}

// CleanUp is a test stub that intentionally does nothing; all arguments are
// ignored.
func (dt *duResumeTestHelper) CleanUp(_ context.Context, _ corev1.ObjectReference, _, _ string) {
	// Deliberately empty.
}

func (dt *duResumeTestHelper) newMicroServiceBRWatcher(kbclient.Client, kubernetes.Interface, manager.Manager, string, string, string, string, string, string,
Expand Down
64 changes: 64 additions & 0 deletions pkg/exposer/csi_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,70 @@ func (e *csiSnapshotExposer) PeekExposed(ctx context.Context, ownerObject corev1
return nil
}

// DiagnoseExpose gathers diagnostic text about the objects created for a CSI
// snapshot expose and returns it as a single newline-separated string.
//
// The backup pod, backup PVC and backup VolumeSnapshot are all looked up in
// ownerObject.Namespace under ownerObject.Name. A failed lookup is not
// returned as an error; it is recorded as a line in the report and the
// corresponding object is skipped in the detailed section.
//
// For each object that was fetched, the report then includes:
//   - pod: kube.DiagnosePod output and, when the pod has a node assigned, a
//     line if nodeagent.KbClientIsRunningInNode returns an error for that node;
//   - PVC: kube.DiagnosePVC output and, when a volume name is set, either the
//     kube.DiagnosePV output for that PV or the error from fetching it;
//   - VolumeSnapshot: csi.DiagnoseVS output and, when a bound content name is
//     set, either the csi.DiagnoseVSC output or the error from fetching it.
//
// The report always starts with "begin diagnose CSI exposer" and ends with
// "end diagnose CSI exposer" (no trailing newline).
func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject corev1.ObjectReference) string {
	// All three backup objects share the owner object's name and namespace.
	name := ownerObject.Name
	ns := ownerObject.Namespace

	report := "begin diagnose CSI exposer\n"

	// Fetch everything first so that lookup errors are listed ahead of the
	// per-object details.
	backupPod, podErr := e.kubeClient.CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{})
	if podErr != nil {
		report += fmt.Sprintf("error getting backup pod %s, err: %v\n", name, podErr)
	}

	backupPVC, pvcErr := e.kubeClient.CoreV1().PersistentVolumeClaims(ns).Get(ctx, name, metav1.GetOptions{})
	if pvcErr != nil {
		report += fmt.Sprintf("error getting backup pvc %s, err: %v\n", name, pvcErr)
	}

	backupVS, vsErr := e.csiSnapshotClient.VolumeSnapshots(ns).Get(ctx, name, metav1.GetOptions{})
	if vsErr != nil {
		report += fmt.Sprintf("error getting backup vs %s, err: %v\n", name, vsErr)
	}

	// An object is only inspected when its lookup succeeded; whatever the
	// client returned alongside an error is disregarded.
	if podErr == nil && backupPod != nil {
		report += kube.DiagnosePod(backupPod)

		if node := backupPod.Spec.NodeName; node != "" {
			if err := nodeagent.KbClientIsRunningInNode(ctx, ns, node, e.kubeClient); err != nil {
				report += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", node, err)
			}
		}
	}

	if pvcErr == nil && backupPVC != nil {
		report += kube.DiagnosePVC(backupPVC)

		if pvName := backupPVC.Spec.VolumeName; pvName != "" {
			backupPV, err := e.kubeClient.CoreV1().PersistentVolumes().Get(ctx, pvName, metav1.GetOptions{})
			if err == nil {
				report += kube.DiagnosePV(backupPV)
			} else {
				report += fmt.Sprintf("error getting backup pv %s, err: %v\n", pvName, err)
			}
		}
	}

	if vsErr == nil && backupVS != nil {
		report += csi.DiagnoseVS(backupVS)

		if status := backupVS.Status; status != nil && status.BoundVolumeSnapshotContentName != nil && *status.BoundVolumeSnapshotContentName != "" {
			vscName := *status.BoundVolumeSnapshotContentName

			backupVSC, err := e.csiSnapshotClient.VolumeSnapshotContents().Get(ctx, vscName, metav1.GetOptions{})
			if err == nil {
				report += csi.DiagnoseVSC(backupVSC)
			} else {
				report += fmt.Sprintf("error getting backup vsc %s, err: %v\n", vscName, err)
			}
		}
	}

	return report + "end diagnose CSI exposer"
}

const cleanUpTimeout = time.Minute

func (e *csiSnapshotExposer) CleanUp(ctx context.Context, ownerObject corev1.ObjectReference, vsName string, sourceNamespace string) {
Expand Down
Loading

0 comments on commit 0224d99

Please sign in to comment.