diff --git a/.github/workflows/e2e_spark.yaml b/.github/workflows/e2e_spark.yaml index e67d7cee2b..347b070113 100644 --- a/.github/workflows/e2e_spark.yaml +++ b/.github/workflows/e2e_spark.yaml @@ -12,6 +12,21 @@ jobs: name: "E2E about Spark Integration test" runs-on: ubuntu-20.04 steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true - name: Checkout current Volcano repository if: github.event.inputs.volcano-branch=='' diff --git a/test/e2e/util/job.go b/test/e2e/util/job.go index 4fbabca000..3b3e16a8df 100644 --- a/test/e2e/util/job.go +++ b/test/e2e/util/job.go @@ -31,6 +31,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/klog/v2" batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1" schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" @@ -275,22 +276,31 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum int) error { var additionalError error + var podNotReadyCache map[string]*v1.Pod err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{}) Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace) readyTaskNum := 0 + podNotReadyCache = make(map[string]*v1.Pod) for _, pod := range pods.Items { if !metav1.IsControlledBy(&pod, job) { continue } + podReady := false for _, p := range phase { if pod.Status.Phase == p { readyTaskNum++ + podReady = true break } } + + if !podReady { + podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) + podNotReadyCache[podKey] = &pod + } } ready := taskNum <= readyTaskNum @@ -302,11 +312,27 @@ func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase return ready, nil }) if err != nil && strings.Contains(err.Error(), TimeOutMessage) { + logEventsOfNotReadyPods(ctx, podNotReadyCache, phase) return fmt.Errorf("[Wait time out]: %s", additionalError) } return err } +func logEventsOfNotReadyPods(ctx *TestContext, podNotReadyCache map[string]*v1.Pod, phase []v1.PodPhase) { + for _, pod := range podNotReadyCache { + klog.Errorf("The pod <%s/%s> is not in %v phase", pod.Namespace, pod.Name, phase) + // Currently, we only filter Failed event + fieldSelector := fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s,reason=Failed", pod.Name) + events, err := ctx.Kubeclient.CoreV1().Events(pod.Namespace).List(context.TODO(), metav1.ListOptions{ + FieldSelector: fieldSelector, + }) + Expect(err).NotTo(HaveOccurred(), "failed to get events related with pod %s in namespace %s", pod.Name, pod.Namespace) + for _, event := range events.Items { + klog.Errorf("Event related with pod <%s/%s>: Reason: %s, Message: %s", pod.Namespace, pod.Name, event.Reason, event.Message) + } + } +} + func taskPhaseEx(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum map[string]int) error { err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) { diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index 18730702d8..b08b6ffce1 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -73,12 +73,13 @@ const ( ) const ( - DefaultBusyBoxImage = "busybox:1.24" - DefaultNginxImage = "nginx:1.14" + DefaultBusyBoxImage = "busybox" + DefaultNginxImage = "nginx" DefaultMPIImage = "volcanosh/example-mpi:0.0.3" DefaultTFImage = "volcanosh/dist-mnist-tf-example:0.0.1" // "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1" is from "docker.io/kubeflowkatib/pytorch-mnist:v1beta1-9ee8fda" DefaultPytorchImage = "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1" + LogTimeFormat = "[ 2006/01/02 15:04:05.000 ]" ) func CPUResource(request string) v1.ResourceList {