Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix ci and add event logging to help debug ci #3817

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/e2e_spark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@ jobs:
name: "E2E about Spark Integration test"
runs-on: ubuntu-20.04
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# setting this to "true" frees about 6 GB,
# but might remove tools that are actually needed
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true

- name: Checkout current Volcano repository
if: github.event.inputs.volcano-branch==''
Expand Down
26 changes: 26 additions & 0 deletions test/e2e/util/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"

batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
Expand Down Expand Up @@ -275,22 +276,31 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err

func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum int) error {
var additionalError error
var podNotReadyCache map[string]*v1.Pod
err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)

readyTaskNum := 0
podNotReadyCache = make(map[string]*v1.Pod)
for _, pod := range pods.Items {
if !metav1.IsControlledBy(&pod, job) {
continue
}

podReady := false
for _, p := range phase {
if pod.Status.Phase == p {
readyTaskNum++
podReady = true
break
}
}

if !podReady {
podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
podNotReadyCache[podKey] = &pod
}
}

ready := taskNum <= readyTaskNum
Expand All @@ -302,11 +312,27 @@ func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase
return ready, nil
})
if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
logEventsOfNotReadyPods(ctx, podNotReadyCache, phase)
return fmt.Errorf("[Wait time out]: %s", additionalError)
}
return err
}

// logEventsOfNotReadyPods logs, for every pod in podNotReadyCache, that the pod
// is not in any of the expected phases, followed by the events recorded for
// that pod whose reason is "Failed".
//
// It is a best-effort debugging aid that WaitTaskPhase calls after the wait has
// already timed out. A failure to list events is therefore logged and the pod
// is skipped, instead of being asserted on, so that the caller's timeout error
// remains the failure that gets reported.
func logEventsOfNotReadyPods(ctx *TestContext, podNotReadyCache map[string]*v1.Pod, phase []v1.PodPhase) {
	for _, pod := range podNotReadyCache {
		klog.Errorf("The pod <%s/%s> is not in %v phase", pod.Namespace, pod.Name, phase)

		// Currently, only events with reason "Failed" are selected.
		fieldSelector := fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s,reason=Failed", pod.Name)
		events, err := ctx.Kubeclient.CoreV1().Events(pod.Namespace).List(context.TODO(), metav1.ListOptions{
			FieldSelector: fieldSelector,
		})
		if err != nil {
			// Do not fail the test here: that would mask the original timeout.
			klog.Errorf("failed to get events related with pod %s in namespace %s: %v", pod.Name, pod.Namespace, err)
			continue
		}

		for _, event := range events.Items {
			klog.Errorf("Event related with pod <%s/%s>: Reason: %s, Message: %s", pod.Namespace, pod.Name, event.Reason, event.Message)
		}
	}
}

func taskPhaseEx(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum map[string]int) error {
err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {

Expand Down
5 changes: 3 additions & 2 deletions test/e2e/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ const (
)

const (
DefaultBusyBoxImage = "busybox:1.24"
DefaultNginxImage = "nginx:1.14"
DefaultBusyBoxImage = "busybox"
DefaultNginxImage = "nginx"
DefaultMPIImage = "volcanosh/example-mpi:0.0.3"
DefaultTFImage = "volcanosh/dist-mnist-tf-example:0.0.1"
// "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1" is from "docker.io/kubeflowkatib/pytorch-mnist:v1beta1-9ee8fda"
DefaultPytorchImage = "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1"
LogTimeFormat = "[ 2006/01/02 15:04:05.000 ]"
)

func CPUResource(request string) v1.ResourceList {
Expand Down
Loading