Skip to content

Commit

Permalink
fix ci and add event logging to debug ci
Browse files Browse the repository at this point in the history
  • Loading branch information
JesseStutler committed Nov 14, 2024
1 parent 8b2f918 commit 06a26dc
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 2 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/e2e_spark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@ jobs:
name: "E2E about Spark Integration test"
runs-on: ubuntu-20.04
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# setting this to "true" frees about 6 GB,
# but might remove tools that are actually needed
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true

- name: Checkout current Volcano repository
if: github.event.inputs.volcano-branch==''
Expand Down
26 changes: 26 additions & 0 deletions test/e2e/util/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"

batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
Expand Down Expand Up @@ -275,22 +276,31 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err

func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum int) error {
var additionalError error
var podNotReadyCache map[string]*v1.Pod
err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)

readyTaskNum := 0
podNotReadyCache = make(map[string]*v1.Pod)
for _, pod := range pods.Items {
if !metav1.IsControlledBy(&pod, job) {
continue
}

podReady := false
for _, p := range phase {
if pod.Status.Phase == p {
readyTaskNum++
podReady = true
break
}
}

if !podReady {
podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
podNotReadyCache[podKey] = &pod
}
}

ready := taskNum <= readyTaskNum
Expand All @@ -302,11 +312,27 @@ func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase
return ready, nil
})
if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
logEventsOfNotReadyPods(ctx, podNotReadyCache, phase)
return fmt.Errorf("[Wait time out]: %s", additionalError)
}
return err
}

// logEventsOfNotReadyPods logs, for every pod in podNotReadyCache, that the pod
// has not reached any of the expected phases, followed by the "Failed" events
// recorded against that pod.
//
// It is a best-effort diagnostic helper invoked after a wait has already timed
// out. A failure to list events is therefore logged and skipped instead of
// being asserted on, so that it cannot replace the caller's timeout error with
// an unrelated assertion failure.
//
// ctx supplies the Kubernetes client, podNotReadyCache holds the pods to report
// (the map keys are not used here), and phase is the set of phases the caller
// was waiting for; it is only used in the log message.
func logEventsOfNotReadyPods(ctx *TestContext, podNotReadyCache map[string]*v1.Pod, phase []v1.PodPhase) {
	for _, pod := range podNotReadyCache {
		// Guard against nil entries so a diagnostic pass can never panic.
		if pod == nil {
			continue
		}

		klog.Errorf("The pod <%s/%s> is not in %v phase", pod.Namespace, pod.Name, phase)

		// Currently, only events whose reason is "Failed" are reported.
		fieldSelector := fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s,reason=Failed", pod.Name)
		events, err := ctx.Kubeclient.CoreV1().Events(pod.Namespace).List(context.TODO(), metav1.ListOptions{
			FieldSelector: fieldSelector,
		})
		if err != nil {
			// Keep going: the remaining pods may still yield useful events.
			klog.Errorf("failed to get events related with pod %s in namespace %s: %v", pod.Name, pod.Namespace, err)
			continue
		}

		for _, event := range events.Items {
			klog.Errorf("Event related with pod <%s/%s>: Reason: %s, Message: %s", pod.Namespace, pod.Name, event.Reason, event.Message)
		}
	}
}

func taskPhaseEx(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum map[string]int) error {
err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {

Expand Down
5 changes: 3 additions & 2 deletions test/e2e/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ const (
)

const (
DefaultBusyBoxImage = "busybox:1.24"
DefaultNginxImage = "nginx:1.14"
DefaultBusyBoxImage = "busybox"
DefaultNginxImage = "nginx"
DefaultMPIImage = "volcanosh/example-mpi:0.0.3"
DefaultTFImage = "volcanosh/dist-mnist-tf-example:0.0.1"
// "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1" is from "docker.io/kubeflowkatib/pytorch-mnist:v1beta1-9ee8fda"
DefaultPytorchImage = "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1"
LogTimeFormat = "[ 2006/01/02 15:04:05.000 ]"
)

func CPUResource(request string) v1.ResourceList {
Expand Down

0 comments on commit 06a26dc

Please sign in to comment.