From c4956dca50201b4bcf159a1c2188c2efded8960f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A2=D0=BE=D0=BC=D1=87=D0=B8=D0=BA=20=D0=9D=D0=B8=D0=BA?= =?UTF-8?q?=D0=B8=D1=82=D0=B0=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B5=D0=B2?= =?UTF-8?q?=D0=B8=D1=87?= Date: Thu, 13 Jul 2023 12:26:13 +0300 Subject: [PATCH 01/11] Fix panic issue with proportional scheduling Signed-off-by: Nikita --- pkg/scheduler/plugins/predicates/proportional.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/plugins/predicates/proportional.go b/pkg/scheduler/plugins/predicates/proportional.go index bd83d0f54f..469631e220 100644 --- a/pkg/scheduler/plugins/predicates/proportional.go +++ b/pkg/scheduler/plugins/predicates/proportional.go @@ -33,13 +33,13 @@ func checkNodeResourceIsProportional(task *api.TaskInfo, node *api.NodeInfo, pro return status, nil } } + for resourceName, resourceRate := range proportional { if value, found := node.Idle.ScalarResources[resourceName]; found { cpuReserved := value * resourceRate.CPU memoryReserved := value * resourceRate.Memory * 1000 * 1000 - r := node.Idle.Clone() - r = r.Sub(task.Resreq) - if r.MilliCPU < cpuReserved || r.Memory < memoryReserved { + + if node.Idle.MilliCPU-task.Resreq.MilliCPU < cpuReserved || node.Idle.Memory-task.Resreq.Memory < memoryReserved { status.Code = api.Unschedulable status.Reason = fmt.Sprintf("proportional of resource %s check failed", resourceName) return status, fmt.Errorf("proportional of resource %s check failed", resourceName) From ef7fdb3051b0c36bf91053b8656f301bc6d9480e Mon Sep 17 00:00:00 2001 From: william-wang Date: Mon, 24 Jul 2023 10:39:30 +0800 Subject: [PATCH 02/11] add the job creation permission for jobflow controller Signed-off-by: william-wang --- installer/helm/chart/volcano/templates/controllers.yaml | 2 +- installer/volcano-development.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index 7c2929df9e..f8e37170d0 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -20,7 +20,7 @@ rules: verbs: ["create", "get", "list", "watch", "delete"] - apiGroups: ["batch.volcano.sh"] resources: ["jobs"] - verbs: ["get", "list", "watch", "update", "delete"] + verbs: ["create", "get", "list", "watch", "update", "delete"] - apiGroups: ["batch.volcano.sh"] resources: ["jobs/status", "jobs/finalizers"] verbs: ["update", "patch"] diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml index 135ba20837..17ea59a768 100644 --- a/installer/volcano-development.yaml +++ b/installer/volcano-development.yaml @@ -8527,7 +8527,7 @@ rules: verbs: ["create", "get", "list", "watch", "delete"] - apiGroups: ["batch.volcano.sh"] resources: ["jobs"] - verbs: ["get", "list", "watch", "update", "delete"] + verbs: ["create", "get", "list", "watch", "update", "delete"] - apiGroups: ["batch.volcano.sh"] resources: ["jobs/status", "jobs/finalizers"] verbs: ["update", "patch"] From 2d4c0ada5f3fec1a7ab5cd4943f8266e499ac3be Mon Sep 17 00:00:00 2001 From: lowang_bh Date: Sat, 10 Jun 2023 12:39:06 +0800 Subject: [PATCH 03/11] make log more clear so that easy to debug Signed-off-by: lowang_bh --- .../Enhance-Generate-PodGroup-OwnerReferences-for-Normal-Pod.md | 2 +- pkg/controllers/job/job_controller_handler.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/Enhance-Generate-PodGroup-OwnerReferences-for-Normal-Pod.md b/docs/design/Enhance-Generate-PodGroup-OwnerReferences-for-Normal-Pod.md index 9e02688b22..2ccde83cdc 100644 --- a/docs/design/Enhance-Generate-PodGroup-OwnerReferences-for-Normal-Pod.md +++ b/docs/design/Enhance-Generate-PodGroup-OwnerReferences-for-Normal-Pod.md @@ -68,7 +68,7 @@ When controller need create a podgroup, it will check the pod `ownerReferences` ### DiscoveryClient & DynamicClient From the `ownerReferences`, we can get or create a GVK and owner resource name. -Discovery client is foucs on the k8s resources, it can get GVR from GVK. When get GVR and owner resource name, we can use Dynamic client to get the owner resource's `ownerReferences`. +Discovery client is focused on the k8s resources, it can get GVR from GVK. When get GVR and owner resource name, we can use Dynamic client to get the owner resource's `ownerReferences`. ### RBAC diff --git a/pkg/controllers/job/job_controller_handler.go b/pkg/controllers/job/job_controller_handler.go index 562072276c..3f632d508d 100644 --- a/pkg/controllers/job/job_controller_handler.go +++ b/pkg/controllers/job/job_controller_handler.go @@ -427,7 +427,7 @@ func (cc *jobcontroller) updatePodGroup(oldObj, newObj interface{}) { _, err := cc.cache.Get(jobcache.JobKeyByName(newPG.Namespace, jobNameKey)) if err != nil && newPG.Annotations != nil { klog.Warningf( - "Failed to find job in cache by PodGroup, this may not be a PodGroup for volcano job.") + "Failed to find job in cache by PodGroup(%s/%s), this may not be a PodGroup for volcano job.", newPG.Namespace, newPG.Name) } if newPG.Status.Phase != oldPG.Status.Phase { From ee79040f5aeff8f665334266b2029d8888a0527c Mon Sep 17 00:00:00 2001 From: wulixuan Date: Thu, 13 Jul 2023 17:50:41 +0800 Subject: [PATCH 04/11] fix scheduler metric e2e_scheduling_latency_milliseconds Signed-off-by: wulixuan --- pkg/scheduler/scheduler.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index fc57e6eab7..6fde8f1575 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -116,14 +116,16 @@ func (pc *Scheduler) runOnce() { } ssn := framework.OpenSession(pc.cache, plugins, configurations) - defer framework.CloseSession(ssn) + defer func() { + framework.CloseSession(ssn) + metrics.UpdateE2eDuration(metrics.Duration(scheduleStartTime)) + }() for _, action := range actions { actionStartTime := time.Now() action.Execute(ssn) metrics.UpdateActionDuration(action.Name(), metrics.Duration(actionStartTime)) } - metrics.UpdateE2eDuration(metrics.Duration(scheduleStartTime)) } func (pc *Scheduler) loadSchedulerConf() { From 8bfb07f99ff4d6ed6ed45c806819c74b271d2af7 Mon Sep 17 00:00:00 2001 From: wulixuan Date: Tue, 25 Jul 2023 19:28:36 +0800 Subject: [PATCH 05/11] support worker-threads-for-podgroup Signed-off-by: wulixuan --- cmd/controller-manager/app/options/options.go | 4 ++++ .../app/options/options_test.go | 18 +++++++++--------- cmd/controller-manager/app/server.go | 1 + pkg/controllers/framework/interface.go | 1 + pkg/controllers/podgroup/pg_controller.go | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go index 31b59554d7..6e4845d0a0 100644 --- a/cmd/controller-manager/app/options/options.go +++ b/cmd/controller-manager/app/options/options.go @@ -65,6 +65,9 @@ type ServerOption struct { DetectionPeriodOfDependsOntask time.Duration // To determine whether inherit owner's annotations for pods when create podgroup InheritOwnerAnnotations bool + // WorkerThreadsForPG is the number of threads syncing podgroup operations + // The larger the number, the faster the podgroup processing, but requires more CPU load. + WorkerThreadsForPG uint32 } type DecryptFunc func(c *ServerOption) error @@ -97,6 +100,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) { fs.DurationVar(&s.DetectionPeriodOfDependsOntask, "detection-period-of-dependson-task", defaultDetectionPeriodOfDependsOntask, "It indicates how often to detect the status of dependent tasks."+ "e.g. --detection-period-of-dependson-task=1s") fs.BoolVar(&s.InheritOwnerAnnotations, "inherit-owner-annotations", true, "Enable inherit owner annotations for pods when create podgroup; it is enabled by default") + fs.Uint32Var(&s.WorkerThreadsForPG, "worker-threads-for-podgroup", 1, "The number of threads syncing podgroup operations. The larger the number, the faster the podgroup processing, but requires more CPU load.") } // CheckOptionOrDie checks the LockObjectNamespace. diff --git a/cmd/controller-manager/app/options/options_test.go b/cmd/controller-manager/app/options/options_test.go index e3a7f56b82..f48beab1f8 100644 --- a/cmd/controller-manager/app/options/options_test.go +++ b/cmd/controller-manager/app/options/options_test.go @@ -46,15 +46,15 @@ func TestAddFlags(t *testing.T) { QPS: defaultQPS, Burst: 200, }, - PrintVersion: false, - WorkerThreads: defaultWorkers, - SchedulerNames: []string{"volcano", "volcano2"}, - MaxRequeueNum: defaultMaxRequeueNum, - HealthzBindAddress: ":11251", - DetectionPeriodOfDependsOntask: defaultDetectionPeriodOfDependsOntask, - InheritOwnerAnnotations: true, - EnableLeaderElection: true, - LockObjectNamespace: defaultLockObjectNamespace, + PrintVersion: false, + WorkerThreads: defaultWorkers, + SchedulerNames: []string{"volcano", "volcano2"}, + MaxRequeueNum: defaultMaxRequeueNum, + HealthzBindAddress: ":11251", + InheritOwnerAnnotations: true, + EnableLeaderElection: true, + LockObjectNamespace: defaultLockObjectNamespace, + WorkerThreadsForPG: 1, } if !reflect.DeepEqual(expected, s) { diff --git a/cmd/controller-manager/app/server.go b/cmd/controller-manager/app/server.go index a8eaeaa1c6..c0822842cf 100644 --- a/cmd/controller-manager/app/server.go +++ b/cmd/controller-manager/app/server.go @@ -127,6 +127,7 @@ func startControllers(config *rest.Config, opt *options.ServerOption) func(ctx c controllerOpt.VolcanoClient = vcclientset.NewForConfigOrDie(config) controllerOpt.SharedInformerFactory = informers.NewSharedInformerFactory(controllerOpt.KubeClient, 0) controllerOpt.InheritOwnerAnnotations = opt.InheritOwnerAnnotations + controllerOpt.WorkerThreadsForPG = opt.WorkerThreadsForPG return func(ctx context.Context) { framework.ForeachController(func(c framework.Controller) { diff --git a/pkg/controllers/framework/interface.go b/pkg/controllers/framework/interface.go index cc594ef644..1af10d99b8 100644 --- a/pkg/controllers/framework/interface.go +++ b/pkg/controllers/framework/interface.go @@ -33,6 +33,7 @@ type ControllerOption struct { MaxRequeueNum int InheritOwnerAnnotations bool + WorkerThreadsForPG uint32 } // Controller is the interface of all controllers. diff --git a/pkg/controllers/podgroup/pg_controller.go b/pkg/controllers/podgroup/pg_controller.go index 4767da6ab4..bb0272e118 100644 --- a/pkg/controllers/podgroup/pg_controller.go +++ b/pkg/controllers/podgroup/pg_controller.go @@ -81,7 +81,7 @@ func (pg *pgcontroller) Name() string { func (pg *pgcontroller) Initialize(opt *framework.ControllerOption) error { pg.kubeClient = opt.KubeClient pg.vcClient = opt.VolcanoClient - pg.workers = opt.WorkerNum + pg.workers = opt.WorkerThreadsForPG pg.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) From bcbaae15fd1a7bbf0079eda83038412711f48c87 Mon Sep 17 00:00:00 2001 From: mazhongbao Date: Thu, 27 Jul 2023 19:58:23 +0800 Subject: [PATCH 06/11] add the jobflows/status,jobs/finalizers,jobtemplates/status,jobtemplates/finalizers permission for jobflow controller Signed-off-by: mazhongbao --- installer/helm/chart/volcano/templates/controllers.yaml | 3 +++ installer/volcano-development.yaml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index f8e37170d0..3ec0c7c4fa 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -54,6 +54,9 @@ rules: - apiGroups: ["flow.volcano.sh"] resources: ["jobflows", "jobtemplates"] verbs: ["get", "list", "watch", "create", "delete", "update"] + - apiGroups: [ "flow.volcano.sh" ] + resources: [ "jobflows/status", "jobs/finalizers","jobtemplates/status", "jobtemplates/finalizers" ] + verbs: [ "update", "patch" ] - apiGroups: ["scheduling.k8s.io"] resources: ["priorityclasses"] verbs: ["get", "list", "watch", "create", "delete"] diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml index 17ea59a768..ebb3f48f0c 100644 --- a/installer/volcano-development.yaml +++ b/installer/volcano-development.yaml @@ -8561,6 +8561,9 @@ rules: - apiGroups: ["flow.volcano.sh"] resources: ["jobflows", "jobtemplates"] verbs: ["get", "list", "watch", "create", "delete", "update"] + - apiGroups: [ "flow.volcano.sh" ] + resources: [ "jobflows/status", "jobs/finalizers","jobtemplates/status", "jobtemplates/finalizers" ] + verbs: [ "update", "patch" ] - apiGroups: ["scheduling.k8s.io"] resources: ["priorityclasses"] verbs: ["get", "list", "watch", "create", "delete"] From 8e7b020ef189d220e04aad17a82e33a189421f46 Mon Sep 17 00:00:00 2001 From: Tongruizhe Date: Sun, 30 Jul 2023 21:35:50 +0800 Subject: [PATCH 07/11] Fix panic issue with job taskMinAvailable clone Signed-off-by: Tongruizhe --- pkg/scheduler/api/job_info.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/scheduler/api/job_info.go b/pkg/scheduler/api/job_info.go index 68b2d7f5d4..f4b4601aa9 100644 --- a/pkg/scheduler/api/job_info.go +++ b/pkg/scheduler/api/job_info.go @@ -574,7 +574,7 @@ func (ji *JobInfo) Clone() *JobInfo { PodGroup: ji.PodGroup.Clone(), TaskStatusIndex: map[TaskStatus]tasksMap{}, - TaskMinAvailable: ji.TaskMinAvailable, + TaskMinAvailable: make(map[TaskID]int32), TaskMinAvailableTotal: ji.TaskMinAvailableTotal, Tasks: tasksMap{}, Preemptable: ji.Preemptable, @@ -584,6 +584,9 @@ func (ji *JobInfo) Clone() *JobInfo { ji.CreationTimestamp.DeepCopyInto(&info.CreationTimestamp) + for task, minAvailable := range ji.TaskMinAvailable { + info.TaskMinAvailable[task] = minAvailable + } for _, task := range ji.Tasks { info.AddTaskInfo(task.Clone()) } From 3c41a4bb1798b556a3f49f00078232e921688d58 Mon Sep 17 00:00:00 2001 From: Ren Date: Sun, 25 Jun 2023 14:27:30 +0800 Subject: [PATCH 08/11] change wait dependson job log level to error Signed-off-by: Ren --- cmd/controller-manager/app/options/options.go | 2 ++ pkg/controllers/job/job_controller_actions.go | 31 ++++++++++++++++++- pkg/controllers/job/job_controller_util.go | 1 - 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go index 6e4845d0a0..cc5e025fc8 100644 --- a/cmd/controller-manager/app/options/options.go +++ b/cmd/controller-manager/app/options/options.go @@ -23,6 +23,8 @@ import ( "github.com/spf13/pflag" + "github.com/spf13/pflag" + "volcano.sh/volcano/pkg/kube" ) diff --git a/pkg/controllers/job/job_controller_actions.go b/pkg/controllers/job/job_controller_actions.go index 9dad507b4e..32fc8e119c 100644 --- a/pkg/controllers/job/job_controller_actions.go +++ b/pkg/controllers/job/job_controller_actions.go @@ -378,7 +378,16 @@ func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.Updat go func(taskName string, podToCreateEachTask []*v1.Pod) { taskIndex := jobhelpers.GetTasklndexUnderJob(taskName, job) if job.Spec.Tasks[taskIndex].DependsOn != nil { - cc.waitDependsOnTaskMeetCondition(taskName, taskIndex, podToCreateEachTask, job) + if !cc.waitDependsOnTaskMeetCondition(taskName, taskIndex, podToCreateEachTask, job) { + klog.V(3).Infof("Job %s/%s depends on task not ready", job.Name, job.Namespace) + // release wait group + for _, pod := range podToCreateEachTask { + go func(pod *v1.Pod) { + defer waitCreationGroup.Done() + }(pod) + } + return + } } for _, pod := range podToCreateEachTask { @@ -484,6 +493,7 @@ func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskInd dependsOn := *job.Spec.Tasks[taskIndex].DependsOn if len(dependsOn.Name) > 1 && dependsOn.Iteration == batch.IterationAny { +<<<<<<< HEAD wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) { for _, task := range dependsOn.Name { if cc.isDependsOnPodsReady(task, job) { @@ -502,6 +512,25 @@ func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskInd }) } } +======= + // any ready to create task, return true + for _, task := range dependsOn.Name { + if cc.isDependsOnPodsReady(task, job) { + return true + } + } + // all not ready to skip create task, return false + return false + } + for _, dependsOnTask := range dependsOn.Name { + // any not ready to skip create task, return false + if !cc.isDependsOnPodsReady(dependsOnTask, job) { + return false + } + } + // all ready to create task, return true + return true +>>>>>>> 6d0ab3b8d... change wait dependson job log level to error } func (cc *jobcontroller) isDependsOnPodsReady(task string, job *batch.Job) bool { diff --git a/pkg/controllers/job/job_controller_util.go b/pkg/controllers/job/job_controller_util.go index 6f93c706ac..ecbfd74b72 100644 --- a/pkg/controllers/job/job_controller_util.go +++ b/pkg/controllers/job/job_controller_util.go @@ -18,7 +18,6 @@ package job import ( "fmt" - "time" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" From 809b9137dfac114dc7d67959c1ff7889b84085b8 Mon Sep 17 00:00:00 2001 From: Ren Date: Wed, 14 Jun 2023 15:19:00 +0800 Subject: [PATCH 09/11] delete error log Signed-off-by: Ren --- pkg/controllers/job/job_controller_actions.go | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/pkg/controllers/job/job_controller_actions.go b/pkg/controllers/job/job_controller_actions.go index 32fc8e119c..8f08fcc153 100644 --- a/pkg/controllers/job/job_controller_actions.go +++ b/pkg/controllers/job/job_controller_actions.go @@ -493,26 +493,6 @@ func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskInd dependsOn := *job.Spec.Tasks[taskIndex].DependsOn if len(dependsOn.Name) > 1 && dependsOn.Iteration == batch.IterationAny { -<<<<<<< HEAD - wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) { - for _, task := range dependsOn.Name { - if cc.isDependsOnPodsReady(task, job) { - return true, nil - } - } - return false, nil - }) - } else { - for _, dependsOnTask := range dependsOn.Name { - wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) { - if cc.isDependsOnPodsReady(dependsOnTask, job) { - return true, nil - } - return false, nil - }) - } - } -======= // any ready to create task, return true for _, task := range dependsOn.Name { if cc.isDependsOnPodsReady(task, job) { @@ -530,7 +510,6 @@ func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskInd } // all ready to create task, return true return true ->>>>>>> 6d0ab3b8d... change wait dependson job log level to error } func (cc *jobcontroller) isDependsOnPodsReady(task string, job *batch.Job) bool { From 2207a33acfc4baa5112df5f487d498d87799296d Mon Sep 17 00:00:00 2001 From: Ren Date: Wed, 7 Jun 2023 17:56:14 +0800 Subject: [PATCH 10/11] delete dependson task wait.PollInfinite Signed-off-by: Ren --- cmd/controller-manager/app/options/options.go | 24 ++-- cmd/controller-manager/app/server.go | 3 - pkg/controllers/job/job_controller_actions.go | 7 +- .../job/job_controller_actions_test.go | 108 +++++++++++++++++- pkg/controllers/job/job_controller_util.go | 6 - 5 files changed, 116 insertions(+), 32 deletions(-) diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go index cc5e025fc8..c62c0d3056 100644 --- a/cmd/controller-manager/app/options/options.go +++ b/cmd/controller-manager/app/options/options.go @@ -18,10 +18,8 @@ package options import ( "fmt" - "os" - "time" - "github.com/spf13/pflag" + "os" "github.com/spf13/pflag" @@ -29,14 +27,13 @@ import ( ) const ( - defaultQPS = 50.0 - defaultBurst = 100 - defaultWorkers = 3 - defaultMaxRequeueNum = 15 - defaultSchedulerName = "volcano" - defaultHealthzAddress = ":11251" - defaultDetectionPeriodOfDependsOntask = 100 * time.Millisecond - defaultLockObjectNamespace = "volcano-system" + defaultQPS = 50.0 + defaultBurst = 100 + defaultWorkers = 3 + defaultMaxRequeueNum = 15 + defaultSchedulerName = "volcano" + defaultHealthzAddress = ":11251" + defaultLockObjectNamespace = "volcano-system" ) // ServerOption is the main context object for the controllers. @@ -62,9 +59,6 @@ type ServerOption struct { // defaulting to 0.0.0.0:11252 HealthzBindAddress string EnableHealthz bool - // For dependent tasks, there is a detection cycle inside volcano - // It indicates how often to detect the status of dependent tasks - DetectionPeriodOfDependsOntask time.Duration // To determine whether inherit owner's annotations for pods when create podgroup InheritOwnerAnnotations bool // WorkerThreadsForPG is the number of threads syncing podgroup operations @@ -99,8 +93,6 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) { fs.IntVar(&s.MaxRequeueNum, "max-requeue-num", defaultMaxRequeueNum, "The number of times a job, queue or command will be requeued before it is dropped out of the queue") fs.StringVar(&s.HealthzBindAddress, "healthz-address", defaultHealthzAddress, "The address to listen on for the health check server.") fs.BoolVar(&s.EnableHealthz, "enable-healthz", false, "Enable the health check; it is false by default") - fs.DurationVar(&s.DetectionPeriodOfDependsOntask, "detection-period-of-dependson-task", defaultDetectionPeriodOfDependsOntask, "It indicates how often to detect the status of dependent tasks."+ - "e.g. --detection-period-of-dependson-task=1s") fs.BoolVar(&s.InheritOwnerAnnotations, "inherit-owner-annotations", true, "Enable inherit owner annotations for pods when create podgroup; it is enabled by default") fs.Uint32Var(&s.WorkerThreadsForPG, "worker-threads-for-podgroup", 1, "The number of threads syncing podgroup operations. The larger the number, the faster the podgroup processing, but requires more CPU load.") } diff --git a/cmd/controller-manager/app/server.go b/cmd/controller-manager/app/server.go index c0822842cf..c74dff44ed 100644 --- a/cmd/controller-manager/app/server.go +++ b/cmd/controller-manager/app/server.go @@ -38,7 +38,6 @@ import ( vcclientset "volcano.sh/apis/pkg/client/clientset/versioned" "volcano.sh/volcano/cmd/controller-manager/app/options" "volcano.sh/volcano/pkg/controllers/framework" - "volcano.sh/volcano/pkg/controllers/job" "volcano.sh/volcano/pkg/kube" ) @@ -61,8 +60,6 @@ func Run(opt *options.ServerOption) error { } } - job.SetDetectionPeriodOfDependsOntask(opt.DetectionPeriodOfDependsOntask) - run := startControllers(config, opt) if !opt.EnableLeaderElection { diff --git a/pkg/controllers/job/job_controller_actions.go b/pkg/controllers/job/job_controller_actions.go index 8f08fcc153..2c647d411f 100644 --- a/pkg/controllers/job/job_controller_actions.go +++ b/pkg/controllers/job/job_controller_actions.go @@ -18,6 +18,7 @@ package job import ( "context" + "errors" "fmt" "reflect" "sort" @@ -28,7 +29,6 @@ import ( v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" quotav1 "k8s.io/apiserver/pkg/quota/v1" "k8s.io/klog/v2" @@ -486,11 +486,10 @@ func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.Updat return nil } -func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskIndex int, podToCreateEachTask []*v1.Pod, job *batch.Job) { +func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskIndex int, podToCreateEachTask []*v1.Pod, job *batch.Job) bool { if job.Spec.Tasks[taskIndex].DependsOn == nil { - return + return true } - dependsOn := *job.Spec.Tasks[taskIndex].DependsOn if len(dependsOn.Name) > 1 && dependsOn.Iteration == batch.IterationAny { // any ready to create task, return true diff --git a/pkg/controllers/job/job_controller_actions_test.go b/pkg/controllers/job/job_controller_actions_test.go index 687b965d32..a756020de9 100644 --- a/pkg/controllers/job/job_controller_actions_test.go +++ b/pkg/controllers/job/job_controller_actions_test.go @@ -20,12 +20,11 @@ import ( "context" "errors" "fmt" - "reflect" - "testing" - "github.com/agiledragon/gomonkey/v2" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "reflect" + "testing" "volcano.sh/apis/pkg/apis/batch/v1alpha1" schedulingapi "volcano.sh/apis/pkg/apis/scheduling/v1beta1" @@ -261,6 +260,109 @@ func TestSyncJobFunc(t *testing.T) { Plugins: []string{"svc", "ssh", "env"}, ExpectVal: nil, }, + { + Name: "SyncJob with dependsOn job can't find the dependent task", + /* + Work dependsOn Master task, preempt actions causes controller deadlock + controller master,work scheduler + | | <---preempt----- | + | | <---kill work--- | + | ----watch work kill----> | | + | | <---kill master-- | + | ----create work pods---> | | + | --wait master running--> | | + | | | + | ---watch master kill---> | | + | --push master to queue-> | | + | -wait process to create-> | | + */ + Job: &v1alpha1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job1", + Namespace: namespace, + ResourceVersion: "100", + UID: "e7f18111-1cec-11ea-b688-fa163ec79500", + }, + Spec: v1alpha1.JobSpec{ + Tasks: []v1alpha1.TaskSpec{ + { + Name: "master", + Replicas: 1, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pods", + Namespace: namespace, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "Containers", + }, + }, + }, + }, + }, + { + Name: "work", + Replicas: 3, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pods", + Namespace: namespace, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "Containers", + }, + }, + }, + }, + DependsOn: &v1alpha1.DependsOn{ + Name: []string{"master"}, + }, + }, + }, + }, + Status: v1alpha1.JobStatus{ + State: v1alpha1.JobState{ + Phase: v1alpha1.Pending, + }, + }, + }, + PodGroup: &schedulingapi.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "job1-e7f18111-1cec-11ea-b688-fa163ec79500", + Namespace: namespace, + }, + Spec: schedulingapi.PodGroupSpec{ + MinResources: &v1.ResourceList{}, + MinTaskMember: map[string]int32{}, + }, + Status: schedulingapi.PodGroupStatus{ + Phase: schedulingapi.PodGroupInqueue, + }, + }, + PodRetainPhase: state.PodRetainPhaseNone, + UpdateStatus: nil, + JobInfo: &apis.JobInfo{ + Namespace: namespace, + Name: "jobinfo1", + Pods: map[string]map[string]*v1.Pod{ + "work": { + "job1-work-0": buildPod(namespace, "job1-work-0", v1.PodRunning, nil), + "job1-work-1": buildPod(namespace, "job1-work-1", v1.PodRunning, nil), + }, + }, + }, + Pods: map[string]*v1.Pod{ + "job1-work-0": buildPod(namespace, "job1-work-0", v1.PodRunning, nil), + "job1-work-1": buildPod(namespace, "job1-work-1", v1.PodRunning, nil), + }, + TotalNumPods: 4, + Plugins: []string{"svc", "ssh", "env"}, + ExpectVal: nil, + }, } for i, testcase := range testcases { diff --git a/pkg/controllers/job/job_controller_util.go b/pkg/controllers/job/job_controller_util.go index ecbfd74b72..d5ca852ed9 100644 --- a/pkg/controllers/job/job_controller_util.go +++ b/pkg/controllers/job/job_controller_util.go @@ -32,8 +32,6 @@ import ( jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers" ) -var detectionPeriodOfDependsOntask time.Duration - // MakePodName append podname,jobname,taskName and index and returns the string. func MakePodName(jobName string, taskName string, index int) string { return fmt.Sprintf(jobhelpers.PodNameFmt, jobName, taskName, index) @@ -257,7 +255,3 @@ func isControlledBy(obj metav1.Object, gvk schema.GroupVersionKind) bool { } return false } - -func SetDetectionPeriodOfDependsOntask(period time.Duration) { - detectionPeriodOfDependsOntask = period -} From 6423ab0a53b5e9c54f7241e8324fe64bcb6665dc Mon Sep 17 00:00:00 2001 From: william-wang Date: Fri, 4 Aug 2023 18:11:10 +0800 Subject: [PATCH 11/11] fix the ci failure Signed-off-by: william-wang --- cmd/controller-manager/app/options/options.go | 1 - pkg/controllers/job/job_controller_actions.go | 1 - 2 files changed, 2 deletions(-) diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go index c62c0d3056..f73838c602 100644 --- a/cmd/controller-manager/app/options/options.go +++ b/cmd/controller-manager/app/options/options.go @@ -18,7 +18,6 @@ package options import ( "fmt" - "github.com/spf13/pflag" "os" "github.com/spf13/pflag" diff --git a/pkg/controllers/job/job_controller_actions.go b/pkg/controllers/job/job_controller_actions.go index 2c647d411f..10dae61615 100644 --- a/pkg/controllers/job/job_controller_actions.go +++ b/pkg/controllers/job/job_controller_actions.go @@ -18,7 +18,6 @@ package job import ( "context" - "errors" "fmt" "reflect" "sort"