From ecc1622ab8cc83bb2ab33c4f543cfb80b3192f37 Mon Sep 17 00:00:00 2001
From: chenfengyu
Date: Mon, 19 Jun 2023 01:15:17 +0800
Subject: [PATCH 01/12] avoid pods being stuck in Terminating state for a long time

Signed-off-by: chenfengyu
---
 pkg/scheduler/api/helpers.go | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pkg/scheduler/api/helpers.go b/pkg/scheduler/api/helpers.go
index 7509da2981..7622d47716 100644
--- a/pkg/scheduler/api/helpers.go
+++ b/pkg/scheduler/api/helpers.go
@@ -18,6 +18,7 @@ package api

 import (
 	"fmt"
+	"time"

 	v1 "k8s.io/api/core/v1"
 	clientcache "k8s.io/client-go/tools/cache"
@@ -33,15 +34,22 @@ func PodKey(pod *v1.Pod) TaskID {
 }

 func getTaskStatus(pod *v1.Pod) TaskStatus {
+	var gracePeriodSeconds int64 = 30
+	if pod.Spec.TerminationGracePeriodSeconds != nil {
+		// default grace period
+		gracePeriodSeconds = *pod.Spec.TerminationGracePeriodSeconds
+	}
 	switch pod.Status.Phase {
 	case v1.PodRunning:
-		if pod.DeletionTimestamp != nil {
+		if pod.DeletionTimestamp != nil &&
+			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= gracePeriodSeconds {
 			return Releasing
 		}

 		return Running
 	case v1.PodPending:
-		if pod.DeletionTimestamp != nil {
+		if pod.DeletionTimestamp != nil &&
+			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= gracePeriodSeconds {
 			return Releasing
 		}

From 2692c580bc4c2cb06a907282ff9d0a88ee4eeae1 Mon Sep 17 00:00:00 2001
From: chenfengyu
Date: Mon, 26 Jun 2023 00:57:50 +0800
Subject: [PATCH 02/12] add task status named ReleasingFailed

Signed-off-by: chenfengyu
---
 pkg/scheduler/api/helpers.go       | 6 +++++-
 pkg/scheduler/api/job_info.go      | 1 +
 pkg/scheduler/api/types.go         | 5 +++++
 pkg/scheduler/framework/session.go | 5 ++++-
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/pkg/scheduler/api/helpers.go b/pkg/scheduler/api/helpers.go
index 7622d47716..2959d9296e 100644
--- a/pkg/scheduler/api/helpers.go
+++ b/pkg/scheduler/api/helpers.go
@@ -44,6 +44,8 @@ func getTaskStatus(pod *v1.Pod) TaskStatus {
 		if pod.DeletionTimestamp != nil &&
 			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= gracePeriodSeconds {
 			return Releasing
+		} else if pod.DeletionTimestamp != nil {
+			return ReleasingFailed
 		}

 		return Running
@@ -51,6 +53,8 @@ func getTaskStatus(pod *v1.Pod) TaskStatus {
 		if pod.DeletionTimestamp != nil &&
 			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= gracePeriodSeconds {
 			return Releasing
+		} else if pod.DeletionTimestamp != nil {
+			return ReleasingFailed
 		}

 		if len(pod.Spec.NodeName) == 0 {
@@ -71,7 +75,7 @@
 // AllocatedStatus checks whether the tasks has AllocatedStatus
 func AllocatedStatus(status TaskStatus) bool {
 	switch status {
-	case Bound, Binding, Running, Allocated:
+	case Bound, Binding, Running, Allocated, ReleasingFailed:
 		return true
 	default:
 		return false
diff --git a/pkg/scheduler/api/job_info.go b/pkg/scheduler/api/job_info.go
index b0b04b980b..d1ba5627f5 100644
--- a/pkg/scheduler/api/job_info.go
+++ b/pkg/scheduler/api/job_info.go
@@ -678,6 +678,7 @@ func (ji *JobInfo) ReadyTaskNum() int32 {
 	occupied += len(ji.TaskStatusIndex[Bound])
 	occupied += len(ji.TaskStatusIndex[Binding])
 	occupied += len(ji.TaskStatusIndex[Running])
+	occupied += len(ji.TaskStatusIndex[ReleasingFailed])
 	occupied += len(ji.TaskStatusIndex[Allocated])
 	occupied += len(ji.TaskStatusIndex[Succeeded])

diff --git a/pkg/scheduler/api/types.go b/pkg/scheduler/api/types.go
index 699cf8a918..d81562d815 100644
--- a/pkg/scheduler/api/types.go
+++ b/pkg/scheduler/api/types.go
@@ -45,6 +45,9 @@ const (
	// Releasing means a 
task/pod is deleted.
 	Releasing

+	// ReleasingFailed means a task/pod failed to be deleted within its grace period
+	ReleasingFailed
+
 	// Succeeded means that all containers in the pod have voluntarily terminated
 	// with a container exit code of 0, and the system is not going to restart any of these containers.
 	Succeeded
@@ -73,6 +76,8 @@ func (ts TaskStatus) String() string {
 		return "Running"
 	case Releasing:
 		return "Releasing"
+	case ReleasingFailed:
+		return "ReleasingFailed"
 	case Succeeded:
 		return "Succeeded"
 	case Failed:
diff --git a/pkg/scheduler/framework/session.go b/pkg/scheduler/framework/session.go
index d95174e7cf..6a7348f574 100644
--- a/pkg/scheduler/framework/session.go
+++ b/pkg/scheduler/framework/session.go
@@ -201,6 +201,9 @@ func updateQueueStatus(ssn *Session) {
 		for _, runningTask := range job.TaskStatusIndex[api.Running] {
 			allocatedResources[job.Queue].Add(runningTask.Resreq)
 		}
+		for _, runningTask := range job.TaskStatusIndex[api.ReleasingFailed] {
+			allocatedResources[job.Queue].Add(runningTask.Resreq)
+		}
 	}

 	// update queue status
@@ -277,7 +280,7 @@ func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
 		}
 	}

-	status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]))
+	status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]) + len(jobInfo.TaskStatusIndex[api.ReleasingFailed]))
 	status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed]))
 	status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded]))

From fd6f16aea33990cd0816521e05ed2908fd35dc6e Mon Sep 17 00:00:00 2001
From: chenfengyu
Date: Tue, 27 Jun 2023 00:20:54 +0800
Subject: [PATCH 03/12] set grace period seconds from server options

Signed-off-by: chenfengyu
---
 cmd/scheduler/app/options/options.go | 8 ++++++++
 pkg/scheduler/api/helpers.go         | 5 ++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/cmd/scheduler/app/options/options.go b/cmd/scheduler/app/options/options.go
index 7d91e203ab..db669ea4bc 100644
--- a/cmd/scheduler/app/options/options.go
+++ b/cmd/scheduler/app/options/options.go
@@ -42,6 +42,8 @@ const (
 	defaultMinNodesToFind          = 100
 	defaultPercentageOfNodesToFind = 100
 	defaultLockObjectNamespace     = "volcano-system"
+	defaultGracePeriodSeconds      = 30
+	defaultGracePeriodFactor       = 1.1
 )

 // ServerOption is the main context object for the controller manager.
@@ -75,6 +77,9 @@ type ServerOption struct {
 	NodeSelector []string

 	EnableCacheDumper bool
+
+	GracePeriodSeconds     int64
+	GracePeriodSecondsWait int64
 }

 type DecryptFunc func(c *ServerOption) error
@@ -128,6 +133,9 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
 	fs.BoolVar(&s.EnableMetrics, "enable-metrics", false, "Enable the metrics function; it is false by default")
 	fs.StringSliceVar(&s.NodeSelector, "node-selector", nil, "volcano only work with the labeled node, like: --node-selector=volcano.sh/role:train --node-selector=volcano.sh/role:serving")
 	fs.BoolVar(&s.EnableCacheDumper, "cache-dumper", true, "Enable the cache dumper, it's true by default")
+
+	fs.Int64Var(&s.GracePeriodSeconds, "grace-period", defaultGracePeriodSeconds, "default grace period in seconds for pods that do not set terminationGracePeriodSeconds")
+	fs.Int64Var(&s.GracePeriodSecondsWait, "grace-period-wait", defaultGracePeriodFactor, "extra seconds to wait after a pod's grace period expires before treating the deletion as failed")
 }

 // CheckOptionOrDie check lock-object-namespace when LeaderElection is enabled. 
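
Taken together, the helpers.go changes in patches 01-03 classify a deleted pod by comparing the age of its DeletionTimestamp against an effective wait window: the pod's own grace period (or the scheduler's default) plus the extra wait. A minimal standalone sketch of that decision logic, with simplified TaskStatus values and the configured values passed in directly rather than read from options.ServerOpts:

    package main

    import (
    	"fmt"
    	"time"
    )

    type TaskStatus string

    const (
    	Releasing       TaskStatus = "Releasing"
    	ReleasingFailed TaskStatus = "ReleasingFailed"
    )

    // classifyDeleted mirrors the getTaskStatus logic for a pod whose
    // DeletionTimestamp is set: within the pod's grace period plus the
    // scheduler's extra wait it is still Releasing; beyond that the
    // deletion is considered to have failed.
    func classifyDeleted(deletedAt time.Time, podGracePeriod *int64, defaultGracePeriod, extraWait int64) TaskStatus {
    	gracePeriodSeconds := defaultGracePeriod
    	if podGracePeriod != nil {
    		gracePeriodSeconds = *podGracePeriod
    	}
    	gracePeriodSeconds += extraWait
    	if time.Now().Unix()-deletedAt.Unix() <= gracePeriodSeconds {
    		return Releasing
    	}
    	return ReleasingFailed
    }

    func main() {
    	perPod := int64(5)
    	// Deleted 10s ago with a 5s pod-level grace period + 3s wait: failed.
    	fmt.Println(classifyDeleted(time.Now().Add(-10*time.Second), &perPod, 30, 3))
    	// Deleted 10s ago with the 30s default grace period + 3s wait: still releasing.
    	fmt.Println(classifyDeleted(time.Now().Add(-10*time.Second), nil, 30, 3))
    }
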
diff --git a/pkg/scheduler/api/helpers.go b/pkg/scheduler/api/helpers.go index 2959d9296e..3ca103adab 100644 --- a/pkg/scheduler/api/helpers.go +++ b/pkg/scheduler/api/helpers.go @@ -19,6 +19,7 @@ package api import ( "fmt" "time" + "volcano.sh/volcano/cmd/scheduler/app/options" v1 "k8s.io/api/core/v1" clientcache "k8s.io/client-go/tools/cache" @@ -34,11 +35,13 @@ func PodKey(pod *v1.Pod) TaskID { } func getTaskStatus(pod *v1.Pod) TaskStatus { - var gracePeriodSeconds int64 = 30 + opts := options.ServerOpts + gracePeriodSeconds := opts.GracePeriodSeconds if pod.Spec.TerminationGracePeriodSeconds != nil { // default grace period gracePeriodSeconds = *pod.Spec.TerminationGracePeriodSeconds } + gracePeriodSeconds += opts.GracePeriodSecondsWait switch pod.Status.Phase { case v1.PodRunning: if pod.DeletionTimestamp != nil && From 1b338b670450723f0e013ada486f205bb6a54aa9 Mon Sep 17 00:00:00 2001 From: kingeasternsun Date: Tue, 3 Jan 2023 03:13:17 +0000 Subject: [PATCH 04/12] fix bug when deal with missing resource dimension in left Signed-off-by: kingeasternsun Signed-off-by: chenfengyu --- pkg/scheduler/api/resource_info.go | 32 +++++++++++++++++++ pkg/scheduler/api/resource_info_test.go | 30 +++++++++++++++-- .../plugins/proportion/proportion.go | 13 ++++++-- 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index b4977222d7..b42f45fd11 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -345,6 +345,14 @@ func (r *Resource) Less(rr *Resource, defaultValue DimensionDefaultValue) bool { return false } + if defaultValue == Infinity { + for name := range rr.ScalarResources { + if _, ok := r.ScalarResources[name]; !ok { + return false + } + } + } + for resourceName, leftValue := range r.ScalarResources { rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { @@ -376,6 +384,14 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b return false } + if defaultValue == Infinity { + for name := range rr.ScalarResources { + if _, ok := r.ScalarResources[name]; !ok { + return false + } + } + } + for resourceName, leftValue := range r.ScalarResources { rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { @@ -433,6 +449,14 @@ func (r *Resource) LessPartly(rr *Resource, defaultValue DimensionDefaultValue) return true } + if defaultValue == Zero { + for name := range rr.ScalarResources { + if _, ok := r.ScalarResources[name]; !ok { + return true + } + } + } + for resourceName, leftValue := range r.ScalarResources { rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { @@ -461,6 +485,14 @@ func (r *Resource) LessEqualPartly(rr *Resource, defaultValue DimensionDefaultVa return true } + if defaultValue == Zero { + for name := range rr.ScalarResources { + if _, ok := r.ScalarResources[name]; !ok { + return true + } + } + } + for resourceName, leftValue := range r.ScalarResources { rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { diff --git a/pkg/scheduler/api/resource_info_test.go b/pkg/scheduler/api/resource_info_test.go index bb97ce0261..f4f42d9dd5 100644 --- a/pkg/scheduler/api/resource_info_test.go +++ b/pkg/scheduler/api/resource_info_test.go @@ -557,7 +557,7 @@ func TestLess(t *testing.T) { Memory: 2000, ScalarResources: map[v1.ResourceName]float64{"scalar.test/scalar1": 1000, "hugepages-test": 2000}, }, - expected: 
true, + expected: false, }, { resource1: &Resource{ @@ -701,7 +701,7 @@ func TestLessEqual(t *testing.T) { Memory: 2000, ScalarResources: map[v1.ResourceName]float64{"scalar.test/scalar1": 1000, "hugepages-test": 2000}, }, - expected: true, + expected: false, }, { resource1: &Resource{ @@ -712,6 +712,30 @@ func TestLessEqual(t *testing.T) { resource2: &Resource{}, expected: false, }, + { + resource1: &Resource{ + MilliCPU: 4000, + Memory: 2000, + }, + resource2: &Resource{ + MilliCPU: 4000, + Memory: 2000, + ScalarResources: map[v1.ResourceName]float64{"scalar.test/scalar1": 1000, "hugepages-test": 2000}, + }, + expected: false, + }, + { + resource1: &Resource{ + MilliCPU: 4000, + Memory: 2000, + ScalarResources: map[v1.ResourceName]float64{"scalar.test/scalar1": 1000, "hugepages-test": 2000}, + }, + resource2: &Resource{ + MilliCPU: 4000, + Memory: 2000, + }, + expected: true, + }, } for _, test := range testsForDefaultZero { @@ -807,7 +831,7 @@ func TestLessPartly(t *testing.T) { Memory: 2000, ScalarResources: map[v1.ResourceName]float64{"scalar.test/scalar1": 1000, "hugepages-test": 2000}, }, - expected: false, + expected: true, }, { resource1: &Resource{ diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go index 86b85794c4..44fb56274a 100644 --- a/pkg/scheduler/plugins/proportion/proportion.go +++ b/pkg/scheduler/plugins/proportion/proportion.go @@ -242,7 +242,7 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) { remaining.Sub(increasedDeserved).Add(decreasedDeserved) klog.V(4).Infof("Remaining resource is <%s>", remaining) if remaining.IsEmpty() || reflect.DeepEqual(remaining, oldRemaining) { - klog.V(4).Infof("Exiting when remaining is empty or no queue has more reosurce request: <%v>", remaining) + klog.V(4).Infof("Exiting when remaining is empty or no queue has more resource request: <%v>", remaining) break } } @@ -337,7 +337,16 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) { klog.V(5).Infof("job %s min resource <%s>, queue %s capability <%s> allocated <%s> inqueue <%s> elastic <%s>", job.Name, minReq.String(), queue.Name, attr.realCapability.String(), attr.allocated.String(), attr.inqueue.String(), attr.elastic.String()) // The queue resource quota limit has not reached - inqueue := minReq.Add(attr.allocated).Add(attr.inqueue).Sub(attr.elastic).LessEqual(attr.realCapability, api.Infinity) + r := minReq.Add(attr.allocated).Add(attr.inqueue).Sub(attr.elastic) + rr := attr.realCapability.Clone() + + for name := range rr.ScalarResources { + if _, ok := r.ScalarResources[name]; !ok { + delete(rr.ScalarResources, name) + } + } + + inqueue := r.LessEqual(rr, api.Infinity) klog.V(5).Infof("job %s inqueue %v", job.Name, inqueue) if inqueue { attr.inqueue.Add(job.GetMinResources()) From 2a9c803cb60e6d01d833abf24f96dd1fc8f9c76f Mon Sep 17 00:00:00 2001 From: wangyang Date: Mon, 26 Jun 2023 20:37:59 +0800 Subject: [PATCH 05/12] Upgrade the setup-go and checkout versions in the action Signed-off-by: wangyang Signed-off-by: chenfengyu --- .github/workflows/code_verify.yaml | 6 +++--- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/e2e_parallel_jobs.yaml | 4 ++-- .github/workflows/e2e_scheduling_actions.yaml | 4 ++-- .github/workflows/e2e_scheduling_basic.yaml | 4 ++-- .github/workflows/e2e_sequence.yaml | 4 ++-- .github/workflows/e2e_spark.yaml | 2 +- .github/workflows/e2e_vcctl.yaml | 4 ++-- .github/workflows/fossa.yml | 4 ++-- .github/workflows/release.yaml | 4 ++-- 10 files 
changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/code_verify.yaml b/.github/workflows/code_verify.yaml index 493d4f153b..79bd4aab3d 100644 --- a/.github/workflows/code_verify.yaml +++ b/.github/workflows/code_verify.yaml @@ -16,12 +16,12 @@ jobs: GOPATH: /home/runner/work/${{ github.repository }} steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 path: ./src/github.com/${{ github.repository }} @@ -38,4 +38,4 @@ jobs: make TAG=latest generate-yaml make verify-generated-yaml make unit-test - working-directory: ./src/github.com/${{ github.repository }} \ No newline at end of file + working-directory: ./src/github.com/${{ github.repository }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b0a84e2d92..5c2d150ca3 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -35,7 +35,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/e2e_parallel_jobs.yaml b/.github/workflows/e2e_parallel_jobs.yaml index 51204007d5..612eff3114 100644 --- a/.github/workflows/e2e_parallel_jobs.yaml +++ b/.github/workflows/e2e_parallel_jobs.yaml @@ -14,7 +14,7 @@ jobs: timeout-minutes: 40 steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x @@ -34,7 +34,7 @@ jobs: GO111MODULE="on" go install sigs.k8s.io/kind@v0.11.0 curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Run E2E Tests run: | diff --git a/.github/workflows/e2e_scheduling_actions.yaml b/.github/workflows/e2e_scheduling_actions.yaml index c805fc4957..58a066cd88 100644 --- a/.github/workflows/e2e_scheduling_actions.yaml +++ b/.github/workflows/e2e_scheduling_actions.yaml @@ -14,7 +14,7 @@ jobs: timeout-minutes: 40 steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x @@ -34,7 +34,7 @@ jobs: GO111MODULE="on" go install sigs.k8s.io/kind@v0.11.0 curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Run E2E Tests run: | diff --git a/.github/workflows/e2e_scheduling_basic.yaml b/.github/workflows/e2e_scheduling_basic.yaml index 18722df052..657a44ccc3 100644 --- a/.github/workflows/e2e_scheduling_basic.yaml +++ b/.github/workflows/e2e_scheduling_basic.yaml @@ -14,7 +14,7 @@ jobs: timeout-minutes: 40 steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x @@ -34,7 +34,7 @@ jobs: GO111MODULE="on" go install sigs.k8s.io/kind@v0.11.0 curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Run E2E Tests run: | diff --git a/.github/workflows/e2e_sequence.yaml b/.github/workflows/e2e_sequence.yaml index 1dda39c205..99725bdf8a 100644 --- a/.github/workflows/e2e_sequence.yaml +++ 
b/.github/workflows/e2e_sequence.yaml @@ -14,7 +14,7 @@ jobs: timeout-minutes: 40 steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x @@ -34,7 +34,7 @@ jobs: GO111MODULE="on" go install sigs.k8s.io/kind@v0.11.0 curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Run E2E Tests run: | diff --git a/.github/workflows/e2e_spark.yaml b/.github/workflows/e2e_spark.yaml index 3f22063ed5..01f54c8ae5 100644 --- a/.github/workflows/e2e_spark.yaml +++ b/.github/workflows/e2e_spark.yaml @@ -48,7 +48,7 @@ jobs: distribution: temurin java-version: 8 - name: Install Go - uses: actions/setup-go@v3 + uses: actions/setup-go@v4 with: go-version: 1.19.x - name: Set up Docker Buildx diff --git a/.github/workflows/e2e_vcctl.yaml b/.github/workflows/e2e_vcctl.yaml index 7c8c879cf3..0351499a72 100644 --- a/.github/workflows/e2e_vcctl.yaml +++ b/.github/workflows/e2e_vcctl.yaml @@ -14,7 +14,7 @@ jobs: timeout-minutes: 20 steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x @@ -34,7 +34,7 @@ jobs: GO111MODULE="on" go install sigs.k8s.io/kind@v0.11.0 curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Run E2E Tests run: | diff --git a/.github/workflows/fossa.yml b/.github/workflows/fossa.yml index 77126234ef..35a599f1b9 100644 --- a/.github/workflows/fossa.yml +++ b/.github/workflows/fossa.yml @@ -9,8 +9,8 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-go@v2 + - uses: actions/checkout@v3 + - uses: actions/setup-go@v4 with: go-version: "^1.19.x" - run: go version diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c64e1ff0ac..cfb665eb5b 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -16,7 +16,7 @@ jobs: GOPATH: /home/runner/work/${{ github.repository }} steps: - name: Install Go - uses: actions/setup-go@v2 + uses: actions/setup-go@v4 with: go-version: 1.19.x @@ -28,7 +28,7 @@ jobs: make && sudo make install - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 path: ./src/github.com/${{ github.repository }} From e403c02db95dc80937944430b48ec4bfd047336f Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 26 Jun 2023 16:25:29 +0800 Subject: [PATCH 06/12] Upgrade the spark integration test to 3.4 Signed-off-by: Yikun Jiang Signed-off-by: chenfengyu --- .github/workflows/e2e_spark.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e_spark.yaml b/.github/workflows/e2e_spark.yaml index 01f54c8ae5..a2c5b0a7d8 100644 --- a/.github/workflows/e2e_spark.yaml +++ b/.github/workflows/e2e_spark.yaml @@ -22,7 +22,7 @@ jobs: with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: branch-3.4 path: ${{ github.workspace }}/spark - name: Cache Scala, SBT and Maven uses: actions/cache@v3 From 6aa718ab2ace656ee0bc5983e1047df6bf34ceef Mon Sep 17 00:00:00 2001 From: rayoluo Date: Tue, 27 Jun 2023 20:11:32 +0800 Subject: [PATCH 07/12] fix some issues reported by golint Signed-off-by: rayoluo Signed-off-by: chenfengyu --- 
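
Most of the comment fixes below address golint's rule that the doc comment on an exported identifier must begin with that identifier's name (for example, a comment reading "// AddGPUResource ..." on a method actually named AddResource is flagged). An illustrative sketch of the convention; Counter and Add are made-up names, not Volcano code:

    package demo

    // Counter tracks how many items have been seen; golint expects this
    // comment to begin with "Counter", the name of the exported type.
    type Counter struct{ n int }

    // Add records one more item. A comment starting with anything other
    // than "Add" here (e.g. "// AddItem ...") would be reported by golint.
    func (c *Counter) Add() { c.n++ }
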
.../api/devices/nvidia/gpushare/device_info.go | 4 ++-- pkg/scheduler/api/devices/nvidia/vgpu/device_info.go | 4 ++-- pkg/scheduler/api/devices/nvidia/vgpu/type.go | 10 +++++----- pkg/scheduler/api/node_info.go | 7 ++++--- pkg/scheduler/framework/util.go | 2 +- pkg/scheduler/plugins/proportion/proportion_test.go | 6 +++--- .../admission/pods/mutate/mutate_pod_test.go | 6 +++--- test/e2e/jobp/job_lifecycle.go | 8 ++++---- test/e2e/jobseq/job_error_handling.go | 5 +++-- test/e2e/schedulingbase/sla.go | 12 ++++++------ test/e2e/util/job.go | 6 +++--- test/e2e/util/util.go | 3 ++- test/e2e/vcctl/command.go | 2 +- 13 files changed, 39 insertions(+), 36 deletions(-) diff --git a/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go b/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go index 10e3144476..167139f0b3 100644 --- a/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go +++ b/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go @@ -94,7 +94,7 @@ func (gs *GPUDevices) GetIgnoredDevices() []string { return []string{""} } -// AddGPUResource adds the pod to GPU pool if it is assigned +// AddResource adds the pod to GPU pool if it is assigned func (gs *GPUDevices) AddResource(pod *v1.Pod) { gpuRes := getGPUMemoryOfPod(pod) if gpuRes > 0 { @@ -107,7 +107,7 @@ func (gs *GPUDevices) AddResource(pod *v1.Pod) { } } -// SubGPUResource frees the gpu hold by the pod +// SubResource frees the gpu hold by the pod func (gs *GPUDevices) SubResource(pod *v1.Pod) { gpuRes := getGPUMemoryOfPod(pod) if gpuRes > 0 { diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go index 574f8c0c90..e956e1dafa 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go @@ -117,7 +117,7 @@ func (gs *GPUDevices) GetIgnoredDevices() []string { return []string{VolcanoVGPUMemory, VolcanoVGPUMemoryPercentage, VolcanoVGPUCores} } -// AddGPUResource adds the pod to GPU pool if it is assigned +// AddResource adds the pod to GPU pool if it is assigned func (gs *GPUDevices) AddResource(pod *v1.Pod) { ids, ok := pod.Annotations[AssignedIDsAnnotations] if !ok { @@ -141,7 +141,7 @@ func (gs *GPUDevices) AddResource(pod *v1.Pod) { } } -// SubGPUResource frees the gpu hold by the pod +// SubResource frees the gpu hold by the pod func (gs *GPUDevices) SubResource(pod *v1.Pod) { ids, ok := pod.Annotations[AssignedIDsAnnotations] if !ok { diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/type.go b/pkg/scheduler/api/devices/nvidia/vgpu/type.go index 2884a61dd5..16544c5473 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/type.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/type.go @@ -31,17 +31,17 @@ const ( NvidiaGPUDevice = "NVIDIA" - // VolcanoGPUMemory extended gpu memory + // VolcanoVGPUMemory extended gpu memory VolcanoVGPUMemory = "volcano.sh/vgpu-memory" - // VolcanoMemoryPercentage extends gpu memory + // VolcanoVGPUMemoryPercentage extends gpu memory VolcanoVGPUMemoryPercentage = "volcano.sh/vgpu-memory-percentage" - // VolcanoVGPUcores indicates utilization percentage of vgpu + // VolcanoVGPUCores indicates utilization percentage of vgpu VolcanoVGPUCores = "volcano.sh/vgpu-cores" - // VolcanoGPUNumber virtual GPU card number + // VolcanoVGPUNumber virtual GPU card number VolcanoVGPUNumber = "volcano.sh/vgpu-number" // VolcanoVGPURegister virtual gpu information registered from device-plugin to scheduler VolcanoVGPURegister = "volcano.sh/node-vgpu-register" - // Volcanohandshake for 
vgpu + // VolcanoVGPUHandshake for vgpu VolcanoVGPUHandshake = "volcano.sh/node-vgpu-handshake" // PredicateTime is the key of predicate time diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index c5e20c95b6..ae1585415a 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -25,6 +25,7 @@ import ( k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" "volcano.sh/apis/pkg/apis/scheduling/v1beta1" + "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" ) @@ -213,7 +214,7 @@ func (ni *NodeInfo) Clone() *NodeInfo { klog.V(5).Infof("imageStates is %v", res.ImageStates) res.Others = ni.CloneOthers() - res.ImageStates = ni.CloneImageSumary() + res.ImageStates = ni.CloneImageSummary() return res } @@ -539,8 +540,8 @@ func (ni *NodeInfo) Pods() (pods []*v1.Pod) { return } -// Clone Image State -func (ni *NodeInfo) CloneImageSumary() map[string]*k8sframework.ImageStateSummary { +// CloneImageSummary Clone Image State +func (ni *NodeInfo) CloneImageSummary() map[string]*k8sframework.ImageStateSummary { nodeImageStates := make(map[string]*k8sframework.ImageStateSummary) for imageName, summary := range ni.ImageStates { newImageSummary := &k8sframework.ImageStateSummary{ diff --git a/pkg/scheduler/framework/util.go b/pkg/scheduler/framework/util.go index ee6fad607a..0528c9dba5 100644 --- a/pkg/scheduler/framework/util.go +++ b/pkg/scheduler/framework/util.go @@ -230,7 +230,7 @@ func GenerateNodeMapAndSlice(nodes map[string]*api.NodeInfo) map[string]*schedul nodeInfo.SetNode(node.Node) nodeMap[node.Name] = nodeInfo // add imagestate into nodeinfo - nodeMap[node.Name].ImageStates = node.CloneImageSumary() + nodeMap[node.Name].ImageStates = node.CloneImageSummary() } return nodeMap } diff --git a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go index e5199882a9..f7b4398b5c 100644 --- a/pkg/scheduler/plugins/proportion/proportion_test.go +++ b/pkg/scheduler/plugins/proportion/proportion_test.go @@ -32,6 +32,7 @@ import ( "k8s.io/client-go/util/workqueue" schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" + "volcano.sh/volcano/cmd/scheduler/app/options" "volcano.sh/volcano/pkg/scheduler/actions/allocate" "volcano.sh/volcano/pkg/scheduler/api" @@ -282,10 +283,9 @@ func TestProportion(t *testing.T) { t.Errorf("after delete vcjob pg2, queue_allocated metrics is fail,%v", metrics) c <- false return - } else { - t.Logf("after delete vcjob pg2, queue_allocated metrics is ok,%v", metrics) - c <- true } + t.Logf("after delete vcjob pg2, queue_allocated metrics is ok,%v", metrics) + c <- true } num++ } diff --git a/pkg/webhooks/admission/pods/mutate/mutate_pod_test.go b/pkg/webhooks/admission/pods/mutate/mutate_pod_test.go index 869327cb53..f964ee8372 100644 --- a/pkg/webhooks/admission/pods/mutate/mutate_pod_test.go +++ b/pkg/webhooks/admission/pods/mutate/mutate_pod_test.go @@ -28,9 +28,9 @@ import ( ) func TestMutatePods(t *testing.T) { - affinityJsonStr := `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]}]}}}` + affinityJSONStr := `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"kubernetes.io/os","operator":"In","values":["linux"]}]}]}}}` var affinity v1.Affinity - json.Unmarshal([]byte(affinityJsonStr), &affinity) + 
json.Unmarshal([]byte(affinityJSONStr), &affinity) admissionConfigData := &webconfig.AdmissionConfiguration{ ResGroupsConfig: []webconfig.ResGroupConfig{ @@ -51,7 +51,7 @@ func TestMutatePods(t *testing.T) { Effect: v1.TaintEffectNoSchedule, }, }, - Affinity: affinityJsonStr, + Affinity: affinityJSONStr, Labels: map[string]string{ "volcano.sh/nodetype": "management", }, diff --git a/test/e2e/jobp/job_lifecycle.go b/test/e2e/jobp/job_lifecycle.go index f9682ce8de..3ea44de6b7 100644 --- a/test/e2e/jobp/job_lifecycle.go +++ b/test/e2e/jobp/job_lifecycle.go @@ -45,7 +45,7 @@ var _ = Describe("Job Life Cycle", func() { Img: e2eutil.DefaultNginxImage, Min: 2, Rep: 2, - Req: e2eutil.CpuResource("10000"), + Req: e2eutil.CPUResource("10000"), }, }, }) @@ -222,7 +222,7 @@ var _ = Describe("Job Life Cycle", func() { Rep: 1, Command: "sleep 10s", RestartPolicy: v1.RestartPolicyNever, - Req: e2eutil.CpuResource("1"), + Req: e2eutil.CPUResource("1"), }, }, }) @@ -306,8 +306,8 @@ var _ = Describe("Job Life Cycle", func() { Rep: rep + 1, Command: "sleep 10s", RestartPolicy: v1.RestartPolicyNever, - Req: e2eutil.CpuResource("1"), - Limit: e2eutil.CpuResource("1"), + Req: e2eutil.CPUResource("1"), + Limit: e2eutil.CPUResource("1"), Affinity: &v1.Affinity{NodeAffinity: nodeAffinity}, }, }, diff --git a/test/e2e/jobseq/job_error_handling.go b/test/e2e/jobseq/job_error_handling.go index 554f937434..2478758b12 100644 --- a/test/e2e/jobseq/job_error_handling.go +++ b/test/e2e/jobseq/job_error_handling.go @@ -28,6 +28,7 @@ import ( vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1" vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1" + jobctl "volcano.sh/volcano/pkg/controllers/job" e2eutil "volcano.sh/volcano/test/e2e/util" @@ -834,14 +835,14 @@ var _ = Describe("Job Error Handling", func() { Name: "higherprioritytask", Img: e2eutil.DefaultNginxImage, Rep: int32(nodecount), - Req: e2eutil.CpuResource(strconv.Itoa(int(rep)/nodecount - 1)), + Req: e2eutil.CPUResource(strconv.Itoa(int(rep)/nodecount - 1)), Taskpriority: e2eutil.MasterPriority, }, { Name: "lowerprioritytask", Img: e2eutil.DefaultNginxImage, Rep: int32(nodecount), - Req: e2eutil.CpuResource(strconv.Itoa(int(rep)/nodecount - 1)), + Req: e2eutil.CPUResource(strconv.Itoa(int(rep)/nodecount - 1)), Taskpriority: e2eutil.MasterPriority, }, }, diff --git a/test/e2e/schedulingbase/sla.go b/test/e2e/schedulingbase/sla.go index 22632d8578..6854c6067f 100644 --- a/test/e2e/schedulingbase/sla.go +++ b/test/e2e/schedulingbase/sla.go @@ -96,22 +96,22 @@ var _ = Describe("SLA Test", func() { Expect(err).NotTo(HaveOccurred()) job2.Name = "j2-slow-sla" - slowSlaJob := e2eutil.CreateJobWithPodGroup(ctx, job2, "", map[string]string{jobWaitingTime: "1h"}) - err = e2eutil.WaitTaskPhase(ctx, slowSlaJob, []v1.PodPhase{v1.PodPending}, 0) + slowSLAJob := e2eutil.CreateJobWithPodGroup(ctx, job2, "", map[string]string{jobWaitingTime: "1h"}) + err = e2eutil.WaitTaskPhase(ctx, slowSLAJob, []v1.PodPhase{v1.PodPending}, 0) Expect(err).NotTo(HaveOccurred()) job2.Name = "j3-fast-sla" - fastSlaJob := e2eutil.CreateJobWithPodGroup(ctx, job2, "", map[string]string{jobWaitingTime: "30m"}) - err = e2eutil.WaitTaskPhase(ctx, fastSlaJob, []v1.PodPhase{v1.PodPending}, 0) + fastSLAJob := e2eutil.CreateJobWithPodGroup(ctx, job2, "", map[string]string{jobWaitingTime: "30m"}) + err = e2eutil.WaitTaskPhase(ctx, fastSLAJob, []v1.PodPhase{v1.PodPending}, 0) Expect(err).NotTo(HaveOccurred()) err = ctx.Vcclient.BatchV1alpha1().Jobs(e2eutil.Namespace(ctx, job1)).Delete(context.TODO(), job1.Name, 
metav1.DeleteOptions{}) Expect(err).NotTo(HaveOccurred()) - err = e2eutil.WaitTaskPhase(ctx, slowSlaJob, []v1.PodPhase{v1.PodPending}, 0) + err = e2eutil.WaitTaskPhase(ctx, slowSLAJob, []v1.PodPhase{v1.PodPending}, 0) Expect(err).NotTo(HaveOccurred()) - err = e2eutil.WaitTasksReady(ctx, fastSlaJob, int(rep)) + err = e2eutil.WaitTasksReady(ctx, fastSLAJob, int(rep)) Expect(err).NotTo(HaveOccurred()) }) }) diff --git a/test/e2e/util/job.go b/test/e2e/util/job.go index ad8873df94..0d62287d06 100644 --- a/test/e2e/util/job.go +++ b/test/e2e/util/job.go @@ -67,7 +67,7 @@ type JobSpec struct { Volumes []batchv1alpha1.VolumeSpec NodeName string // ttl seconds after job finished - Ttl *int32 + TTL *int32 MinSuccess *int32 // job max retry MaxRetry int32 @@ -101,7 +101,7 @@ func CreateJobWithPodGroup(ctx *TestContext, jobSpec *JobSpec, Policies: jobSpec.Policies, Queue: jobSpec.Queue, Plugins: jobSpec.Plugins, - TTLSecondsAfterFinished: jobSpec.Ttl, + TTLSecondsAfterFinished: jobSpec.TTL, }, } @@ -197,7 +197,7 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err Policies: jobSpec.Policies, Queue: jobSpec.Queue, Plugins: jobSpec.Plugins, - TTLSecondsAfterFinished: jobSpec.Ttl, + TTLSecondsAfterFinished: jobSpec.TTL, MinSuccess: jobSpec.MinSuccess, MaxRetry: jobSpec.MaxRetry, }, diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index 4737463546..ed0e3d2d5b 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -39,6 +39,7 @@ import ( "k8s.io/client-go/kubernetes" vcclient "volcano.sh/apis/pkg/client/clientset/versioned" + "volcano.sh/volcano/pkg/controllers/job/helpers" schedulerapi "volcano.sh/volcano/pkg/scheduler/api" ) @@ -80,7 +81,7 @@ const ( DefaultPytorchImage = "volcanosh/pytorch-mnist-v1beta1-9ee8fda-example:0.0.1" ) -func CpuResource(request string) v1.ResourceList { +func CPUResource(request string) v1.ResourceList { return v1.ResourceList{v1.ResourceCPU: resource.MustParse(request)} } diff --git a/test/e2e/vcctl/command.go b/test/e2e/vcctl/command.go index d728c2ff83..d2817211c1 100644 --- a/test/e2e/vcctl/command.go +++ b/test/e2e/vcctl/command.go @@ -129,7 +129,7 @@ var _ = Describe("Job E2E Test: Test Job Command", func() { { Name: taskName, Img: e2eutil.DefaultNginxImage, - Req: e2eutil.CpuResource(fmt.Sprintf("%dm", 1000*rep)), + Req: e2eutil.CPUResource(fmt.Sprintf("%dm", 1000*rep)), Min: 1, Rep: 1, }, From e806229ddadaedf8331e17e8a2e105206cb9e750 Mon Sep 17 00:00:00 2001 From: aakcht Date: Fri, 14 Apr 2023 15:48:53 +0300 Subject: [PATCH 08/12] Add more configuration options for helm chart Signed-off-by: aakcht Signed-off-by: chenfengyu --- installer/README.md | 29 +++++++- .../chart/volcano/templates/admission.yaml | 45 +++++++++++ .../chart/volcano/templates/controllers.yaml | 26 +++++++ .../chart/volcano/templates/scheduler.yaml | 26 +++++++ installer/helm/chart/volcano/values.yaml | 74 +++++++++++++++++++ installer/volcano-development.yaml | 8 ++ 6 files changed, 205 insertions(+), 3 deletions(-) diff --git a/installer/README.md b/installer/README.md index 63dd8c1ea3..acc49732b8 100644 --- a/installer/README.md +++ b/installer/README.md @@ -14,13 +14,10 @@ Kubernetes that are commonly required by many classes of batch & elastic workloa ## Installing volcano via yaml file All-in-one yaml has been generated for quick deployment. 
Try command: - ```$xslt kubectl apply -f volcano-v0.0.x.yaml ``` - Check the status in namespace `volcano-system` - ```$xslt $kubectl get all -n volcano-system NAME READY STATUS RESTARTS AGE @@ -98,6 +95,31 @@ The following are the list configurable parameters of Volcano Chart and their de |`custom.scheduler_enable`|Whether to Enable Scheduler|`true`| |`custom.scheduler_replicas`|The number of Scheduler pods to run|`1`| |`custom.leader_elect_enable`|Whether to Enable leader elect|`false`| +|`custom.default_affinity`|Default affinity for Admission/Controller/Scheduler pods|`~`| +|`custom.admission_affinity`|Affinity for Admission pods|`~`| +|`custom.controller_affinity`|Affinity for Controller pods|`~`| +|`custom.scheduler_affinity`|Affinity for Scheduler pods|`~`| +|`custom.default_tolerations`|Default tolerations for Admission/Controller/Scheduler pods|`~`| +|`custom.admission_tolerations`|Tolerations for Admission pods|`~`| +|`custom.controller_tolerations`|Tolerations for Controller pods|`~`| +|`custom.scheduler_tolerations`|Tolerations for Scheduler pods|`~`| +|`custom.default_sc`|Default securityContext for Admission/Controller/Scheduler pods|`~`| +|`custom.admission_sc`|securityContext for Admission pods|`~`| +|`custom.controller_sc`|securityContext for Controller pods|`~`| +|`custom.scheduler_sc`|securityContext for Scheduler pods|`~`| +|`custom.default_ns`|Default nodeSelector for Admission/Controller/Scheduler pods|`~`| +|`custom.admission_ns`|nodeSelector for Admission pods|`~`| +|`custom.controller_ns`|nodeSelector for Controller pods|`~`| +|`custom.scheduler_ns`|nodeSelector for Scheduler pods|`~`| +|`custom.admission_podLabels`|Pod labels for Admission pods|`~`| +|`custom.controller_podLabels`|Pod labels for Controller pods|`~`| +|`custom.scheduler_podLabels`|Pod labels for Scheduler pods|`~`| +|`custom.admission_labels`|Labels for Admission deployment and job|`~`| +|`custom.controller_labels`|Labels for Controller deployment|`~`| +|`custom.scheduler_labels`|Labels for Scheduler deployment|`~`| +|`custom.admission_resources`|Resources for Admission pods|`~`| +|`custom.controller_resources`|Resources for Controller pods|`~`| +|`custom.scheduler_resources`|Resources for Scheduler pods|`~`| Specify each parameter using the `--set key=value[,key=value]` argument to `helm install`. For example, @@ -107,6 +129,7 @@ $ helm install --name volcano-release --set basic.image_pull_policy=Always volca The above command set image pull policy to `Always`, so docker image will be pulled each time. + Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. 
For example, ```bash diff --git a/installer/helm/chart/volcano/templates/admission.yaml b/installer/helm/chart/volcano/templates/admission.yaml index 2aa9e5363d..389660e4b3 100644 --- a/installer/helm/chart/volcano/templates/admission.yaml +++ b/installer/helm/chart/volcano/templates/admission.yaml @@ -1,4 +1,8 @@ {{- if .Values.custom.admission_enable }} +{{ $admission_affinity := or .Values.custom.admission_affinity .Values.custom.default_affinity }} +{{ $admission_tolerations := or .Values.custom.admission_tolerations .Values.custom.default_tolerations }} +{{ $admission_sc := or .Values.custom.admission_sc .Values.custom.default_sc }} +{{ $admission_ns := or .Values.custom.admission_ns .Values.custom.default_ns }} apiVersion: v1 kind: ConfigMap metadata: @@ -64,6 +68,9 @@ kind: Deployment metadata: labels: app: volcano-admission + {{- if .Values.custom.admission_labels }} + {{- toYaml .Values.custom.admission_labels | nindent 4 }} + {{- end }} name: {{ .Release.Name }}-admission namespace: {{ .Release.Namespace }} spec: @@ -75,7 +82,24 @@ spec: metadata: labels: app: volcano-admission + {{- if .Values.custom.admission_podLabels }} + {{- toYaml .Values.custom.admission_podLabels | nindent 8 }} + {{- end }} spec: + {{- if $admission_tolerations }} + tolerations: {{- toYaml $admission_tolerations | nindent 8 }} + {{- end }} + {{- if $admission_ns }} + nodeSelector: {{- toYaml $admission_ns | nindent 8 }} + {{- end }} + {{- if $admission_affinity }} + affinity: + {{- toYaml $admission_affinity | nindent 8 }} + {{- end }} + {{- if $admission_sc }} + securityContext: + {{- toYaml $admission_sc | nindent 8 }} + {{- end }} serviceAccount: {{ .Release.Name }}-admission priorityClassName: system-cluster-critical {{- if .Values.basic.image_pull_secret }} @@ -99,6 +123,8 @@ spec: image: {{.Values.basic.admission_image_name}}:{{.Values.basic.image_tag_version}} imagePullPolicy: {{ .Values.basic.image_pull_policy }} name: admission + resources: + {{- toYaml .Values.custom.admission_resources | nindent 12 }} volumeMounts: - mountPath: /admission.local.config/certificates name: admission-certs @@ -152,10 +178,27 @@ metadata: namespace: {{ .Release.Namespace }} labels: app: volcano-admission-init + {{- if .Values.custom.admission_labels }} + {{- toYaml .Values.custom.admission_labels | nindent 4 }} + {{- end }} spec: backoffLimit: 3 template: spec: + {{- if $admission_tolerations }} + tolerations: {{- toYaml $admission_tolerations | nindent 8 }} + {{- end }} + {{- if $admission_ns }} + nodeSelector: {{- toYaml $admission_ns | nindent 8 }} + {{- end }} + {{- if $admission_affinity }} + affinity: + {{- toYaml $admission_affinity | nindent 8 }} + {{- end }} + {{- if $admission_sc }} + securityContext: + {{- toYaml $admission_sc | nindent 8 }} + {{- end }} serviceAccountName: {{ .Release.Name }}-admission priorityClassName: system-cluster-critical {{- if .Values.basic.image_pull_secret }} @@ -165,6 +208,8 @@ spec: restartPolicy: Never containers: - name: main + resources: + {{- toYaml .Values.custom.admission_resources | nindent 12 }} image: {{.Values.basic.admission_image_name}}:{{.Values.basic.image_tag_version}} imagePullPolicy: {{ .Values.basic.image_pull_policy }} command: ["./gen-admission-secret.sh", "--service", "{{ .Release.Name }}-admission-service", "--namespace", diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index f48c3a0d7e..697bd9f981 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ 
b/installer/helm/chart/volcano/templates/controllers.yaml @@ -1,4 +1,8 @@ {{- if .Values.custom.controller_enable }} +{{ $controller_affinity := or .Values.custom.controller_affinity .Values.custom.default_affinity }} +{{ $controller_tolerations := or .Values.custom.controller_tolerations .Values.custom.default_tolerations }} +{{ $controller_sc := or .Values.custom.controller_sc .Values.custom.default_sc }} +{{ $controller_ns := or .Values.custom.controller_ns .Values.custom.default_ns }} apiVersion: v1 kind: ServiceAccount metadata: @@ -90,6 +94,9 @@ metadata: namespace: {{ .Release.Namespace }} labels: app: volcano-controller + {{- if .Values.custom.controller_labels }} + {{- toYaml .Values.custom.controller_labels | nindent 4 }} + {{- end }} spec: replicas: {{ .Values.custom.controller_replicas }} selector: @@ -99,7 +106,24 @@ spec: metadata: labels: app: volcano-controller + {{- if .Values.custom.controller_podLabels }} + {{- toYaml .Values.custom.controller_podLabels | nindent 8 }} + {{- end }} spec: + {{- if $controller_tolerations }} + tolerations: {{- toYaml $controller_tolerations | nindent 8 }} + {{- end }} + {{- if $controller_ns }} + nodeSelector: {{- toYaml $controller_ns | nindent 8 }} + {{- end }} + {{- if $controller_affinity }} + affinity: + {{- toYaml $controller_affinity | nindent 8 }} + {{- end }} + {{- if $controller_sc }} + securityContext: + {{- toYaml $controller_sc | nindent 8 }} + {{- end }} serviceAccount: {{ .Release.Name }}-controllers priorityClassName: system-cluster-critical {{- if .Values.basic.image_pull_secret }} @@ -108,6 +132,8 @@ spec: {{- end }} containers: - name: {{ .Release.Name }}-controllers + resources: + {{- toYaml .Values.custom.controller_resources | nindent 14 }} image: {{.Values.basic.controller_image_name}}:{{.Values.basic.image_tag_version}} args: - --logtostderr diff --git a/installer/helm/chart/volcano/templates/scheduler.yaml b/installer/helm/chart/volcano/templates/scheduler.yaml index a224e6e1e7..756ea5d3d0 100644 --- a/installer/helm/chart/volcano/templates/scheduler.yaml +++ b/installer/helm/chart/volcano/templates/scheduler.yaml @@ -1,4 +1,8 @@ {{- if .Values.custom.scheduler_enable }} +{{ $scheduler_affinity := or .Values.custom.scheduler_affinity .Values.custom.default_affinity }} +{{ $scheduler_tolerations := or .Values.custom.scheduler_tolerations .Values.custom.default_tolerations }} +{{ $scheduler_sc := or .Values.custom.scheduler_sc .Values.custom.default_sc }} +{{ $scheduler_ns := or .Values.custom.scheduler_ns .Values.custom.default_ns }} apiVersion: v1 kind: ConfigMap metadata: @@ -103,6 +107,9 @@ metadata: namespace: {{ .Release.Namespace }} labels: app: volcano-scheduler + {{- if .Values.custom.scheduler_labels }} + {{- toYaml .Values.custom.scheduler_labels | nindent 4 }} + {{- end }} spec: replicas: {{ .Values.custom.scheduler_replicas }} selector: @@ -112,7 +119,24 @@ spec: metadata: labels: app: volcano-scheduler + {{- if .Values.custom.scheduler_podLabels }} + {{- toYaml .Values.custom.scheduler_podLabels | nindent 8 }} + {{- end }} spec: + {{- if $scheduler_tolerations }} + tolerations: {{- toYaml $scheduler_tolerations | nindent 8 }} + {{- end }} + {{- if $scheduler_ns }} + nodeSelector: {{- toYaml $scheduler_ns | nindent 8 }} + {{- end }} + {{- if $scheduler_affinity }} + affinity: + {{- toYaml $scheduler_affinity | nindent 8 }} + {{- end }} + {{- if $scheduler_sc }} + securityContext: + {{- toYaml $scheduler_sc | nindent 8 }} + {{- end }} serviceAccount: {{ .Release.Name }}-scheduler priorityClassName: 
system-cluster-critical {{- if .Values.basic.image_pull_secret }} @@ -122,6 +146,8 @@ spec: containers: - name: {{ .Release.Name }}-scheduler image: {{.Values.basic.scheduler_image_name}}:{{.Values.basic.image_tag_version}} + resources: + {{- toYaml .Values.custom.scheduler_resources | nindent 12 }} args: - --logtostderr - --scheduler-conf=/volcano.scheduler/{{base .Values.basic.scheduler_config_file}} diff --git a/installer/helm/chart/volcano/values.yaml b/installer/helm/chart/volcano/values.yaml index 0b29d9780a..901259bb41 100644 --- a/installer/helm/chart/volcano/values.yaml +++ b/installer/helm/chart/volcano/values.yaml @@ -22,3 +22,77 @@ custom: nodeSelector: {} affinity: [] tolerations: [] + +# Specify affinity for all main Volcano components or per component. +# For example: +# +# default_affinity: +# podAntiAffinity: +# preferredDuringSchedulingIgnoredDuringExecution: +# - podAffinityTerm: +# labelSelector: +# matchLabels: +# key: value +# topologyKey: kubernetes.io/hostname +# weight: 49 + default_affinity: ~ + admission_affinity: ~ + controller_affinity: ~ + scheduler_affinity: ~ + +# Specify tolerations for all main Volcano components or per component +# For example: +# +# default_tolerations: +# - key: "example-key1" +# operator: "Exists" +# effect: "NoSchedule" + default_tolerations: ~ + admission_tolerations: ~ + controller_tolerations: ~ + scheduler_tolerations: ~ + +# Specify securityContext for all main Volcano components or per component +# For example: +# +# default_sc: +# runAsUser: 3000 +# runAsGroup: 3000 + default_sc: ~ + scheduler_sc: ~ + admission_sc: ~ + controller_sc: ~ + +# Specify nodeSelector for all main Volcano components or per component +# For example: +# +# default_ns: +# nodetype: criticalservices + default_ns: ~ + admission_ns: ~ + scheduler_ns: ~ + controller_ns: ~ + + +# Specify labels for Volcano main component deployments and pods +# For example: +# +# admission_podLabels: +# key1: value1 + admission_podLabels: ~ + scheduler_podLabels: ~ + controller_podLabels: ~ + admission_labels: ~ + scheduler_labels: ~ + controller_labels: ~ + +# Specify resources for Volcano main component deployments and pods +# For example: +# +# admission_resources: +# limits: +# cpu: 300m +# memory: 300Mi + admission_resources: ~ + scheduler_resources: ~ + controller_resources: ~ diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml index 6e2fcbe68a..dd617ca65a 100644 --- a/installer/volcano-development.yaml +++ b/installer/volcano-development.yaml @@ -153,6 +153,8 @@ spec: image: volcanosh/vc-webhook-manager:latest imagePullPolicy: Always name: admission + resources: + null volumeMounts: - mountPath: /admission.local.config/certificates name: admission-certs @@ -185,6 +187,8 @@ spec: restartPolicy: Never containers: - name: main + resources: + null image: volcanosh/vc-webhook-manager:latest imagePullPolicy: Always command: ["./gen-admission-secret.sh", "--service", "volcano-admission-service", "--namespace", @@ -8616,6 +8620,8 @@ spec: priorityClassName: system-cluster-critical containers: - name: volcano-controllers + resources: + null image: volcanosh/vc-controller-manager:latest args: - --logtostderr @@ -8785,6 +8791,8 @@ spec: containers: - name: volcano-scheduler image: volcanosh/vc-scheduler:latest + resources: + null args: - --logtostderr - --scheduler-conf=/volcano.scheduler/volcano-scheduler.conf From 0bf1005ebd647b509b828467594141c46ecbfdf8 Mon Sep 17 00:00:00 2001 From: aakcht Date: Tue, 23 May 2023 07:27:12 +0300 Subject: [PATCH 
09/12] resolve merge conflicts Signed-off-by: aakcht Signed-off-by: chenfengyu --- installer/README.md | 4 +++- .../helm/chart/volcano/templates/admission.yaml | 13 ------------- .../helm/chart/volcano/templates/controllers.yaml | 12 ------------ .../helm/chart/volcano/templates/scheduler.yaml | 12 ------------ installer/helm/chart/volcano/values.yaml | 3 --- 5 files changed, 3 insertions(+), 41 deletions(-) diff --git a/installer/README.md b/installer/README.md index acc49732b8..41c176e407 100644 --- a/installer/README.md +++ b/installer/README.md @@ -14,10 +14,13 @@ Kubernetes that are commonly required by many classes of batch & elastic workloa ## Installing volcano via yaml file All-in-one yaml has been generated for quick deployment. Try command: + ```$xslt kubectl apply -f volcano-v0.0.x.yaml ``` + Check the status in namespace `volcano-system` + ```$xslt $kubectl get all -n volcano-system NAME READY STATUS RESTARTS AGE @@ -129,7 +132,6 @@ $ helm install --name volcano-release --set basic.image_pull_policy=Always volca The above command set image pull policy to `Always`, so docker image will be pulled each time. - Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. For example, ```bash diff --git a/installer/helm/chart/volcano/templates/admission.yaml b/installer/helm/chart/volcano/templates/admission.yaml index 389660e4b3..ba627b16ab 100644 --- a/installer/helm/chart/volcano/templates/admission.yaml +++ b/installer/helm/chart/volcano/templates/admission.yaml @@ -140,19 +140,6 @@ spec: configMap: name: {{ .Release.Name }}-admission-configmap - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - --- apiVersion: v1 kind: Service diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index 697bd9f981..da30b86c90 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -145,16 +145,4 @@ spec: - -v=4 - 2>&1 imagePullPolicy: {{ .Values.basic.image_pull_policy }} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} {{- end }} diff --git a/installer/helm/chart/volcano/templates/scheduler.yaml b/installer/helm/chart/volcano/templates/scheduler.yaml index 756ea5d3d0..f377635e60 100644 --- a/installer/helm/chart/volcano/templates/scheduler.yaml +++ b/installer/helm/chart/volcano/templates/scheduler.yaml @@ -167,18 +167,6 @@ spec: - name: scheduler-config configMap: name: {{ .Release.Name }}-scheduler-configmap - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} --- apiVersion: v1 kind: Service diff --git a/installer/helm/chart/volcano/values.yaml b/installer/helm/chart/volcano/values.yaml index 901259bb41..c679b94f5b 100644 --- a/installer/helm/chart/volcano/values.yaml +++ b/installer/helm/chart/volcano/values.yaml @@ -19,9 +19,6 @@ custom: scheduler_replicas: 1 leader_elect_enable: false enabled_admissions: "/jobs/mutate,/jobs/validate,/podgroups/mutate,/pods/validate,/pods/mutate,/queues/mutate,/queues/validate" -nodeSelector: {} -affinity: [] -tolerations: [] # Specify affinity for all main Volcano components or per component. # For example: From 07059650ee82ad34c4fca46dcae0a20b07e78040 Mon Sep 17 00:00:00 2001 From: aakcht Date: Fri, 16 Jun 2023 15:03:40 +0300 Subject: [PATCH 10/12] avoid null resources in volcano-development.yaml Signed-off-by: aakcht Signed-off-by: chenfengyu --- installer/helm/chart/volcano/templates/admission.yaml | 4 ++++ installer/helm/chart/volcano/templates/controllers.yaml | 2 ++ installer/helm/chart/volcano/templates/scheduler.yaml | 2 ++ installer/volcano-development.yaml | 8 -------- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/installer/helm/chart/volcano/templates/admission.yaml b/installer/helm/chart/volcano/templates/admission.yaml index ba627b16ab..8cfdce4648 100644 --- a/installer/helm/chart/volcano/templates/admission.yaml +++ b/installer/helm/chart/volcano/templates/admission.yaml @@ -123,8 +123,10 @@ spec: image: {{.Values.basic.admission_image_name}}:{{.Values.basic.image_tag_version}} imagePullPolicy: {{ .Values.basic.image_pull_policy }} name: admission + {{- if .Values.custom.admission_resources }} resources: {{- toYaml .Values.custom.admission_resources | nindent 12 }} + {{- end }} volumeMounts: - mountPath: /admission.local.config/certificates name: admission-certs @@ -195,8 +197,10 @@ spec: restartPolicy: Never containers: - name: main + {{- if .Values.custom.admission_resources }} resources: {{- toYaml .Values.custom.admission_resources | nindent 12 }} + {{- end }} image: {{.Values.basic.admission_image_name}}:{{.Values.basic.image_tag_version}} imagePullPolicy: {{ .Values.basic.image_pull_policy }} command: ["./gen-admission-secret.sh", "--service", "{{ .Release.Name }}-admission-service", "--namespace", diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index da30b86c90..7c2929df9e 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -132,8 +132,10 @@ spec: {{- end }} containers: - name: {{ .Release.Name }}-controllers + {{- if .Values.custom.controller_resources }} resources: {{- toYaml .Values.custom.controller_resources | nindent 14 }} + {{- end }} image: {{.Values.basic.controller_image_name}}:{{.Values.basic.image_tag_version}} args: - --logtostderr diff --git a/installer/helm/chart/volcano/templates/scheduler.yaml b/installer/helm/chart/volcano/templates/scheduler.yaml index f377635e60..98636f17bf 100644 --- a/installer/helm/chart/volcano/templates/scheduler.yaml +++ b/installer/helm/chart/volcano/templates/scheduler.yaml @@ -146,8 +146,10 @@ spec: containers: - name: {{ .Release.Name }}-scheduler image: {{.Values.basic.scheduler_image_name}}:{{.Values.basic.image_tag_version}} + {{- if .Values.custom.scheduler_resources }} resources: {{- toYaml .Values.custom.scheduler_resources | nindent 12 }} + {{- end }} args: - --logtostderr - --scheduler-conf=/volcano.scheduler/{{base 
.Values.basic.scheduler_config_file}}
diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml
index dd617ca65a..6e2fcbe68a 100644
--- a/installer/volcano-development.yaml
+++ b/installer/volcano-development.yaml
@@ -153,8 +153,6 @@ spec:
           image: volcanosh/vc-webhook-manager:latest
           imagePullPolicy: Always
           name: admission
-          resources:
-            null
           volumeMounts:
             - mountPath: /admission.local.config/certificates
               name: admission-certs
@@ -187,8 +185,6 @@ spec:
       restartPolicy: Never
       containers:
         - name: main
-          resources:
-            null
           image: volcanosh/vc-webhook-manager:latest
           imagePullPolicy: Always
           command: ["./gen-admission-secret.sh", "--service", "volcano-admission-service", "--namespace",
@@ -8620,8 +8616,6 @@ spec:
       priorityClassName: system-cluster-critical
       containers:
         - name: volcano-controllers
-          resources:
-            null
          image: volcanosh/vc-controller-manager:latest
          args:
            - --logtostderr
@@ -8791,8 +8785,6 @@ spec:
      containers:
        - name: volcano-scheduler
          image: volcanosh/vc-scheduler:latest
-          resources:
-            null
          args:
            - --logtostderr
            - --scheduler-conf=/volcano.scheduler/volcano-scheduler.conf

From 097ad3da3e99cbbcadcad93d5d1393d84d61bb0a Mon Sep 17 00:00:00 2001
From: chenfengyu
Date: Thu, 29 Jun 2023 00:41:35 +0800
Subject: [PATCH 11/12] do not count ReleasingFailed pods in podgroup running status

Signed-off-by: chenfengyu
---
 pkg/scheduler/framework/session.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/scheduler/framework/session.go b/pkg/scheduler/framework/session.go
index 6a7348f574..9b9ff2b1db 100644
--- a/pkg/scheduler/framework/session.go
+++ b/pkg/scheduler/framework/session.go
@@ -280,7 +280,9 @@ func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
 		}
 	}

-	status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]) + len(jobInfo.TaskStatusIndex[api.ReleasingFailed]))
+	// TODO:
+	// Should status.Running also count ReleasingFailed pods?
+	status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]))
 	status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed]))
 	status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded]))

From 69ff3ffd7f373936fe5666c31ba2a3d806cc9236 Mon Sep 17 00:00:00 2001
From: chenfengyu
Date: Thu, 29 Jun 2023 00:44:02 +0800
Subject: [PATCH 12/12] rename some variables

Signed-off-by: chenfengyu
---
 cmd/scheduler/app/options/options.go |  4 ++--
 pkg/scheduler/api/helpers.go         | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cmd/scheduler/app/options/options.go b/cmd/scheduler/app/options/options.go
index db669ea4bc..a0b5926777 100644
--- a/cmd/scheduler/app/options/options.go
+++ b/cmd/scheduler/app/options/options.go
@@ -43,7 +43,7 @@ const (
 	defaultPercentageOfNodesToFind = 100
 	defaultLockObjectNamespace     = "volcano-system"
 	defaultGracePeriodSeconds      = 30
-	defaultGracePeriodFactor       = 1.1
+	defaultGracePeriodSecondsWait  = 3
 )

 // ServerOption is the main context object for the controller manager. 
@@ -135,7 +135,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
 	fs.BoolVar(&s.EnableCacheDumper, "cache-dumper", true, "Enable the cache dumper, it's true by default")

 	fs.Int64Var(&s.GracePeriodSeconds, "grace-period", defaultGracePeriodSeconds, "default grace period in seconds for pods that do not set terminationGracePeriodSeconds")
-	fs.Int64Var(&s.GracePeriodSecondsWait, "grace-period-wait", defaultGracePeriodFactor, "extra seconds to wait after a pod's grace period expires before treating the deletion as failed")
+	fs.Int64Var(&s.GracePeriodSecondsWait, "grace-period-wait", defaultGracePeriodSecondsWait, "extra seconds to wait after a pod's grace period expires before treating the deletion as failed")
 }

 // CheckOptionOrDie check lock-object-namespace when LeaderElection is enabled.
diff --git a/pkg/scheduler/api/helpers.go b/pkg/scheduler/api/helpers.go
index 3ca103adab..fb2c26aa2b 100644
--- a/pkg/scheduler/api/helpers.go
+++ b/pkg/scheduler/api/helpers.go
@@ -36,16 +36,16 @@ func PodKey(pod *v1.Pod) TaskID {

 func getTaskStatus(pod *v1.Pod) TaskStatus {
 	opts := options.ServerOpts
-	gracePeriodSeconds := opts.GracePeriodSeconds
+	waitTime := opts.GracePeriodSeconds
 	if pod.Spec.TerminationGracePeriodSeconds != nil {
 		// default grace period
-		gracePeriodSeconds = *pod.Spec.TerminationGracePeriodSeconds
+		waitTime = *pod.Spec.TerminationGracePeriodSeconds
 	}
-	gracePeriodSeconds += opts.GracePeriodSecondsWait
+	waitTime += opts.GracePeriodSecondsWait
 	switch pod.Status.Phase {
 	case v1.PodRunning:
 		if pod.DeletionTimestamp != nil &&
-			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= gracePeriodSeconds {
+			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= waitTime {
 			return Releasing
 		} else if pod.DeletionTimestamp != nil {
 			return ReleasingFailed
@@ -54,7 +54,7 @@
 		return Running
 	case v1.PodPending:
 		if pod.DeletionTimestamp != nil &&
-			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= gracePeriodSeconds {
+			time.Now().Unix()-pod.DeletionTimestamp.Unix() <= waitTime {
 			return Releasing
 		} else if pod.DeletionTimestamp != nil {
 			return ReleasingFailed