diff --git a/README.md b/README.md
index 09a959c403..4befc27381 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,10 @@ make TAG=latest generate-yaml
kubectl create -f _output/release/volcano-monitoring-latest.yaml
```
+### Install dashboard
+
+Please follow the [Volcano Dashboard](https://github.com/volcano-sh/dashboard#volcano-dashboard) guide to install the Volcano dashboard.
+
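+For reference, installation typically comes down to applying the dashboard deployment manifest from that repository; the snippet below is only a minimal sketch, and the manifest path is an assumption, so check the dashboard guide for the authoritative command.
+
+```
+# Assumed manifest location; verify against the Volcano Dashboard guide.
+kubectl apply -f https://raw.githubusercontent.com/volcano-sh/dashboard/main/deployment/volcano-dashboard.yaml
+```
+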
## Kubernetes compatibility
| | Kubernetes 1.17 | Kubernetes 1.18 | Kubernetes 1.19 | Kubernetes 1.20 | Kubernetes 1.21 | Kubernetes 1.22 | Kubernetes 1.23 | Kubernetes 1.24 | Kubernetes 1.25 | Kubernetes 1.26 | Kubernetes 1.27 | Kubernetes 1.28 | Kubernetes 1.29 |Kubernetes 1.30 |Kubernetes 1.31 |
diff --git a/benchmark/README.md b/benchmark/README.md
index b81a7f69b2..5de8952b26 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -13,13 +13,13 @@ results based on their own hardware. This document is for demonstration purposes
The information about the environment used for this test is as follows:
-| 属性 | 值 |
+| Attribute | Value |
|------------|---------|
-| 操作系统 | Mac |
+| Operating System | Mac |
| Arch | Arm |
-| CPU核数 | 12 |
-| 内存 | 32GB |
-| Volcano 版本 | v1.10.0 |
+| CPU Cores | 12 |
+| Memory | 32GB |
+| Volcano Version | v1.10.0 |
## Test Procedure
@@ -141,7 +141,7 @@ cd benchmark/sh
./benchmark.sh 6
```
-#### 测试结果:
+#### Test Results:
Test results are output to benchmark/img/res/, with files like g1.png, g2.png, and g3.png.
@@ -154,7 +154,7 @@ Test results are output to benchmark/img/res/, with files like g1.png, g2.png, a
| 5 | |
| 6 | |
-#### 指标实时观测
+#### Real-Time Metrics Observation
Metrics can be monitored in real-time via Grafana. Run the following command to open Grafana in your local browser
at http://localhost:3000. The default username is `admin` and the password is `prom-operator`:
diff --git a/docs/ut_coverage/UT_coverage_v1.4.0.html b/docs/ut_coverage/UT_coverage_v1.4.0.html
deleted file mode 100644
index a341bdf680..0000000000
--- a/docs/ut_coverage/UT_coverage_v1.4.0.html
+++ /dev/null
@@ -1,20476 +0,0 @@
-
-
-
-
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "os"
- "path/filepath"
-
- "github.com/spf13/cobra"
-)
-
-type commonFlags struct {
- Master string
- Kubeconfig string
-}
-
-func initFlags(cmd *cobra.Command, cf *commonFlags) {
- cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
-
- kubeConfFile := os.Getenv("KUBECONFIG")
- if kubeConfFile == "" {
- if home := homeDir(); home != "" {
- kubeConfFile = filepath.Join(home, ".kube", "config")
- }
- }
- cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type deleteFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-var deleteJobFlags = &deleteFlags{}
-
-// InitDeleteFlags init the delete command flags.
-func InitDeleteFlags(cmd *cobra.Command) {
- initFlags(cmd, &deleteJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&deleteJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&deleteJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// DeleteJob delete the job.
-func DeleteJob() error {
- config, err := util.BuildConfig(deleteJobFlags.Master, deleteJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if deleteJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to delete a particular job")
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- err = jobClient.BatchV1alpha1().Jobs(deleteJobFlags.Namespace).Delete(context.TODO(), deleteJobFlags.JobName, metav1.DeleteOptions{})
- if err != nil {
- return err
- }
- fmt.Printf("delete job %v successfully\n", deleteJobFlags.JobName)
- return nil
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "io"
- "os"
- "strings"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type listFlags struct {
- commonFlags
-
- Namespace string
- SchedulerName string
- allNamespace bool
- selector string
-}
-
-const (
-
- // Name name etc below key words are used in job print format
- Name string = "Name"
- // Creation create
- Creation string = "Creation"
- // Phase phase
- Phase string = "Phase"
- // Replicas replicas
- Replicas string = "Replicas"
- // Min minimum
- Min string = "Min"
- // Scheduler scheduler
- Scheduler string = "Scheduler"
- // Pending pending
- Pending string = "Pending"
- // Running running
- Running string = "Running"
- // Succeeded success
- Succeeded string = "Succeeded"
- // Terminating terminating
- Terminating string = "Terminating"
- // Version version
- Version string = "Version"
- // Failed failed
- Failed string = "Failed"
- // Unknown pod
- Unknown string = "Unknown"
- // RetryCount retry count
- RetryCount string = "RetryCount"
- // JobType job type
- JobType string = "JobType"
- // Namespace job namespace
- Namespace string = "Namespace"
-)
-
-var listJobFlags = &listFlags{}
-
-// InitListFlags init list command flags.
-func InitListFlags(cmd *cobra.Command) {
- initFlags(cmd, &listJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&listJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&listJobFlags.SchedulerName, "scheduler", "S", "", "list job with specified scheduler name")
- cmd.Flags().BoolVarP(&listJobFlags.allNamespace, "all-namespaces", "", false, "list jobs in all namespaces")
- cmd.Flags().StringVarP(&listJobFlags.selector, "selector", "", "", "fuzzy matching jobName")
-}
-
-// ListJobs lists all jobs details.
-func ListJobs() error {
- config, err := util.BuildConfig(listJobFlags.Master, listJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if listJobFlags.allNamespace {
- listJobFlags.Namespace = ""
- }
- jobClient := versioned.NewForConfigOrDie(config)
- jobs, err := jobClient.BatchV1alpha1().Jobs(listJobFlags.Namespace).List(context.TODO(), metav1.ListOptions{})
- if err != nil {
- return err
- }
-
- if len(jobs.Items) == 0 {
- fmt.Printf("No resources found\n")
- return nil
- }
- PrintJobs(jobs, os.Stdout)
-
- return nil
-}
-
-// PrintJobs prints all jobs details.
-func PrintJobs(jobs *v1alpha1.JobList, writer io.Writer) {
- maxLenInfo := getMaxLen(jobs)
-
- titleFormat := "%%-%ds%%-15s%%-12s%%-12s%%-12s%%-6s%%-10s%%-10s%%-12s%%-10s%%-12s%%-10s\n"
- contentFormat := "%%-%ds%%-15s%%-12s%%-12s%%-12d%%-6d%%-10d%%-10d%%-12d%%-10d%%-12d%%-10d\n"
-
- var err error
- if listJobFlags.allNamespace {
- _, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds"+titleFormat, maxLenInfo[1], maxLenInfo[0]),
- Namespace, Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
- } else {
- _, err = fmt.Fprintf(writer, fmt.Sprintf(titleFormat, maxLenInfo[0]),
- Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
- }
- if err != nil {
- fmt.Printf("Failed to print list command result: %s.\n", err)
- }
-
- for _, job := range jobs.Items {
- if listJobFlags.SchedulerName != "" && listJobFlags.SchedulerName != job.Spec.SchedulerName {
- continue
- }
- if !strings.Contains(job.Name, listJobFlags.selector) {
- continue
- }
- replicas := int32(0)
- for _, ts := range job.Spec.Tasks {
- replicas += ts.Replicas
- }
- jobType := job.ObjectMeta.Labels[v1alpha1.JobTypeKey]
- if jobType == "" {
- jobType = "Batch"
- }
-
- if listJobFlags.allNamespace {
- _, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds"+contentFormat, maxLenInfo[1], maxLenInfo[0]),
- job.Namespace, job.Name, job.CreationTimestamp.Format("2006-01-02"), job.Status.State.Phase, jobType, replicas,
- job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
- } else {
- _, err = fmt.Fprintf(writer, fmt.Sprintf(contentFormat, maxLenInfo[0]),
- job.Name, job.CreationTimestamp.Format("2006-01-02"), job.Status.State.Phase, jobType, replicas,
- job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
- }
- if err != nil {
- fmt.Printf("Failed to print list command result: %s.\n", err)
- }
- }
-}
-
-func getMaxLen(jobs *v1alpha1.JobList) []int {
- maxNameLen := len(Name)
- maxNamespaceLen := len(Namespace)
- for _, job := range jobs.Items {
- if len(job.Name) > maxNameLen {
- maxNameLen = len(job.Name)
- }
- if len(job.Namespace) > maxNamespaceLen {
- maxNamespaceLen = len(job.Namespace)
- }
- }
-
- return []int{maxNameLen + 3, maxNamespaceLen + 3}
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type resumeFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-var resumeJobFlags = &resumeFlags{}
-
-// InitResumeFlags init resume command flags.
-func InitResumeFlags(cmd *cobra.Command) {
- initFlags(cmd, &resumeJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&resumeJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&resumeJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// ResumeJob resumes the job.
-func ResumeJob() error {
- config, err := util.BuildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if resumeJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to resume a particular job")
- return err
- }
-
- return createJobCommand(config,
- resumeJobFlags.Namespace, resumeJobFlags.JobName,
- v1alpha1.ResumeJobAction)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "io/ioutil"
- "strings"
-
- "github.com/spf13/cobra"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "sigs.k8s.io/yaml"
-
- vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type runFlags struct {
- commonFlags
-
- Name string
- Namespace string
- Image string
-
- MinAvailable int
- Replicas int
- Requests string
- Limits string
- SchedulerName string
- FileName string
-}
-
-var launchJobFlags = &runFlags{}
-
-// InitRunFlags init the run flags.
-func InitRunFlags(cmd *cobra.Command) {
- initFlags(cmd, &launchJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&launchJobFlags.Image, "image", "i", "busybox", "the container image of job")
- cmd.Flags().StringVarP(&launchJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&launchJobFlags.Name, "name", "N", "", "the name of job")
- cmd.Flags().IntVarP(&launchJobFlags.MinAvailable, "min", "m", 1, "the minimal available tasks of job")
- cmd.Flags().IntVarP(&launchJobFlags.Replicas, "replicas", "r", 1, "the total tasks of job")
- cmd.Flags().StringVarP(&launchJobFlags.Requests, "requests", "R", "cpu=1000m,memory=100Mi", "the resource request of the task")
- cmd.Flags().StringVarP(&launchJobFlags.Limits, "limits", "L", "cpu=1000m,memory=100Mi", "the resource limit of the task")
- cmd.Flags().StringVarP(&launchJobFlags.SchedulerName, "scheduler", "S", "volcano", "the scheduler for this job")
- cmd.Flags().StringVarP(&launchJobFlags.FileName, "filename", "f", "", "the yaml file of job")
-}
-
-var jobName = "job.volcano.sh"
-
-// RunJob creates the job.
-func RunJob() error {
- config, err := util.BuildConfig(launchJobFlags.Master, launchJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if launchJobFlags.Name == "" && launchJobFlags.FileName == "" {
- err = fmt.Errorf("job name cannot be left blank")
- return err
- }
-
- req, err := populateResourceListV1(launchJobFlags.Requests)
- if err != nil {
- return err
- }
-
- limit, err := populateResourceListV1(launchJobFlags.Limits)
- if err != nil {
- return err
- }
-
- job, err := readFile(launchJobFlags.FileName)
- if err != nil {
- return err
- }
-
- if job == nil {
- job = constructLaunchJobFlagsJob(launchJobFlags, req, limit)
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- newJob, err := jobClient.BatchV1alpha1().Jobs(launchJobFlags.Namespace).Create(context.TODO(), job, metav1.CreateOptions{})
- if err != nil {
- return err
- }
-
- if newJob.Spec.Queue == "" {
- newJob.Spec.Queue = "default"
- }
-
- fmt.Printf("run job %v successfully\n", newJob.Name)
-
- return nil
-}
-
-func readFile(filename string) (*vcbatch.Job, error) {
- if filename == "" {
- return nil, nil
- }
-
- if !strings.Contains(filename, ".yaml") && !strings.Contains(filename, ".yml") {
- return nil, fmt.Errorf("only support yaml file")
- }
-
- file, err := ioutil.ReadFile(filename)
- if err != nil {
- return nil, fmt.Errorf("failed to read file, err: %v", err)
- }
-
- var job vcbatch.Job
- if err := yaml.Unmarshal(file, &job); err != nil {
- return nil, fmt.Errorf("failed to unmarshal file, err: %v", err)
- }
-
- return &job, nil
-}
-
-func constructLaunchJobFlagsJob(launchJobFlags *runFlags, req, limit v1.ResourceList) *vcbatch.Job {
- return &vcbatch.Job{
- ObjectMeta: metav1.ObjectMeta{
- Name: launchJobFlags.Name,
- Namespace: launchJobFlags.Namespace,
- },
- Spec: vcbatch.JobSpec{
- MinAvailable: int32(launchJobFlags.MinAvailable),
- SchedulerName: launchJobFlags.SchedulerName,
- Tasks: []vcbatch.TaskSpec{
- {
- Replicas: int32(launchJobFlags.Replicas),
-
- Template: v1.PodTemplateSpec{
- ObjectMeta: metav1.ObjectMeta{
- Name: launchJobFlags.Name,
- Labels: map[string]string{jobName: launchJobFlags.Name},
- },
- Spec: v1.PodSpec{
- RestartPolicy: v1.RestartPolicyNever,
- Containers: []v1.Container{
- {
- Image: launchJobFlags.Image,
- Name: launchJobFlags.Name,
- ImagePullPolicy: v1.PullIfNotPresent,
- Resources: v1.ResourceRequirements{
- Limits: limit,
- Requests: req,
- },
- },
- },
- },
- },
- },
- },
- },
- }
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type suspendFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-var suspendJobFlags = &suspendFlags{}
-
-// InitSuspendFlags init suspend related flags.
-func InitSuspendFlags(cmd *cobra.Command) {
- initFlags(cmd, &suspendJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&suspendJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&suspendJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// SuspendJob suspends the job.
-func SuspendJob() error {
- config, err := util.BuildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if suspendJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to suspend a particular job")
- return err
- }
-
- return createJobCommand(config,
- suspendJobFlags.Namespace, suspendJobFlags.JobName,
- v1alpha1.AbortJobAction)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "os"
- "strings"
- "time"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- "k8s.io/client-go/rest"
-
- vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-func homeDir() string {
- if h := os.Getenv("HOME"); h != "" {
- return h
- }
- return os.Getenv("USERPROFILE") // windows
-}
-
-// populateResourceListV1 takes strings of form <resourceName1>=<value1>,<resourceName1>=<value2>
-// and returns ResourceList.
-func populateResourceListV1(spec string) (v1.ResourceList, error) {
- // empty input gets a nil response to preserve generator test expected behaviors
- if spec == "" {
- return nil, nil
- }
-
- result := v1.ResourceList{}
- resourceStatements := strings.Split(spec, ",")
- for _, resourceStatement := range resourceStatements {
- parts := strings.Split(resourceStatement, "=")
- if len(parts) != 2 {
- return nil, fmt.Errorf("invalid argument syntax %v, expected <resource>=<value>", resourceStatement)
- }
- resourceName := v1.ResourceName(parts[0])
- resourceQuantity, err := resource.ParseQuantity(parts[1])
- if err != nil {
- return nil, err
- }
- result[resourceName] = resourceQuantity
- }
- return result, nil
-}
-
-func createJobCommand(config *rest.Config, ns, name string, action vcbus.Action) error {
- jobClient := versioned.NewForConfigOrDie(config)
- job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
- cmd := &vcbus.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- job.Name, strings.ToLower(string(action))),
- Namespace: job.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := jobClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-func translateTimestampSince(timestamp metav1.Time) string {
- if timestamp.IsZero() {
- return "<unknown>"
- }
- return HumanDuration(time.Since(timestamp.Time))
-}
-
-// HumanDuration translate time.Duration to human readable time string.
-func HumanDuration(d time.Duration) string {
- // Allow deviation no more than 2 seconds(excluded) to tolerate machine time
- // inconsistence, it can be considered as almost now.
- if seconds := int(d.Seconds()); seconds < -1 {
- return "<invalid>"
- } else if seconds < 0 {
- return "0s"
- } else if seconds < 60*2 {
- return fmt.Sprintf("%ds", seconds)
- }
- minutes := int(d / time.Minute)
- if minutes < 10 {
- s := int(d/time.Second) % 60
- if s == 0 {
- return fmt.Sprintf("%dm", minutes)
- }
- return fmt.Sprintf("%dm%ds", minutes, s)
- } else if minutes < 60*3 {
- return fmt.Sprintf("%dm", minutes)
- }
- hours := int(d / time.Hour)
- if hours < 8 {
- m := int(d/time.Minute) % 60
- if m == 0 {
- return fmt.Sprintf("%dh", hours)
- }
- return fmt.Sprintf("%dh%dm", hours, m)
- } else if hours < 48 {
- return fmt.Sprintf("%dh", hours)
- } else if hours < 24*8 {
- h := hours % 24
- if h == 0 {
- return fmt.Sprintf("%dd", hours/24)
- }
- return fmt.Sprintf("%dd%dh", hours/24, h)
- } else if hours < 24*365*2 {
- return fmt.Sprintf("%dd", hours/24)
- } else if hours < 24*365*8 {
- return fmt.Sprintf("%dy%dd", hours/24/365, (hours/24)%365)
- }
- return fmt.Sprintf("%dy", hours/24/365)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "encoding/json"
- "fmt"
- "io"
- "os"
- "strings"
-
- "github.com/spf13/cobra"
-
- coreV1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/kubernetes"
- "k8s.io/client-go/rest"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type viewFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-// level of print indent.
-const (
- Level0 = iota
- Level1
- Level2
-)
-
-var viewJobFlags = &viewFlags{}
-
-// InitViewFlags init the view command flags.
-func InitViewFlags(cmd *cobra.Command) {
- initFlags(cmd, &viewJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&viewJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&viewJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// ViewJob gives full details of the job.
-func ViewJob() error {
- config, err := util.BuildConfig(viewJobFlags.Master, viewJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if viewJobFlags.JobName == "" {
- err := fmt.Errorf("job name (specified by --name or -N) is mandatory to view a particular job")
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- job, err := jobClient.BatchV1alpha1().Jobs(viewJobFlags.Namespace).Get(context.TODO(), viewJobFlags.JobName, metav1.GetOptions{})
- if err != nil {
- return err
- }
- if job == nil {
- fmt.Printf("No resources found\n")
- return nil
- }
- PrintJobInfo(job, os.Stdout)
- PrintEvents(GetEvents(config, job), os.Stdout)
- return nil
-}
-
-// PrintJobInfo print the job detailed info into writer.
-func PrintJobInfo(job *v1alpha1.Job, writer io.Writer) {
- WriteLine(writer, Level0, "Name: \t%s\n", job.Name)
- WriteLine(writer, Level0, "Namespace: \t%s\n", job.Namespace)
- if len(job.Labels) > 0 {
- label, _ := json.Marshal(job.Labels)
- WriteLine(writer, Level0, "Labels: \t%s\n", string(label))
- } else {
- WriteLine(writer, Level0, "Labels: \t<none>\n")
- }
- if len(job.Annotations) > 0 {
- annotation, _ := json.Marshal(job.Annotations)
- WriteLine(writer, Level0, "Annotations:\t%s\n", string(annotation))
- } else {
- WriteLine(writer, Level0, "Annotations:\t<none>\n")
- }
- WriteLine(writer, Level0, "API Version:\t%s\n", job.APIVersion)
- WriteLine(writer, Level0, "Kind: \t%s\n", job.Kind)
-
- WriteLine(writer, Level0, "Metadata:\n")
- WriteLine(writer, Level1, "Creation Timestamp:\t%s\n", job.CreationTimestamp)
- WriteLine(writer, Level1, "Generate Name: \t%s\n", job.GenerateName)
- WriteLine(writer, Level1, "Generation: \t%d\n", job.Generation)
- WriteLine(writer, Level1, "Resource Version: \t%s\n", job.ResourceVersion)
- WriteLine(writer, Level1, "Self Link: \t%s\n", job.SelfLink)
- WriteLine(writer, Level1, "UID: \t%s\n", job.UID)
-
- WriteLine(writer, Level0, "Spec:\n")
- WriteLine(writer, Level1, "Min Available: \t%d\n", job.Spec.MinAvailable)
- WriteLine(writer, Level1, "Plugins:\n")
- WriteLine(writer, Level2, "Env:\t%v\n", job.Spec.Plugins["env"])
- WriteLine(writer, Level2, "Ssh:\t%v\n", job.Spec.Plugins["ssh"])
- WriteLine(writer, Level1, "Scheduler Name: \t%s\n", job.Spec.SchedulerName)
- WriteLine(writer, Level1, "Tasks:\n")
- for i := 0; i < len(job.Spec.Tasks); i++ {
- WriteLine(writer, Level2, "Name:\t%s\n", job.Spec.Tasks[i].Name)
- WriteLine(writer, Level2, "Replicas:\t%d\n", job.Spec.Tasks[i].Replicas)
- WriteLine(writer, Level2, "Template:\n")
- WriteLine(writer, Level2+1, "Metadata:\n")
- WriteLine(writer, Level2+2, "Annotations:\n")
- WriteLine(writer, Level2+3, "Cri . Cci . Io / Container - Type: \t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["cri.cci.io/container-type"])
- WriteLine(writer, Level2+3, "Kubernetes . Io / Availablezone: \t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["kubernetes.io/availablezone"])
- WriteLine(writer, Level2+3, "Network . Alpha . Kubernetes . Io / Network:\t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["network.alpha.kubernetes.io/network"])
- WriteLine(writer, Level2+2, "Creation Timestamp:\t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.CreationTimestamp)
-
- WriteLine(writer, Level2+1, "Spec:\n")
- WriteLine(writer, Level2+2, "Containers:\n")
- for j := 0; j < len(job.Spec.Tasks[i].Template.Spec.Containers); j++ {
- WriteLine(writer, Level2+3, "Command:\n")
- for k := 0; k < len(job.Spec.Tasks[i].Template.Spec.Containers[j].Command); k++ {
- WriteLine(writer, Level2+4, "%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Command[k])
- }
- WriteLine(writer, Level2+3, "Image:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Image)
- WriteLine(writer, Level2+3, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Name)
- WriteLine(writer, Level2+3, "Ports:\n")
- for k := 0; k < len(job.Spec.Tasks[i].Template.Spec.Containers[j].Ports); k++ {
- WriteLine(writer, Level2+4, "Container Port:\t%d\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Ports[k].ContainerPort)
- WriteLine(writer, Level2+4, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Ports[k].Name)
- }
- WriteLine(writer, Level2+3, "Resources:\n")
- WriteLine(writer, Level2+4, "Limits:\n")
- WriteLine(writer, Level2+5, "Cpu: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Limits.Cpu())
- WriteLine(writer, Level2+5, "Memory:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Limits.Memory())
- WriteLine(writer, Level2+4, "Requests:\n")
- WriteLine(writer, Level2+5, "Cpu: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Requests.Cpu())
- WriteLine(writer, Level2+5, "Memory:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Requests.Memory())
- WriteLine(writer, Level2+4, "Working Dir:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].WorkingDir)
- }
- WriteLine(writer, Level2+2, "Image Pull Secrets:\n")
- for j := 0; j < len(job.Spec.Tasks[i].Template.Spec.ImagePullSecrets); j++ {
- WriteLine(writer, Level2+3, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.ImagePullSecrets[j].Name)
- }
- WriteLine(writer, Level2+2, "Restart Policy: \t%s\n", job.Spec.Tasks[i].Template.Spec.RestartPolicy)
- }
-
- WriteLine(writer, Level0, "Status:\n")
- if job.Status.Succeeded > 0 {
- WriteLine(writer, Level1, "Succeeded: \t%d\n", job.Status.Succeeded)
- }
- if job.Status.Pending > 0 {
- WriteLine(writer, Level1, "Pending: \t%d\n", job.Status.Pending)
- }
- if job.Status.Running > 0 {
- WriteLine(writer, Level1, "Running: \t%d\n", job.Status.Running)
- }
- if job.Status.Failed > 0 {
- WriteLine(writer, Level1, "Failed: \t%d\n", job.Status.Failed)
- }
- if job.Status.Terminating > 0 {
- WriteLine(writer, Level1, "Terminating: \t%d\n", job.Status.Terminating)
- }
- if job.Status.Unknown > 0 {
- WriteLine(writer, Level1, "Unknown: \t%d\n", job.Status.Unknown)
- }
- if job.Status.RetryCount > 0 {
- WriteLine(writer, Level1, "RetryCount: \t%d\n", job.Status.RetryCount)
- }
- if job.Status.MinAvailable > 0 {
- WriteLine(writer, Level1, "Min Available:\t%d\n", job.Status.MinAvailable)
- }
- if job.Status.Version > 0 {
- WriteLine(writer, Level1, "Version: \t%d\n", job.Status.Version)
- }
-
- WriteLine(writer, Level1, "State:\n")
- WriteLine(writer, Level2, "Phase:\t%s\n", job.Status.State.Phase)
- if len(job.Status.ControlledResources) > 0 {
- WriteLine(writer, Level1, "Controlled Resources:\n")
- for key, value := range job.Status.ControlledResources {
- WriteLine(writer, Level2, "%s: \t%s\n", key, value)
- }
- }
-}
-
-// PrintEvents print event info to writer.
-func PrintEvents(events []coreV1.Event, writer io.Writer) {
- if len(events) > 0 {
- WriteLine(writer, Level0, "%s:\n%-15s\t%-40s\t%-30s\t%-40s\t%s\n", "Events", "Type", "Reason", "Age", "Form", "Message")
- WriteLine(writer, Level0, "%-15s\t%-40s\t%-30s\t%-40s\t%s\n", "-------", "-------", "-------", "-------", "-------")
- for _, e := range events {
- var interval string
- if e.Count > 1 {
- interval = fmt.Sprintf("%s (x%d over %s)", translateTimestampSince(e.LastTimestamp), e.Count, translateTimestampSince(e.FirstTimestamp))
- } else {
- interval = translateTimestampSince(e.FirstTimestamp)
- }
- EventSourceString := []string{e.Source.Component}
- if len(e.Source.Host) > 0 {
- EventSourceString = append(EventSourceString, e.Source.Host)
- }
- WriteLine(writer, Level0, "%-15v\t%-40v\t%-30s\t%-40s\t%v\n",
- e.Type,
- e.Reason,
- interval,
- strings.Join(EventSourceString, ", "),
- strings.TrimSpace(e.Message),
- )
- }
- } else {
- WriteLine(writer, Level0, "Events: \t<none>\n")
- }
-}
-
-// GetEvents get the job event by config.
-func GetEvents(config *rest.Config, job *v1alpha1.Job) []coreV1.Event {
- kubernetes, err := kubernetes.NewForConfig(config)
- if err != nil {
- fmt.Printf("%v\n", err)
- return nil
- }
- events, _ := kubernetes.CoreV1().Events(viewJobFlags.Namespace).List(context.TODO(), metav1.ListOptions{})
- var jobEvents []coreV1.Event
- for _, v := range events.Items {
- if strings.HasPrefix(v.ObjectMeta.Name, job.Name+".") {
- jobEvents = append(jobEvents, v)
- }
- }
- return jobEvents
-}
-
-// WriteLine write lines with specified indent.
-func WriteLine(writer io.Writer, spaces int, content string, params ...interface{}) {
- prefix := ""
- for i := 0; i < spaces; i++ {
- prefix += " "
- }
- fmt.Fprintf(writer, prefix+content, params...)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "os"
- "path/filepath"
-
- "github.com/spf13/cobra"
-)
-
-type commonFlags struct {
- Master string
- Kubeconfig string
- SchedulerName string
-}
-
-func initFlags(cmd *cobra.Command, cf *commonFlags) {
- cmd.Flags().StringVarP(&cf.SchedulerName, "scheduler", "", "volcano", "the scheduler for this job")
- cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
-
- kubeConfFile := os.Getenv("KUBECONFIG")
- if kubeConfFile == "" {
- if home := homeDir(); home != "" {
- kubeConfFile = filepath.Join(home, ".kube", "config")
- }
- }
- cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-type createFlags struct {
- commonFlags
-
- Name string
- Weight int32
- // State is state of Queue
- State string
-}
-
-var createQueueFlags = &createFlags{}
-
-// InitCreateFlags is used to init all flags during queue creating.
-func InitCreateFlags(cmd *cobra.Command) {
- initFlags(cmd, &createQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&createQueueFlags.Name, "name", "n", "test", "the name of queue")
- cmd.Flags().Int32VarP(&createQueueFlags.Weight, "weight", "w", 1, "the weight of the queue")
-
- cmd.Flags().StringVarP(&createQueueFlags.State, "state", "S", "Open", "the state of queue")
-}
-
-// CreateQueue create queue.
-func CreateQueue() error {
- config, err := buildConfig(createQueueFlags.Master, createQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- queue := &schedulingv1beta1.Queue{
- ObjectMeta: metav1.ObjectMeta{
- Name: createQueueFlags.Name,
- },
- Spec: schedulingv1beta1.QueueSpec{
- Weight: createQueueFlags.Weight,
- },
- Status: schedulingv1beta1.QueueStatus{
- State: schedulingv1beta1.QueueState(createQueueFlags.State),
- },
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- if _, err := queueClient.SchedulingV1beta1().Queues().Create(context.TODO(), queue, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
-
- "volcano.sh/apis/pkg/client/clientset/versioned"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-)
-
-type deleteFlags struct {
- commonFlags
-
- // Name is name of queue
- Name string
-}
-
-var deleteQueueFlags = &deleteFlags{}
-
-// InitDeleteFlags is used to init all flags during queue deleting.
-func InitDeleteFlags(cmd *cobra.Command) {
- initFlags(cmd, &deleteQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&deleteQueueFlags.Name, "name", "n", "", "the name of queue")
-}
-
-// DeleteQueue delete queue.
-func DeleteQueue() error {
- config, err := buildConfig(deleteQueueFlags.Master, deleteQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if len(deleteQueueFlags.Name) == 0 {
- return fmt.Errorf("queue name must be specified")
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- return queueClient.SchedulingV1beta1().Queues().Delete(context.TODO(), deleteQueueFlags.Name, metav1.DeleteOptions{})
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "io"
- "os"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-type getFlags struct {
- commonFlags
-
- Name string
-}
-
-var getQueueFlags = &getFlags{}
-
-// InitGetFlags is used to init all flags.
-func InitGetFlags(cmd *cobra.Command) {
- initFlags(cmd, &getQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&getQueueFlags.Name, "name", "n", "", "the name of queue")
-}
-
-// GetQueue gets a queue.
-func GetQueue() error {
- config, err := buildConfig(getQueueFlags.Master, getQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if getQueueFlags.Name == "" {
- err := fmt.Errorf("name is mandatory to get the particular queue details")
- return err
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- queue, err := queueClient.SchedulingV1beta1().Queues().Get(context.TODO(), getQueueFlags.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- PrintQueue(queue, os.Stdout)
-
- return nil
-}
-
-// PrintQueue prints queue information.
-func PrintQueue(queue *v1beta1.Queue, writer io.Writer) {
- _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
- Name, Weight, State, Inqueue, Pending, Running, Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
- _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
- queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
- queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "io"
- "os"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-type listFlags struct {
- commonFlags
-}
-
-const (
- // Weight of the queue
- Weight string = "Weight"
-
- // Name of queue
- Name string = "Name"
-
- // Pending status of the queue
- Pending string = "Pending"
-
- // Running status of the queue
- Running string = "Running"
-
- // Unknown status of the queue
- Unknown string = "Unknown"
-
- // Inqueue status of queue
- Inqueue string = "Inqueue"
-
- // State is state of queue
- State string = "State"
-)
-
-var listQueueFlags = &listFlags{}
-
-// InitListFlags inits all flags.
-func InitListFlags(cmd *cobra.Command) {
- initFlags(cmd, &listQueueFlags.commonFlags)
-}
-
-// ListQueue lists all the queue.
-func ListQueue() error {
- config, err := buildConfig(listQueueFlags.Master, listQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- queues, err := jobClient.SchedulingV1beta1().Queues().List(context.TODO(), metav1.ListOptions{})
- if err != nil {
- return err
- }
-
- if len(queues.Items) == 0 {
- fmt.Printf("No resources found\n")
- return nil
- }
- PrintQueues(queues, os.Stdout)
-
- return nil
-}
-
-// PrintQueues prints queue information.
-func PrintQueues(queues *v1beta1.QueueList, writer io.Writer) {
- _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
- Name, Weight, State, Inqueue, Pending, Running, Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
- for _, queue := range queues.Items {
- _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
- queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
- queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
- }
-}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
-
- "github.com/spf13/cobra"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "k8s.io/apimachinery/pkg/types"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-const (
- // ActionOpen is `open` action
- ActionOpen = "open"
- // ActionClose is `close` action
- ActionClose = "close"
- // ActionUpdate is `update` action
- ActionUpdate = "update"
-)
-
-type operateFlags struct {
- commonFlags
-
- // Name is name of queue
- Name string
- // Weight is weight of queue
- Weight int32
- // Action is operation action of queue
- Action string
-}
-
-var operateQueueFlags = &operateFlags{}
-
-// InitOperateFlags is used to init all flags during queue operating
-func InitOperateFlags(cmd *cobra.Command) {
- initFlags(cmd, &operateQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&operateQueueFlags.Name, "name", "n", "", "the name of queue")
- cmd.Flags().Int32VarP(&operateQueueFlags.Weight, "weight", "w", 0, "the weight of the queue")
- cmd.Flags().StringVarP(&operateQueueFlags.Action, "action", "a", "",
- "operate action to queue, valid actions are open, close, update")
-}
-
-// OperateQueue operates queue
-func OperateQueue() error {
- config, err := buildConfig(operateQueueFlags.Master, operateQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if len(operateQueueFlags.Name) == 0 {
- return fmt.Errorf("queue name must be specified")
- }
-
- var action v1alpha1.Action
-
- switch operateQueueFlags.Action {
- case ActionOpen:
- action = v1alpha1.OpenQueueAction
- case ActionClose:
- action = v1alpha1.CloseQueueAction
- case ActionUpdate:
- if operateQueueFlags.Weight == 0 {
- return fmt.Errorf("when %s queue %s, weight must be specified, "+
- "the value must be greater than 0", ActionUpdate, operateQueueFlags.Name)
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- patchBytes := []byte(fmt.Sprintf(`{"spec":{"weight":%d}}`, operateQueueFlags.Weight))
- _, err := queueClient.SchedulingV1beta1().Queues().Patch(context.TODO(),
- operateQueueFlags.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{})
-
- return err
- case "":
- return fmt.Errorf("action can not be null")
- default:
- return fmt.Errorf("action %s invalid, valid actions are %s, %s and %s",
- operateQueueFlags.Action, ActionOpen, ActionClose, ActionUpdate)
- }
-
- return createQueueCommand(config, action)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "os"
- "strings"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- // Initialize client auth plugin.
- _ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
- "k8s.io/client-go/rest"
- "k8s.io/client-go/tools/clientcmd"
-
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-func homeDir() string {
- if h := os.Getenv("HOME"); h != "" {
- return h
- }
- return os.Getenv("USERPROFILE") // windows
-}
-
-func buildConfig(master, kubeconfig string) (*rest.Config, error) {
- return clientcmd.BuildConfigFromFlags(master, kubeconfig)
-}
-
-func createQueueCommand(config *rest.Config, action busv1alpha1.Action) error {
- queueClient := versioned.NewForConfigOrDie(config)
- queue, err := queueClient.SchedulingV1beta1().Queues().Get(context.TODO(), operateQueueFlags.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- ctrlRef := metav1.NewControllerRef(queue, helpers.V1beta1QueueKind)
- cmd := &busv1alpha1.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- queue.Name, strings.ToLower(string(action))),
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := queueClient.BusV1alpha1().Commands("default").Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "context"
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "time"
-
- "github.com/spf13/cobra"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- "k8s.io/client-go/rest"
- "k8s.io/client-go/tools/clientcmd"
-
- vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-// CommonFlags are the flags that most command lines have.
-type CommonFlags struct {
- Master string
- Kubeconfig string
-}
-
-// InitFlags initializes the common flags for most command lines.
-func InitFlags(cmd *cobra.Command, cf *CommonFlags) {
- cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
-
- kubeConfFile := os.Getenv("KUBECONFIG")
- if kubeConfFile == "" {
- if home := HomeDir(); home != "" {
- kubeConfFile = filepath.Join(home, ".kube", "config")
- }
- }
- cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
-}
-
-// HomeDir gets the env $HOME.
-func HomeDir() string {
- if h := os.Getenv("HOME"); h != "" {
- return h
- }
- return os.Getenv("USERPROFILE") // windows
-}
-
-// BuildConfig builds the configure file for command lines.
-func BuildConfig(master, kubeconfig string) (*rest.Config, error) {
- return clientcmd.BuildConfigFromFlags(master, kubeconfig)
-}
-
-// PopulateResourceListV1 takes strings of form <resourceName1>=<value1>,<resourceName1>=<value2> and returns ResourceList.
-func PopulateResourceListV1(spec string) (v1.ResourceList, error) {
- // empty input gets a nil response to preserve generator test expected behaviors
- if spec == "" {
- return nil, nil
- }
-
- result := v1.ResourceList{}
- resourceStatements := strings.Split(spec, ",")
- for _, resourceStatement := range resourceStatements {
- parts := strings.Split(resourceStatement, "=")
- if len(parts) != 2 {
- return nil, fmt.Errorf("invalid argument syntax %v, expected <resource>=<value>", resourceStatement)
- }
- resourceName := v1.ResourceName(parts[0])
- resourceQuantity, err := resource.ParseQuantity(parts[1])
- if err != nil {
- return nil, err
- }
- result[resourceName] = resourceQuantity
- }
- return result, nil
-}
-
-// CreateQueueCommand executes a command such as open/close
-func CreateQueueCommand(vcClient *versioned.Clientset, ns, name string, action vcbus.Action) error {
- queue, err := vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), name, metav1.GetOptions{})
- if err != nil {
- return err
- }
- ctrlRef := metav1.NewControllerRef(queue, helpers.V1beta1QueueKind)
- cmd := &vcbus.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- queue.Name, strings.ToLower(string(action))),
- Namespace: queue.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := vcClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-// CreateJobCommand executes a command such as resume/suspend.
-func CreateJobCommand(config *rest.Config, ns, name string, action vcbus.Action) error {
- jobClient := versioned.NewForConfigOrDie(config)
- job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
- cmd := &vcbus.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- job.Name, strings.ToLower(string(action))),
- Namespace: job.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := jobClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-// TranslateTimestampSince translates the time stamp.
-func TranslateTimestampSince(timestamp metav1.Time) string {
- if timestamp.IsZero() {
- return "<unknown>"
- }
- return HumanDuration(time.Since(timestamp.Time))
-}
-
-// HumanDuration translate time.Duration to human readable time string.
-func HumanDuration(d time.Duration) string {
- // Allow deviation no more than 2 seconds(excluded) to tolerate machine time
- // inconsistence, it can be considered as almost now.
- if seconds := int(d.Seconds()); seconds < -1 {
- return "<invalid>"
- } else if seconds < 0 {
- return "0s"
- } else if seconds < 60*2 {
- return fmt.Sprintf("%ds", seconds)
- }
- minutes := int(d / time.Minute)
- if minutes < 10 {
- s := int(d/time.Second) % 60
- if s == 0 {
- return fmt.Sprintf("%dm", minutes)
- }
- return fmt.Sprintf("%dm%ds", minutes, s)
- } else if minutes < 60*3 {
- return fmt.Sprintf("%dm", minutes)
- }
- hours := int(d / time.Hour)
- if hours < 8 {
- m := int(d/time.Minute) % 60
- if m == 0 {
- return fmt.Sprintf("%dh", hours)
- }
- return fmt.Sprintf("%dh%dm", hours, m)
- } else if hours < 48 {
- return fmt.Sprintf("%dh", hours)
- } else if hours < 24*8 {
- h := hours % 24
- if h == 0 {
- return fmt.Sprintf("%dd", hours/24)
- }
- return fmt.Sprintf("%dd%dh", hours/24, h)
- } else if hours < 24*365*2 {
- return fmt.Sprintf("%dd", hours/24)
- } else if hours < 24*365*8 {
- return fmt.Sprintf("%dy%dd", hours/24/365, (hours/24)%365)
- }
- return fmt.Sprintf("%dy", hours/24/365)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vcancel
-
-import (
- "context"
- "fmt"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type cancelFlags struct {
- util.CommonFlags
-
- Namespace string
- JobName string
-}
-
-var cancelJobFlags = &cancelFlags{}
-
-// InitCancelFlags init the cancel command flags.
-func InitCancelFlags(cmd *cobra.Command) {
- util.InitFlags(cmd, &cancelJobFlags.CommonFlags)
-
- cmd.Flags().StringVarP(&cancelJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
- cmd.Flags().StringVarP(&cancelJobFlags.JobName, "name", "n", "", "the name of job")
-}
-
-// CancelJob cancel the job.
-func CancelJob() error {
- config, err := util.BuildConfig(cancelJobFlags.Master, cancelJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if cancelJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to cancel a particular job")
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- err = jobClient.BatchV1alpha1().Jobs(cancelJobFlags.Namespace).Delete(context.TODO(), cancelJobFlags.JobName, metav1.DeleteOptions{})
- if err != nil {
- return err
- }
- fmt.Printf("cancel job %v successfully\n", cancelJobFlags.JobName)
- return nil
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vresume
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type resumeFlags struct {
- util.CommonFlags
-
- Namespace string
- JobName string
-}
-
-var resumeJobFlags = &resumeFlags{}
-
-// InitResumeFlags init resume command flags.
-func InitResumeFlags(cmd *cobra.Command) {
- util.InitFlags(cmd, &resumeJobFlags.CommonFlags)
-
- cmd.Flags().StringVarP(&resumeJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
- cmd.Flags().StringVarP(&resumeJobFlags.JobName, "name", "n", "", "the name of job")
-}
-
-// ResumeJob resumes the job.
-func ResumeJob() error {
- config, err := util.BuildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if resumeJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to resume a particular job")
- return err
- }
-
- return util.CreateJobCommand(config,
- resumeJobFlags.Namespace, resumeJobFlags.JobName,
- v1alpha1.ResumeJobAction)
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vsuspend
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type suspendFlags struct {
- util.CommonFlags
-
- Namespace string
- JobName string
-}
-
-var suspendJobFlags = &suspendFlags{}
-
-// InitSuspendFlags initializes the suspend-related flags.
-func InitSuspendFlags(cmd *cobra.Command) {
- util.InitFlags(cmd, &suspendJobFlags.CommonFlags)
-
- cmd.Flags().StringVarP(&suspendJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
- cmd.Flags().StringVarP(&suspendJobFlags.JobName, "name", "n", "", "the name of job")
-}
-
-// SuspendJob suspends the job.
-func SuspendJob() error {
- config, err := util.BuildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if suspendJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to suspend a particular job")
- return err
- }
-
- return util.CreateJobCommand(config,
- suspendJobFlags.Namespace, suspendJobFlags.JobName,
- v1alpha1.AbortJobAction)
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package apis
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
-)
-
-// JobInfo caches a Volcano Job and its pods, indexed by task name and pod name.
-type JobInfo struct {
- Namespace string
- Name string
-
- Job *batch.Job
- Pods map[string]map[string]*v1.Pod
-}
-
-// Clone returns a copy of the JobInfo with a newly built pod map.
-func (ji *JobInfo) Clone() *JobInfo {
- job := &JobInfo{
- Namespace: ji.Namespace,
- Name: ji.Name,
- Job: ji.Job,
-
- Pods: make(map[string]map[string]*v1.Pod),
- }
-
- for key, pods := range ji.Pods {
- job.Pods[key] = make(map[string]*v1.Pod)
- for pn, pod := range pods {
- job.Pods[key][pn] = pod
- }
- }
-
- return job
-}
-
-// SetJob sets the Volcano Job values on the JobInfo struct.
-func (ji *JobInfo) SetJob(job *batch.Job) {
- ji.Name = job.Name
- ji.Namespace = job.Namespace
- ji.Job = job
-}
-
-// AddPod adds the k8s pod object values to the Pods field
-// of the JobInfo struct if it doesn't exist. Otherwise it returns an error.
-func (ji *JobInfo) AddPod(pod *v1.Pod) error {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
-		return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- _, found = pod.Annotations[batch.JobVersion]
- if !found {
- return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- if _, found := ji.Pods[taskName]; !found {
- ji.Pods[taskName] = make(map[string]*v1.Pod)
- }
- if _, found := ji.Pods[taskName][pod.Name]; found {
- return fmt.Errorf("duplicated pod")
- }
- ji.Pods[taskName][pod.Name] = pod
-
- return nil
-}
-
-// UpdatePod replaces the cached pod with the given k8s pod object.
-func (ji *JobInfo) UpdatePod(pod *v1.Pod) error {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
- _, found = pod.Annotations[batch.JobVersion]
- if !found {
- return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- if _, found := ji.Pods[taskName]; !found {
- return fmt.Errorf("can not find task %s in cache", taskName)
- }
- if _, found := ji.Pods[taskName][pod.Name]; !found {
- return fmt.Errorf("can not find pod <%s/%s> in cache",
- pod.Namespace, pod.Name)
- }
- ji.Pods[taskName][pod.Name] = pod
-
- return nil
-}
-
-// DeletePod deletes the given k8s pod from the JobInfo struct.
-func (ji *JobInfo) DeletePod(pod *v1.Pod) error {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
- _, found = pod.Annotations[batch.JobVersion]
- if !found {
- return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- if pods, found := ji.Pods[taskName]; found {
- delete(pods, pod.Name)
- if len(pods) == 0 {
- delete(ji.Pods, taskName)
- }
- }
-
- return nil
-}
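-
-// Illustrative sketch, not part of the original file: a minimal example of the
-// annotations AddPod expects on every pod it caches. It reuses the imports of
-// the file above; the function name is hypothetical.
-func exampleAddPod() error {
-	ji := &JobInfo{
-		Namespace: "default",
-		Name:      "demo-job",
-		Pods:      make(map[string]map[string]*v1.Pod),
-	}
-
-	pod := &v1.Pod{}
-	pod.Namespace = "default"
-	pod.Name = "demo-job-task1-0"
-	pod.Annotations = map[string]string{
-		batch.TaskSpecKey: "task1", // task the pod belongs to
-		batch.JobVersion:  "0",     // job version the pod was created for
-	}
-
-	// AddPod indexes the pod by task name and pod name; adding the same pod
-	// twice returns a "duplicated pod" error.
-	if err := ji.AddPod(pod); err != nil {
-		return err
-	}
-	fmt.Printf("cached %d pod(s) for task %q\n", len(ji.Pods["task1"]), "task1")
-	return nil
-}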
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package apis
-
-import (
- "fmt"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
-)
-
-// Request describes an event or action request targeting a Job, task, or queue.
-type Request struct {
- Namespace string
- JobName string
- TaskName string
- QueueName string
-
- Event v1alpha1.Event
- ExitCode int32
- Action v1alpha1.Action
- JobVersion int32
-}
-
-// String function returns the request in string format.
-func (r Request) String() string {
- return fmt.Sprintf(
- "Queue: %s, Job: %s/%s, Task:%s, Event:%s, ExitCode:%d, Action:%s, JobVersion: %d",
- r.QueueName, r.Namespace, r.JobName, r.TaskName, r.Event, r.ExitCode, r.Action, r.JobVersion)
-}
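-
-// Illustrative sketch, not part of the original file: what String() produces
-// for a typical request. It reuses the imports of the file above; the function
-// name is hypothetical.
-func exampleRequestString() {
-	r := Request{
-		Namespace: "default",
-		JobName:   "demo-job",
-		TaskName:  "task1",
-		QueueName: "default",
-		Event:     v1alpha1.OutOfSyncEvent,
-		Action:    v1alpha1.SyncJobAction,
-	}
-	// Prints roughly:
-	// Queue: default, Job: default/demo-job, Task:task1, Event:OutOfSync, ExitCode:0, Action:SyncJob, JobVersion: 0
-	fmt.Println(r.String())
-}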
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cache
-
-import (
- "fmt"
- "sync"
- "time"
-
- "golang.org/x/time/rate"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/apis"
-)
-
-type jobCache struct {
- sync.Mutex
-
- jobs map[string]*apis.JobInfo
- deletedJobs workqueue.RateLimitingInterface
-}
-
-func keyFn(ns, name string) string {
- return fmt.Sprintf("%s/%s", ns, name)
-}
-
-//JobKeyByName gets the key for the job name.
-func JobKeyByName(namespace string, name string) string {
- return keyFn(namespace, name)
-}
-
-//JobKeyByReq gets the key for the job request.
-func JobKeyByReq(req *apis.Request) string {
- return keyFn(req.Namespace, req.JobName)
-}
-
-//JobKey gets the "ns"/"name" format of the given job.
-func JobKey(job *v1alpha1.Job) string {
- return keyFn(job.Namespace, job.Name)
-}
-
-func jobTerminated(job *apis.JobInfo) bool {
- return job.Job == nil && len(job.Pods) == 0
-}
-
-func jobKeyOfPod(pod *v1.Pod) (string, error) {
- jobName, found := pod.Annotations[v1alpha1.JobNameKey]
- if !found {
- return "", fmt.Errorf("failed to find job name of pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- return keyFn(pod.Namespace, jobName), nil
-}
-
-// New creates a new job Cache.
-func New() Cache {
- queue := workqueue.NewMaxOfRateLimiter(
- workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
-		// 10 qps, 100 bucket size. This is only for retry speed and it's only the overall factor (not per item).
- &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
- )
-
- return &jobCache{
- jobs: map[string]*apis.JobInfo{},
- deletedJobs: workqueue.NewRateLimitingQueue(queue),
- }
-}
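-
-// Illustrative sketch, not part of the original file: the combined rate limiter
-// built above retries an individual item quickly at first (5ms, 10ms, 20ms, ...)
-// and backs off up to 180s, while the bucket limiter caps overall retries at
-// roughly 10 per second. It reuses the imports of the file above; the function
-// name is hypothetical.
-func exampleRetryDelays() {
-	rl := workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second)
-	for i := 0; i < 4; i++ {
-		// Each failure of the same item doubles its next retry delay.
-		fmt.Printf("retry %d after %v\n", i+1, rl.When("default/demo-job"))
-	}
-}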
-
-func (jc *jobCache) Get(key string) (*apis.JobInfo, error) {
- jc.Lock()
- defer jc.Unlock()
-
- job, found := jc.jobs[key]
- if !found {
- return nil, fmt.Errorf("failed to find job <%s>", key)
- }
-
- if job.Job == nil {
- return nil, fmt.Errorf("job <%s> is not ready", key)
- }
-
- return job.Clone(), nil
-}
-
-func (jc *jobCache) GetStatus(key string) (*v1alpha1.JobStatus, error) {
- jc.Lock()
- defer jc.Unlock()
-
- job, found := jc.jobs[key]
- if !found {
- return nil, fmt.Errorf("failed to find job <%s>", key)
- }
-
- if job.Job == nil {
- return nil, fmt.Errorf("job <%s> is not ready", key)
- }
-
- status := job.Job.Status
-
- return &status, nil
-}
-
-func (jc *jobCache) Add(job *v1alpha1.Job) error {
- jc.Lock()
- defer jc.Unlock()
- key := JobKey(job)
- if jobInfo, found := jc.jobs[key]; found {
- if jobInfo.Job == nil {
- jobInfo.SetJob(job)
-
- return nil
- }
- return fmt.Errorf("duplicated jobInfo <%v>", key)
- }
-
- jc.jobs[key] = &apis.JobInfo{
- Name: job.Name,
- Namespace: job.Namespace,
-
- Job: job,
- Pods: make(map[string]map[string]*v1.Pod),
- }
-
- return nil
-}
-
-func (jc *jobCache) Update(obj *v1alpha1.Job) error {
- jc.Lock()
- defer jc.Unlock()
-
- key := JobKey(obj)
- job, found := jc.jobs[key]
- if !found {
- return fmt.Errorf("failed to find job <%v>", key)
- }
- job.Job = obj
-
- return nil
-}
-
-func (jc *jobCache) Delete(obj *v1alpha1.Job) error {
- jc.Lock()
- defer jc.Unlock()
-
- key := JobKey(obj)
- jobInfo, found := jc.jobs[key]
- if !found {
- return fmt.Errorf("failed to find job <%v>", key)
- }
- jobInfo.Job = nil
- jc.deleteJob(jobInfo)
-
- return nil
-}
-
-func (jc *jobCache) AddPod(pod *v1.Pod) error {
- jc.Lock()
- defer jc.Unlock()
-
- key, err := jobKeyOfPod(pod)
- if err != nil {
- return err
- }
-
- job, found := jc.jobs[key]
- if !found {
- job = &apis.JobInfo{
- Pods: make(map[string]map[string]*v1.Pod),
- }
- jc.jobs[key] = job
- }
-
- return job.AddPod(pod)
-}
-
-func (jc *jobCache) UpdatePod(pod *v1.Pod) error {
- jc.Lock()
- defer jc.Unlock()
-
- key, err := jobKeyOfPod(pod)
- if err != nil {
- return err
- }
-
- job, found := jc.jobs[key]
- if !found {
- job = &apis.JobInfo{
- Pods: make(map[string]map[string]*v1.Pod),
- }
- jc.jobs[key] = job
- }
-
- return job.UpdatePod(pod)
-}
-
-func (jc *jobCache) DeletePod(pod *v1.Pod) error {
- jc.Lock()
- defer jc.Unlock()
-
- key, err := jobKeyOfPod(pod)
- if err != nil {
- return err
- }
-
- job, found := jc.jobs[key]
- if !found {
- job = &apis.JobInfo{
- Pods: make(map[string]map[string]*v1.Pod),
- }
- jc.jobs[key] = job
- }
-
- if err := job.DeletePod(pod); err != nil {
- return err
- }
-
- if jc.jobs[key].Job == nil {
- jc.deleteJob(job)
- }
-
- return nil
-}
-
-func (jc *jobCache) Run(stopCh <-chan struct{}) {
- wait.Until(jc.worker, 0, stopCh)
-}
-
-func (jc *jobCache) TaskCompleted(jobKey, taskName string) bool {
- jc.Lock()
- defer jc.Unlock()
-
- var taskReplicas, completed int32
-
- jobInfo, found := jc.jobs[jobKey]
- if !found {
- return false
- }
-
- taskPods, found := jobInfo.Pods[taskName]
-
- if !found {
- return false
- }
-
- if jobInfo.Job == nil {
- return false
- }
-
- for _, task := range jobInfo.Job.Spec.Tasks {
- if task.Name == taskName {
- taskReplicas = task.Replicas
- break
- }
- }
- if taskReplicas <= 0 {
- return false
- }
-
- for _, pod := range taskPods {
- if pod.Status.Phase == v1.PodSucceeded {
- completed++
- }
- }
- return completed >= taskReplicas
-}
-
-func (jc *jobCache) TaskFailed(jobKey, taskName string) bool {
- jc.Lock()
- defer jc.Unlock()
-
- var taskReplicas, retried, maxRetry int32
-
- jobInfo, found := jc.jobs[jobKey]
- if !found {
- return false
- }
-
- taskPods, found := jobInfo.Pods[taskName]
-
- if !found || jobInfo.Job == nil {
- return false
- }
-
- for _, task := range jobInfo.Job.Spec.Tasks {
- if task.Name == taskName {
- maxRetry = task.MaxRetry
- taskReplicas = task.Replicas
- break
- }
- }
-
- // maxRetry == -1 means no limit
- if taskReplicas == 0 || maxRetry == -1 {
- return false
- }
-
-	// Compatible with existing jobs: when MaxRetry is unset (0), default to 3 retries.
- if maxRetry == 0 {
- maxRetry = 3
- }
-
- for _, pod := range taskPods {
- if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodPending {
- for j := range pod.Status.InitContainerStatuses {
- stat := pod.Status.InitContainerStatuses[j]
- retried += stat.RestartCount
- }
- for j := range pod.Status.ContainerStatuses {
- stat := pod.Status.ContainerStatuses[j]
- retried += stat.RestartCount
- }
- }
- }
- return retried > maxRetry
-}
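-
-// Illustrative sketch, not part of the original file: how TaskFailed interprets
-// MaxRetry above. A value of -1 disables the limit, 0 falls back to the default
-// of 3 for compatibility, and any other value is used as-is. The helper name is
-// hypothetical.
-func effectiveMaxRetry(maxRetry int32) (limit int32, unlimited bool) {
-	if maxRetry == -1 {
-		return 0, true
-	}
-	if maxRetry == 0 {
-		return 3, false
-	}
-	return maxRetry, false
-}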
-
-func (jc *jobCache) worker() {
- for jc.processCleanupJob() {
- }
-}
-
-func (jc *jobCache) processCleanupJob() bool {
- obj, shutdown := jc.deletedJobs.Get()
- if shutdown {
- return false
- }
- defer jc.deletedJobs.Done(obj)
-
- job, ok := obj.(*apis.JobInfo)
- if !ok {
- klog.Errorf("failed to convert %v to *apis.JobInfo", obj)
- return true
- }
-
- jc.Mutex.Lock()
- defer jc.Mutex.Unlock()
-
- if jobTerminated(job) {
- jc.deletedJobs.Forget(obj)
- key := keyFn(job.Namespace, job.Name)
- delete(jc.jobs, key)
- klog.V(3).Infof("Job <%s> was deleted.", key)
- } else {
- // Retry
- jc.deleteJob(job)
- }
- return true
-}
-
-func (jc *jobCache) deleteJob(job *apis.JobInfo) {
- klog.V(3).Infof("Try to delete Job <%v/%v>",
- job.Namespace, job.Name)
-
- jc.deletedJobs.AddRateLimited(job)
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package garbagecollector
-
-import (
- "context"
- "fmt"
- "time"
-
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- batchinformers "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
- batchlisters "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/framework"
-)
-
-func init() {
- framework.RegisterController(&gccontroller{})
-}
-
-// gccontroller runs reflectors to watch for changes of managed API
-// objects. Currently it only watches Jobs. Triggered by Job creation
-// and updates, it enqueues Jobs that have non-nil `.spec.ttlSecondsAfterFinished`
-// to the `queue`. The gccontroller has workers that consume `queue` and check whether
-// the Job TTL has expired; if the Job TTL hasn't expired, it will re-add the
-// Job to the queue after the TTL is expected to expire; if the TTL has expired, the
-// worker will send requests to the API server to delete the Jobs accordingly.
-// This is implemented outside of Job controller for separation of concerns, and
-// because it will be extended to handle other finishable resource types.
-type gccontroller struct {
- vcClient vcclientset.Interface
-
- jobInformer batchinformers.JobInformer
-
- // A store of jobs
- jobLister batchlisters.JobLister
- jobSynced func() bool
-
- // queues that need to be updated.
- queue workqueue.RateLimitingInterface
-}
-
-func (gc *gccontroller) Name() string {
- return "gc-controller"
-}
-
-// Initialize creates an instance of gccontroller.
-func (gc *gccontroller) Initialize(opt *framework.ControllerOption) error {
- gc.vcClient = opt.VolcanoClient
- jobInformer := informerfactory.NewSharedInformerFactory(gc.vcClient, 0).Batch().V1alpha1().Jobs()
-
- gc.jobInformer = jobInformer
- gc.jobLister = jobInformer.Lister()
- gc.jobSynced = jobInformer.Informer().HasSynced
- gc.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
-
- jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: gc.addJob,
- UpdateFunc: gc.updateJob,
- })
-
- return nil
-}
-
-// Run starts the worker to clean up Jobs.
-func (gc *gccontroller) Run(stopCh <-chan struct{}) {
- defer gc.queue.ShutDown()
-
- klog.Infof("Starting garbage collector")
- defer klog.Infof("Shutting down garbage collector")
-
- go gc.jobInformer.Informer().Run(stopCh)
- if !cache.WaitForCacheSync(stopCh, gc.jobSynced) {
- return
- }
-
- go wait.Until(gc.worker, time.Second, stopCh)
-
- <-stopCh
-}
-
-func (gc *gccontroller) addJob(obj interface{}) {
- job := obj.(*v1alpha1.Job)
- klog.V(4).Infof("Adding job %s/%s", job.Namespace, job.Name)
-
- if job.DeletionTimestamp == nil && needsCleanup(job) {
- gc.enqueue(job)
- }
-}
-
-func (gc *gccontroller) updateJob(old, cur interface{}) {
- job := cur.(*v1alpha1.Job)
- klog.V(4).Infof("Updating job %s/%s", job.Namespace, job.Name)
-
- if job.DeletionTimestamp == nil && needsCleanup(job) {
- gc.enqueue(job)
- }
-}
-
-func (gc *gccontroller) enqueue(job *v1alpha1.Job) {
- klog.V(4).Infof("Add job %s/%s to cleanup", job.Namespace, job.Name)
- key, err := cache.MetaNamespaceKeyFunc(job)
- if err != nil {
- klog.Errorf("couldn't get key for object %#v: %v", job, err)
- return
- }
-
- gc.queue.Add(key)
-}
-
-func (gc *gccontroller) enqueueAfter(job *v1alpha1.Job, after time.Duration) {
- key, err := cache.MetaNamespaceKeyFunc(job)
- if err != nil {
- klog.Errorf("couldn't get key for object %#v: %v", job, err)
- return
- }
-
- gc.queue.AddAfter(key, after)
-}
-
-func (gc *gccontroller) worker() {
- for gc.processNextWorkItem() {
- }
-}
-
-func (gc *gccontroller) processNextWorkItem() bool {
- key, quit := gc.queue.Get()
- if quit {
- return false
- }
- defer gc.queue.Done(key)
-
- err := gc.processJob(key.(string))
- gc.handleErr(err, key)
-
- return true
-}
-
-func (gc *gccontroller) handleErr(err error, key interface{}) {
- if err == nil {
- gc.queue.Forget(key)
- return
- }
-
- klog.Errorf("error cleaning up Job %v, will retry: %v", key, err)
- gc.queue.AddRateLimited(key)
-}
-
-// processJob will check the Job's state and TTL and delete the Job when it
-// finishes and its TTL after finished has expired. If the Job hasn't finished or
-// its TTL hasn't expired, it will be added to the queue after the TTL is expected
-// to expire.
-// This function is not meant to be invoked concurrently with the same key.
-func (gc *gccontroller) processJob(key string) error {
- namespace, name, err := cache.SplitMetaNamespaceKey(key)
- if err != nil {
- return err
- }
-
- klog.V(4).Infof("Checking if Job %s/%s is ready for cleanup", namespace, name)
- // Ignore the Jobs that are already deleted or being deleted, or the ones that don't need clean up.
- job, err := gc.jobLister.Jobs(namespace).Get(name)
- if errors.IsNotFound(err) {
- return nil
- }
- if err != nil {
- return err
- }
-
- if expired, err := gc.processTTL(job); err != nil {
- return err
- } else if !expired {
- return nil
- }
-
- // The Job's TTL is assumed to have expired, but the Job TTL might be stale.
- // Before deleting the Job, do a final sanity check.
- // If TTL is modified before we do this check, we cannot be sure if the TTL truly expires.
- // The latest Job may have a different UID, but it's fine because the checks will be run again.
- fresh, err := gc.vcClient.BatchV1alpha1().Jobs(namespace).Get(context.TODO(), name, metav1.GetOptions{})
- if errors.IsNotFound(err) {
- return nil
- }
- if err != nil {
- return err
- }
- // Use the latest Job TTL to see if the TTL truly expires.
- if expired, err := gc.processTTL(fresh); err != nil {
- return err
- } else if !expired {
- return nil
- }
- // Cascade deletes the Jobs if TTL truly expires.
- policy := metav1.DeletePropagationForeground
- options := metav1.DeleteOptions{
- PropagationPolicy: &policy,
- Preconditions: &metav1.Preconditions{UID: &fresh.UID},
- }
- klog.V(4).Infof("Cleaning up Job %s/%s", namespace, name)
- return gc.vcClient.BatchV1alpha1().Jobs(fresh.Namespace).Delete(context.TODO(), fresh.Name, options)
-}
-
-// processTTL checks whether a given Job's TTL has expired; if it has not expired yet,
-// it re-queues the Job at the time the TTL is expected to expire.
-func (gc *gccontroller) processTTL(job *v1alpha1.Job) (expired bool, err error) {
- // We don't care about the Jobs that are going to be deleted, or the ones that don't need clean up.
- if job.DeletionTimestamp != nil || !needsCleanup(job) {
- return false, nil
- }
-
- now := time.Now()
- t, err := timeLeft(job, &now)
- if err != nil {
- return false, err
- }
-
- // TTL has expired
- if *t <= 0 {
- return true, nil
- }
-
- gc.enqueueAfter(job, *t)
- return false, nil
-}
-
-// needsCleanup checks whether a Job has finished and has a TTL set.
-func needsCleanup(j *v1alpha1.Job) bool {
- return j.Spec.TTLSecondsAfterFinished != nil && isJobFinished(j)
-}
-
-func isJobFinished(job *v1alpha1.Job) bool {
- return job.Status.State.Phase == v1alpha1.Completed ||
- job.Status.State.Phase == v1alpha1.Failed ||
- job.Status.State.Phase == v1alpha1.Terminated
-}
-
-func getFinishAndExpireTime(j *v1alpha1.Job) (*time.Time, *time.Time, error) {
- if !needsCleanup(j) {
- return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
- }
- finishAt, err := jobFinishTime(j)
- if err != nil {
- return nil, nil, err
- }
- finishAtUTC := finishAt.UTC()
- expireAtUTC := finishAtUTC.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
- return &finishAtUTC, &expireAtUTC, nil
-}
-
-func timeLeft(j *v1alpha1.Job, since *time.Time) (*time.Duration, error) {
- finishAt, expireAt, err := getFinishAndExpireTime(j)
- if err != nil {
- return nil, err
- }
- if finishAt.UTC().After(since.UTC()) {
- klog.Warningf("Warning: Found Job %s/%s finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", j.Namespace, j.Name)
- }
- remaining := expireAt.UTC().Sub(since.UTC())
- klog.V(4).Infof("Found Job %s/%s finished at %v, remaining TTL %v since %v, TTL will expire at %v", j.Namespace, j.Name, finishAt.UTC(), remaining, since.UTC(), expireAt.UTC())
- return &remaining, nil
-}
-
-// jobFinishTime takes an already finished Job and returns the time it finishes.
-func jobFinishTime(finishedJob *v1alpha1.Job) (metav1.Time, error) {
- if finishedJob.Status.State.LastTransitionTime.IsZero() {
- return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
- }
- return finishedJob.Status.State.LastTransitionTime, nil
-}
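-
-// Illustrative sketch, not part of the original file: how the TTL helpers above
-// behave for a finished Job. It reuses the imports of this file; the function
-// name and the example values are hypothetical.
-func exampleTimeLeft() {
-	ttl := int32(300) // keep the Job for 5 minutes after it finishes
-
-	job := &v1alpha1.Job{
-		Spec: v1alpha1.JobSpec{TTLSecondsAfterFinished: &ttl},
-		Status: v1alpha1.JobStatus{
-			State: v1alpha1.JobState{
-				Phase:              v1alpha1.Completed,
-				LastTransitionTime: metav1.NewTime(time.Now().Add(-2 * time.Minute)),
-			},
-		},
-	}
-
-	// The Job finished 2 minutes ago with a 5 minute TTL, so roughly 3 minutes
-	// remain before the garbage collector deletes it.
-	now := time.Now()
-	if remaining, err := timeLeft(job, &now); err == nil {
-		fmt.Printf("TTL expires in about %v\n", remaining.Round(time.Second))
-	}
-}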
-
-
-
/*
-Copyright 2017 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
- "hash"
- "hash/fnv"
- "time"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- coreinformers "k8s.io/client-go/informers/core/v1"
- kubeschedulinginformers "k8s.io/client-go/informers/scheduling/v1beta1"
- "k8s.io/client-go/kubernetes"
- corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
- corelisters "k8s.io/client-go/listers/core/v1"
- kubeschedulinglisters "k8s.io/client-go/listers/scheduling/v1beta1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/tools/record"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- vcscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
- businformer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
- schedulinginformers "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
- buslister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
- schedulinglisters "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobcache "volcano.sh/volcano/pkg/controllers/cache"
- "volcano.sh/volcano/pkg/controllers/framework"
- "volcano.sh/volcano/pkg/controllers/job/state"
-)
-
-func init() {
- framework.RegisterController(&jobcontroller{})
-}
-
-// jobcontroller is the controller for Volcano Jobs.
-type jobcontroller struct {
- kubeClient kubernetes.Interface
- vcClient vcclientset.Interface
-
- jobInformer batchinformer.JobInformer
- podInformer coreinformers.PodInformer
- pvcInformer coreinformers.PersistentVolumeClaimInformer
- pgInformer schedulinginformers.PodGroupInformer
- svcInformer coreinformers.ServiceInformer
- cmdInformer businformer.CommandInformer
- pcInformer kubeschedulinginformers.PriorityClassInformer
- queueInformer schedulinginformers.QueueInformer
-
- // A store of jobs
- jobLister batchlister.JobLister
- jobSynced func() bool
-
- // A store of pods
- podLister corelisters.PodLister
- podSynced func() bool
-
- pvcLister corelisters.PersistentVolumeClaimLister
- pvcSynced func() bool
-
- // A store of podgroups
- pgLister schedulinglisters.PodGroupLister
- pgSynced func() bool
-
- // A store of service
- svcLister corelisters.ServiceLister
- svcSynced func() bool
-
- cmdLister buslister.CommandLister
- cmdSynced func() bool
-
- pcLister kubeschedulinglisters.PriorityClassLister
- pcSynced func() bool
-
- queueLister schedulinglisters.QueueLister
- queueSynced func() bool
-
- // queue that need to sync up
- queueList []workqueue.RateLimitingInterface
- commandQueue workqueue.RateLimitingInterface
- cache jobcache.Cache
- // Job Event recorder
- recorder record.EventRecorder
-
- errTasks workqueue.RateLimitingInterface
- workers uint32
- maxRequeueNum int
-}
-
-func (cc *jobcontroller) Name() string {
- return "job-controller"
-}
-
-// Initialize creates the new Job controller.
-func (cc *jobcontroller) Initialize(opt *framework.ControllerOption) error {
- cc.kubeClient = opt.KubeClient
- cc.vcClient = opt.VolcanoClient
-
- sharedInformers := opt.SharedInformerFactory
- workers := opt.WorkerNum
- // Initialize event client
- eventBroadcaster := record.NewBroadcaster()
- eventBroadcaster.StartLogging(klog.Infof)
- eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: cc.kubeClient.CoreV1().Events("")})
- recorder := eventBroadcaster.NewRecorder(vcscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
-
- cc.queueList = make([]workqueue.RateLimitingInterface, workers)
- cc.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- cc.cache = jobcache.New()
- cc.errTasks = newRateLimitingQueue()
- cc.recorder = recorder
- cc.workers = workers
- cc.maxRequeueNum = opt.MaxRequeueNum
- if cc.maxRequeueNum < 0 {
- cc.maxRequeueNum = -1
- }
-
- var i uint32
- for i = 0; i < workers; i++ {
- cc.queueList[i] = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- }
-
- cc.jobInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Batch().V1alpha1().Jobs()
- cc.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: cc.addJob,
- UpdateFunc: cc.updateJob,
- DeleteFunc: cc.deleteJob,
- })
- cc.jobLister = cc.jobInformer.Lister()
- cc.jobSynced = cc.jobInformer.Informer().HasSynced
-
- cc.cmdInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Bus().V1alpha1().Commands()
- cc.cmdInformer.Informer().AddEventHandler(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *busv1alpha1.Command:
- if v.TargetObject != nil &&
- v.TargetObject.APIVersion == batchv1alpha1.SchemeGroupVersion.String() &&
- v.TargetObject.Kind == "Job" {
- return true
- }
-
- return false
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: cc.addCommand,
- },
- },
- )
- cc.cmdLister = cc.cmdInformer.Lister()
- cc.cmdSynced = cc.cmdInformer.Informer().HasSynced
-
- cc.podInformer = sharedInformers.Core().V1().Pods()
- cc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: cc.addPod,
- UpdateFunc: cc.updatePod,
- DeleteFunc: cc.deletePod,
- })
-
- cc.podLister = cc.podInformer.Lister()
- cc.podSynced = cc.podInformer.Informer().HasSynced
-
- cc.pvcInformer = sharedInformers.Core().V1().PersistentVolumeClaims()
- cc.pvcLister = cc.pvcInformer.Lister()
- cc.pvcSynced = cc.pvcInformer.Informer().HasSynced
-
- cc.svcInformer = sharedInformers.Core().V1().Services()
- cc.svcLister = cc.svcInformer.Lister()
- cc.svcSynced = cc.svcInformer.Informer().HasSynced
-
- cc.pgInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Scheduling().V1beta1().PodGroups()
- cc.pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- UpdateFunc: cc.updatePodGroup,
- })
- cc.pgLister = cc.pgInformer.Lister()
- cc.pgSynced = cc.pgInformer.Informer().HasSynced
-
- cc.pcInformer = sharedInformers.Scheduling().V1beta1().PriorityClasses()
- cc.pcLister = cc.pcInformer.Lister()
- cc.pcSynced = cc.pcInformer.Informer().HasSynced
-
- cc.queueInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Scheduling().V1beta1().Queues()
- cc.queueLister = cc.queueInformer.Lister()
- cc.queueSynced = cc.queueInformer.Informer().HasSynced
-
- // Register actions
- state.SyncJob = cc.syncJob
- state.KillJob = cc.killJob
-
- return nil
-}
-
-// Run starts the JobController.
-func (cc *jobcontroller) Run(stopCh <-chan struct{}) {
- go cc.jobInformer.Informer().Run(stopCh)
- go cc.podInformer.Informer().Run(stopCh)
- go cc.pvcInformer.Informer().Run(stopCh)
- go cc.pgInformer.Informer().Run(stopCh)
- go cc.svcInformer.Informer().Run(stopCh)
- go cc.cmdInformer.Informer().Run(stopCh)
- go cc.pcInformer.Informer().Run(stopCh)
- go cc.queueInformer.Informer().Run(stopCh)
-
- cache.WaitForCacheSync(stopCh, cc.jobSynced, cc.podSynced, cc.pgSynced,
- cc.svcSynced, cc.cmdSynced, cc.pvcSynced, cc.pcSynced, cc.queueSynced)
-
- go wait.Until(cc.handleCommands, 0, stopCh)
- var i uint32
- for i = 0; i < cc.workers; i++ {
- go func(num uint32) {
- wait.Until(
- func() {
- cc.worker(num)
- },
- time.Second,
- stopCh)
- }(i)
- }
-
- go cc.cache.Run(stopCh)
-
- // Re-sync error tasks.
- go wait.Until(cc.processResyncTask, 0, stopCh)
-
- klog.Infof("JobController is running ...... ")
-}
-
-func (cc *jobcontroller) worker(i uint32) {
- klog.Infof("worker %d start ...... ", i)
-
- for cc.processNextReq(i) {
- }
-}
-
-func (cc *jobcontroller) belongsToThisRoutine(key string, count uint32) bool {
- var hashVal hash.Hash32
- var val uint32
-
- hashVal = fnv.New32()
- hashVal.Write([]byte(key))
-
- val = hashVal.Sum32()
-
- return val%cc.workers == count
-}
-
-func (cc *jobcontroller) getWorkerQueue(key string) workqueue.RateLimitingInterface {
- var hashVal hash.Hash32
- var val uint32
-
- hashVal = fnv.New32()
- hashVal.Write([]byte(key))
-
- val = hashVal.Sum32()
-
- queue := cc.queueList[val%cc.workers]
-
- return queue
-}
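-
-// Illustrative sketch, not part of the original file: the FNV-32 hashing above
-// is deterministic, so every request for a given "namespace/name" key is always
-// handled by the same worker, which preserves per-job event ordering. The helper
-// below mirrors that mapping; its name is hypothetical.
-func workerIndexForKey(key string, workers uint32) uint32 {
-	h := fnv.New32()
-	h.Write([]byte(key))
-	return h.Sum32() % workers
-}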
-
-func (cc *jobcontroller) processNextReq(count uint32) bool {
- queue := cc.queueList[count]
- obj, shutdown := queue.Get()
- if shutdown {
- klog.Errorf("Fail to pop item from queue")
- return false
- }
-
- req := obj.(apis.Request)
- defer queue.Done(req)
-
- key := jobcache.JobKeyByReq(&req)
- if !cc.belongsToThisRoutine(key, count) {
-		klog.Errorf("unexpected: job with key %s does not belong to this routine, worker: %d", key, count)
- queueLocal := cc.getWorkerQueue(key)
- queueLocal.Add(req)
- return true
- }
-
- klog.V(3).Infof("Try to handle request <%v>", req)
-
- jobInfo, err := cc.cache.Get(key)
- if err != nil {
- // TODO(k82cn): ignore not-ready error.
- klog.Errorf("Failed to get job by <%v> from cache: %v", req, err)
- return true
- }
-
- st := state.NewState(jobInfo)
- if st == nil {
- klog.Errorf("Invalid state <%s> of Job <%v/%v>",
- jobInfo.Job.Status.State, jobInfo.Job.Namespace, jobInfo.Job.Name)
- return true
- }
-
- action := applyPolicies(jobInfo.Job, &req)
- klog.V(3).Infof("Execute <%v> on Job <%s/%s> in <%s> by <%T>.",
- action, req.Namespace, req.JobName, jobInfo.Job.Status.State.Phase, st)
-
- if action != busv1alpha1.SyncJobAction {
- cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
- "Start to execute action %s ", action))
- }
-
- if err := st.Execute(action); err != nil {
- if cc.maxRequeueNum == -1 || queue.NumRequeues(req) < cc.maxRequeueNum {
- klog.V(2).Infof("Failed to handle Job <%s/%s>: %v",
- jobInfo.Job.Namespace, jobInfo.Job.Name, err)
- // If any error, requeue it.
- queue.AddRateLimited(req)
- return true
- }
- cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
-			"Job failed on action %s because the retry limit was reached", action))
- klog.Warningf("Terminating Job <%s/%s> and releasing resources", jobInfo.Job.Namespace, jobInfo.Job.Name)
- if err = st.Execute(busv1alpha1.TerminateJobAction); err != nil {
- klog.Errorf("Failed to terminate Job<%s/%s>: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
- }
-		klog.Warningf("Dropping job <%s/%s> out of the queue because the max retry limit has been reached: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
- }
-
- // If no error, forget it.
- queue.Forget(req)
-
- return true
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "reflect"
- "sort"
- "sync"
- "sync/atomic"
- "time"
-
- v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/equality"
-	apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- "volcano.sh/volcano/pkg/controllers/job/state"
-)
-
-var calMutex sync.Mutex
-
-func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseMap, updateStatus state.UpdateStatusFn) error {
- job := jobInfo.Job
- klog.V(3).Infof("Killing Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version)
- defer klog.V(3).Infof("Finished Job <%s/%s> killing, current version %d", job.Namespace, job.Name, job.Status.Version)
-
- if job.DeletionTimestamp != nil {
- klog.Infof("Job <%s/%s> is terminating, skip management process.",
- job.Namespace, job.Name)
- return nil
- }
-
- var pending, running, terminating, succeeded, failed, unknown int32
- taskStatusCount := make(map[string]batch.TaskState)
-
- var errs []error
- var total int
-
- for _, pods := range jobInfo.Pods {
- for _, pod := range pods {
- total++
-
- if pod.DeletionTimestamp != nil {
- klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
- terminating++
- continue
- }
-
- maxRetry := job.Spec.MaxRetry
- lastRetry := false
- if job.Status.RetryCount > maxRetry-1 {
- lastRetry = true
- }
-
- _, retain := podRetainPhase[pod.Status.Phase]
-
- if !retain && !lastRetry {
- err := cc.deleteJobPod(job.Name, pod)
- if err == nil {
- terminating++
- continue
- }
- // record the err, and then collect the pod info like retained pod
- errs = append(errs, err)
- cc.resyncTask(pod)
- }
-
- classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
- calcPodStatus(pod, taskStatusCount)
- }
- }
-
- if len(errs) != 0 {
- klog.Errorf("failed to kill pods for job %s/%s, with err %+v", job.Namespace, job.Name, errs)
- cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
- fmt.Sprintf("Error deleting pods: %+v", errs))
- return fmt.Errorf("failed to kill %d pods of %d", len(errs), total)
- }
-
- job = job.DeepCopy()
- // Job version is bumped only when job is killed
- job.Status.Version++
- job.Status.Pending = pending
- job.Status.Running = running
- job.Status.Succeeded = succeeded
- job.Status.Failed = failed
- job.Status.Terminating = terminating
- job.Status.Unknown = unknown
- job.Status.TaskStatusCount = taskStatusCount
-
- // Update running duration
- klog.V(3).Infof("Running duration is %s", metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}.ToUnstructured())
- job.Status.RunningDuration = &metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}
-
- if updateStatus != nil {
- if updateStatus(&job.Status) {
- job.Status.State.LastTransitionTime = metav1.Now()
- }
- }
-
- // must be called before update job status
- if err := cc.pluginOnJobDelete(job); err != nil {
- return err
- }
-
- // Update Job status
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- if e := cc.cache.Update(newJob); e != nil {
- klog.Errorf("KillJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, e)
- return e
- }
-
- // Delete PodGroup
- if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(context.TODO(), job.Name, metav1.DeleteOptions{}); err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- }
-
- // NOTE(k82cn): DO NOT delete input/output until job is deleted.
-
- return nil
-}
-
-func (cc *jobcontroller) initiateJob(job *batch.Job) (*batch.Job, error) {
- klog.V(3).Infof("Starting to initiate Job <%s/%s>", job.Namespace, job.Name)
- jobInstance, err := cc.initJobStatus(job)
- if err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.JobStatusError),
- fmt.Sprintf("Failed to initialize job status, err: %v", err))
- return nil, err
- }
-
- if err := cc.pluginOnJobAdd(jobInstance); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
- fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
- return nil, err
- }
-
- newJob, err := cc.createJobIOIfNotExist(jobInstance)
- if err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PVCError),
- fmt.Sprintf("Failed to create PVC, err: %v", err))
- return nil, err
- }
-
- if err := cc.createOrUpdatePodGroup(newJob); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
- fmt.Sprintf("Failed to create PodGroup, err: %v", err))
- return nil, err
- }
-
- return newJob, nil
-}
-
-func (cc *jobcontroller) initOnJobUpdate(job *batch.Job) error {
- klog.V(3).Infof("Starting to initiate Job <%s/%s> on update", job.Namespace, job.Name)
-
- if err := cc.pluginOnJobUpdate(job); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
- fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
- return err
- }
-
- if err := cc.createOrUpdatePodGroup(job); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
- fmt.Sprintf("Failed to create PodGroup, err: %v", err))
- return err
- }
-
- return nil
-}
-
-func (cc *jobcontroller) GetQueueInfo(queue string) (*scheduling.Queue, error) {
- queueInfo, err := cc.queueLister.Get(queue)
- if err != nil {
- klog.Errorf("Failed to get queue from queueLister, error: %s", err.Error())
- }
-
- return queueInfo, err
-}
-
-func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateStatusFn) error {
- job := jobInfo.Job
- klog.V(3).Infof("Starting to sync up Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version)
- defer klog.V(3).Infof("Finished Job <%s/%s> sync up, current version %d", job.Namespace, job.Name, job.Status.Version)
-
- if jobInfo.Job.DeletionTimestamp != nil {
- klog.Infof("Job <%s/%s> is terminating, skip management process.",
- jobInfo.Job.Namespace, jobInfo.Job.Name)
- return nil
- }
-
- // deep copy job to prevent mutate it
- job = job.DeepCopy()
-
- // Find queue that job belongs to, and check if the queue has forwarding metadata
- queueInfo, err := cc.GetQueueInfo(job.Spec.Queue)
- if err != nil {
- return err
- }
-
- var jobForwarding bool
- if len(queueInfo.Spec.ExtendClusters) != 0 {
- jobForwarding = true
- if len(job.Annotations) == 0 {
- job.Annotations = make(map[string]string)
- }
- job.Annotations[batch.JobForwardingKey] = "true"
- job, err = cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error())
- return err
- }
- }
-
- // Skip job initiation if job is already initiated
- if !isInitiated(job) {
- if job, err = cc.initiateJob(job); err != nil {
- return err
- }
- } else {
- // TODO: optimize this call it only when scale up/down
- if err = cc.initOnJobUpdate(job); err != nil {
- return err
- }
- }
-
- if len(queueInfo.Spec.ExtendClusters) != 0 {
- jobForwarding = true
- job.Annotations[batch.JobForwardingKey] = "true"
- _, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error())
- return err
- }
- }
-
- var syncTask bool
- if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(job.Name); pg != nil {
- if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
- syncTask = true
- }
-
- for _, condition := range pg.Status.Conditions {
- if condition.Type == scheduling.PodGroupUnschedulableType {
- cc.recorder.Eventf(job, v1.EventTypeWarning, string(batch.PodGroupPending),
-					fmt.Sprintf("PodGroup %s:%s unschedulable, reason: %s", job.Namespace, job.Name, condition.Message))
- }
- }
- }
-
- if !syncTask {
- if updateStatus != nil {
- if updateStatus(&job.Status) {
- job.Status.State.LastTransitionTime = metav1.Now()
- }
- }
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- if e := cc.cache.Update(newJob); e != nil {
- klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, e)
- return e
- }
- return nil
- }
-
- var running, pending, terminating, succeeded, failed, unknown int32
- taskStatusCount := make(map[string]batch.TaskState)
-
- var podToCreate []*v1.Pod
- var podToDelete []*v1.Pod
- var creationErrs []error
- var deletionErrs []error
- appendMutex := sync.Mutex{}
-
- appendError := func(container *[]error, err error) {
- appendMutex.Lock()
- defer appendMutex.Unlock()
- *container = append(*container, err)
- }
-
- for _, ts := range job.Spec.Tasks {
- ts.Template.Name = ts.Name
- tc := ts.Template.DeepCopy()
- name := ts.Template.Name
-
- pods, found := jobInfo.Pods[name]
- if !found {
- pods = map[string]*v1.Pod{}
- }
-
- for i := 0; i < int(ts.Replicas); i++ {
- podName := fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, name, i)
- if pod, found := pods[podName]; !found {
- newPod := createJobPod(job, tc, ts.TopologyPolicy, i, jobForwarding)
- if err := cc.pluginOnPodCreate(job, newPod); err != nil {
- return err
- }
- podToCreate = append(podToCreate, newPod)
- } else {
- delete(pods, podName)
- if pod.DeletionTimestamp != nil {
- klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
- atomic.AddInt32(&terminating, 1)
- continue
- }
-
- classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
- calcPodStatus(pod, taskStatusCount)
- }
- }
-
- for _, pod := range pods {
- podToDelete = append(podToDelete, pod)
- }
- }
-
- waitCreationGroup := sync.WaitGroup{}
- waitCreationGroup.Add(len(podToCreate))
- for _, pod := range podToCreate {
- go func(pod *v1.Pod) {
- defer waitCreationGroup.Done()
- newPod, err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{})
- if err != nil && !apierrors.IsAlreadyExists(err) {
-				// Failed to create the Pod; record the error so the whole batch is retried.
-				// This is to ensure all pods under the same Job are created,
-				// so gang scheduling can schedule the Job successfully.
- klog.Errorf("Failed to create pod %s for Job %s, err %#v",
- pod.Name, job.Name, err)
- appendError(&creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err))
- } else {
- classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown)
- calcPodStatus(pod, taskStatusCount)
- klog.V(3).Infof("Created Task <%s> of Job <%s/%s>",
- pod.Name, job.Namespace, job.Name)
- }
- }(pod)
- }
- waitCreationGroup.Wait()
-
- if len(creationErrs) != 0 {
- cc.recorder.Event(job, v1.EventTypeWarning, FailedCreatePodReason,
- fmt.Sprintf("Error creating pods: %+v", creationErrs))
- return fmt.Errorf("failed to create %d pods of %d", len(creationErrs), len(podToCreate))
- }
-
- // Delete pods when scale down.
- waitDeletionGroup := sync.WaitGroup{}
- waitDeletionGroup.Add(len(podToDelete))
- for _, pod := range podToDelete {
- go func(pod *v1.Pod) {
- defer waitDeletionGroup.Done()
- err := cc.deleteJobPod(job.Name, pod)
- if err != nil {
-				// Failed to delete the Pod; record the error and resync the task
-				// so that the deletion is retried later.
- klog.Errorf("Failed to delete pod %s for Job %s, err %#v",
- pod.Name, job.Name, err)
- appendError(&deletionErrs, err)
- cc.resyncTask(pod)
- } else {
- klog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>",
- pod.Name, job.Namespace, job.Name)
- atomic.AddInt32(&terminating, 1)
- }
- }(pod)
- }
- waitDeletionGroup.Wait()
-
- if len(deletionErrs) != 0 {
- cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
- fmt.Sprintf("Error deleting pods: %+v", deletionErrs))
- return fmt.Errorf("failed to delete %d pods of %d", len(deletionErrs), len(podToDelete))
- }
- job.Status = batch.JobStatus{
- State: job.Status.State,
-
- Pending: pending,
- Running: running,
- Succeeded: succeeded,
- Failed: failed,
- Terminating: terminating,
- Unknown: unknown,
- Version: job.Status.Version,
- MinAvailable: job.Spec.MinAvailable,
- TaskStatusCount: taskStatusCount,
- ControlledResources: job.Status.ControlledResources,
- RetryCount: job.Status.RetryCount,
- }
-
- if updateStatus != nil {
- if updateStatus(&job.Status) {
- job.Status.State.LastTransitionTime = metav1.Now()
- }
- }
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- if e := cc.cache.Update(newJob); e != nil {
- klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, e)
- return e
- }
-
- return nil
-}
-
-func (cc *jobcontroller) createJobIOIfNotExist(job *batch.Job) (*batch.Job, error) {
- // If PVC does not exist, create them for Job.
- var needUpdate bool
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- for index, volume := range job.Spec.Volumes {
- vcName := volume.VolumeClaimName
- if len(vcName) == 0 {
- // NOTE(k82cn): Ensure never have duplicated generated names.
- for {
- vcName = jobhelpers.GenPVCName(job.Name)
- exist, err := cc.checkPVCExist(job, vcName)
- if err != nil {
- return job, err
- }
- if exist {
- continue
- }
- job.Spec.Volumes[index].VolumeClaimName = vcName
- needUpdate = true
- break
- }
- // TODO: check VolumeClaim must be set if VolumeClaimName is empty
- if volume.VolumeClaim != nil {
- if err := cc.createPVC(job, vcName, volume.VolumeClaim); err != nil {
- return job, err
- }
- }
- } else {
- exist, err := cc.checkPVCExist(job, vcName)
- if err != nil {
- return job, err
- }
- if !exist {
- return job, fmt.Errorf("pvc %s is not found, the job will be in the Pending state until the PVC is created", vcName)
- }
- }
- job.Status.ControlledResources["volume-pvc-"+vcName] = vcName
- }
- if needUpdate {
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update Job %v/%v for volume claim name: %v ",
- job.Namespace, job.Name, err)
- return job, err
- }
-
- newJob.Status = job.Status
- return newJob, err
- }
- return job, nil
-}
-
-func (cc *jobcontroller) checkPVCExist(job *batch.Job, pvc string) (bool, error) {
- if _, err := cc.pvcLister.PersistentVolumeClaims(job.Namespace).Get(pvc); err != nil {
- if apierrors.IsNotFound(err) {
- return false, nil
- }
- klog.V(3).Infof("Failed to get PVC %s for job <%s/%s>: %v",
- pvc, job.Namespace, job.Name, err)
- return false, err
- }
- return true, nil
-}
-
-func (cc *jobcontroller) createPVC(job *batch.Job, vcName string, volumeClaim *v1.PersistentVolumeClaimSpec) error {
- pvc := &v1.PersistentVolumeClaim{
- ObjectMeta: metav1.ObjectMeta{
- Namespace: job.Namespace,
- Name: vcName,
- OwnerReferences: []metav1.OwnerReference{
- *metav1.NewControllerRef(job, helpers.JobKind),
- },
- },
- Spec: *volumeClaim,
- }
-
- klog.V(3).Infof("Try to create PVC: %v", pvc)
-
- if _, e := cc.kubeClient.CoreV1().PersistentVolumeClaims(job.Namespace).Create(context.TODO(), pvc, metav1.CreateOptions{}); e != nil {
- klog.V(3).Infof("Failed to create PVC for Job <%s/%s>: %v",
- job.Namespace, job.Name, e)
- return e
- }
- return nil
-}
-
-func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error {
- // If PodGroup does not exist, create one for Job.
- pg, err := cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
- if err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- return err
- }
-
- minTaskMember := map[string]int32{}
- for _, task := range job.Spec.Tasks {
- if task.MinAvailable != nil {
- minTaskMember[task.Name] = *task.MinAvailable
- } else {
- minTaskMember[task.Name] = task.Replicas
- }
- }
-
- pg := &scheduling.PodGroup{
- ObjectMeta: metav1.ObjectMeta{
- Namespace: job.Namespace,
- Name: job.Name,
- Annotations: job.Annotations,
- Labels: job.Labels,
- OwnerReferences: []metav1.OwnerReference{
- *metav1.NewControllerRef(job, helpers.JobKind),
- },
- },
- Spec: scheduling.PodGroupSpec{
- MinMember: job.Spec.MinAvailable,
- MinTaskMember: minTaskMember,
- Queue: job.Spec.Queue,
- MinResources: cc.calcPGMinResources(job),
- PriorityClassName: job.Spec.PriorityClassName,
- },
- }
-
- if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Create(context.TODO(), pg, metav1.CreateOptions{}); err != nil {
- if !apierrors.IsAlreadyExists(err) {
- klog.Errorf("Failed to create PodGroup for Job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- return err
- }
- }
- return nil
- }
-
- pgShouldUpdate := false
- if pg.Spec.PriorityClassName != job.Spec.PriorityClassName {
- pg.Spec.PriorityClassName = job.Spec.PriorityClassName
- pgShouldUpdate = true
- }
-
- minResources := cc.calcPGMinResources(job)
- if pg.Spec.MinMember != job.Spec.MinAvailable || !equality.Semantic.DeepEqual(pg.Spec.MinResources, minResources) {
- pg.Spec.MinMember = job.Spec.MinAvailable
- pg.Spec.MinResources = minResources
- pgShouldUpdate = true
- }
-
- if pg.Spec.MinTaskMember == nil {
- pgShouldUpdate = true
- pg.Spec.MinTaskMember = make(map[string]int32)
- }
-
- for _, task := range job.Spec.Tasks {
- if task.MinAvailable == nil {
- continue
- }
-
- if taskMember, ok := pg.Spec.MinTaskMember[task.Name]; !ok {
- pgShouldUpdate = true
- pg.Spec.MinTaskMember[task.Name] = *task.MinAvailable
- } else {
- if taskMember == *task.MinAvailable {
- continue
- }
-
- pgShouldUpdate = true
- pg.Spec.MinTaskMember[task.Name] = *task.MinAvailable
- }
- }
-
- if !pgShouldUpdate {
- return nil
- }
-
- _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Update(context.TODO(), pg, metav1.UpdateOptions{})
- if err != nil {
- klog.V(3).Infof("Failed to update PodGroup for Job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- }
- return err
-}
-
-func (cc *jobcontroller) deleteJobPod(jobName string, pod *v1.Pod) error {
- err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{})
- if err != nil && !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to delete pod %s/%s for Job %s, err %#v",
- pod.Namespace, pod.Name, jobName, err)
-
- return fmt.Errorf("failed to delete pod %s, err %#v", pod.Name, err)
- }
-
- return nil
-}
-
-func (cc *jobcontroller) calcPGMinResources(job *batch.Job) *v1.ResourceList {
- // sort task by priorityClasses
- var tasksPriority TasksPriority
- for _, task := range job.Spec.Tasks {
- tp := TaskPriority{0, task}
- pc := task.Template.Spec.PriorityClassName
-
- priorityClass, err := cc.pcLister.Get(pc)
- if err != nil || priorityClass == nil {
- klog.Warningf("Ignore task %s priority class %s: %v", task.Name, pc, err)
- } else {
- tp.priority = priorityClass.Value
- }
-
- tasksPriority = append(tasksPriority, tp)
- }
-
- sort.Sort(tasksPriority)
-
- minAvailableTasksRes := v1.ResourceList{}
- podCnt := int32(0)
- for _, task := range tasksPriority {
- for i := int32(0); i < task.Replicas; i++ {
- if podCnt >= job.Spec.MinAvailable {
- break
- }
- podCnt++
- for _, c := range task.Template.Spec.Containers {
- addResourceList(minAvailableTasksRes, c.Resources.Requests, c.Resources.Limits)
- }
- }
- }
-
- return &minAvailableTasksRes
-}
-
-func (cc *jobcontroller) initJobStatus(job *batch.Job) (*batch.Job, error) {
- if job.Status.State.Phase != "" {
- return job, nil
- }
-
-	job.Status.State.Phase = batch.Pending
-	job.Status.State.LastTransitionTime = metav1.Now()
- job.Status.MinAvailable = job.Spec.MinAvailable
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return nil, err
- }
- if err := cc.cache.Update(newJob); err != nil {
- klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, err)
- return nil, err
- }
-
- return newJob, nil
-}
-
-func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) {
- switch pod.Status.Phase {
- case v1.PodPending:
- atomic.AddInt32(pending, 1)
- case v1.PodRunning:
- atomic.AddInt32(running, 1)
- case v1.PodSucceeded:
- atomic.AddInt32(succeeded, 1)
- case v1.PodFailed:
- atomic.AddInt32(failed, 1)
- default:
- atomic.AddInt32(unknown, 1)
- }
-}
-
-func calcPodStatus(pod *v1.Pod, taskStatusCount map[string]batch.TaskState) {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return
- }
-
- calMutex.Lock()
- defer calMutex.Unlock()
- if _, ok := taskStatusCount[taskName]; !ok {
- taskStatusCount[taskName] = batch.TaskState{
- Phase: make(map[v1.PodPhase]int32),
- }
- }
-
- switch pod.Status.Phase {
- case v1.PodPending:
- taskStatusCount[taskName].Phase[v1.PodPending]++
- case v1.PodRunning:
- taskStatusCount[taskName].Phase[v1.PodRunning]++
- case v1.PodSucceeded:
- taskStatusCount[taskName].Phase[v1.PodSucceeded]++
- case v1.PodFailed:
- taskStatusCount[taskName].Phase[v1.PodFailed]++
- default:
- taskStatusCount[taskName].Phase[v1.PodUnknown]++
- }
-}
-
-func isInitiated(job *batch.Job) bool {
- if job.Status.State.Phase == "" || job.Status.State.Phase == batch.Pending {
- return false
- }
-
- return true
-}
-
-
-
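The helpers above build a PodGroup's MinResources by walking tasks in priority order and summing container requests only until MinAvailable replicas are covered. A minimal, self-contained sketch of that accumulation (the Task type and the sample numbers are made up; only the core resource types are real):

```go
// Sketch only: how min resources can be accumulated from priority-sorted tasks,
// mirroring the calcPGMinResources pattern above.
package main

import (
	"fmt"
	"sort"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// Task is a simplified stand-in for a job task spec.
type Task struct {
	Name     string
	Priority int32
	Replicas int32
	Requests v1.ResourceList
}

// add accumulates quantities from src into dst.
func add(dst, src v1.ResourceList) {
	for name, q := range src {
		if cur, ok := dst[name]; ok {
			cur.Add(q)
			dst[name] = cur
		} else {
			dst[name] = q.DeepCopy()
		}
	}
}

// minResources sums per-replica requests, highest-priority tasks first,
// until minAvailable replicas have been counted.
func minResources(tasks []Task, minAvailable int32) v1.ResourceList {
	sort.Slice(tasks, func(i, j int) bool { return tasks[i].Priority > tasks[j].Priority })

	total := v1.ResourceList{}
	counted := int32(0)
	for _, t := range tasks {
		for r := int32(0); r < t.Replicas && counted < minAvailable; r++ {
			add(total, t.Requests)
			counted++
		}
	}
	return total
}

func main() {
	tasks := []Task{
		{Name: "worker", Priority: 0, Replicas: 3, Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m")}},
		{Name: "master", Priority: 10, Replicas: 1, Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}},
	}
	res := minResources(tasks, 2) // master (1 CPU) + one worker (500m)
	cpu := res[v1.ResourceCPU]
	fmt.Println("min cpu:", cpu.String())
}
```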
/*
-Copyright 2017 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
-	"context"
-	"fmt"
-	"strconv"
-
-	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/equality"
-	apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- bus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobcache "volcano.sh/volcano/pkg/controllers/cache"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
-)
-
-func (cc *jobcontroller) addCommand(obj interface{}) {
- cmd, ok := obj.(*bus.Command)
- if !ok {
- klog.Errorf("obj is not Command")
- return
- }
-
- cc.commandQueue.Add(cmd)
-}
-
-func (cc *jobcontroller) addJob(obj interface{}) {
- job, ok := obj.(*batch.Job)
- if !ok {
- klog.Errorf("obj is not Job")
- return
- }
-
- req := apis.Request{
- Namespace: job.Namespace,
- JobName: job.Name,
-
- Event: bus.OutOfSyncEvent,
- }
-
-	// TODO(k82cn): if adding the job fails, the cache should be refreshed
- if err := cc.cache.Add(job); err != nil {
- klog.Errorf("Failed to add job <%s/%s>: %v in cache",
- job.Namespace, job.Name, err)
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) updateJob(oldObj, newObj interface{}) {
- newJob, ok := newObj.(*batch.Job)
- if !ok {
- klog.Errorf("newObj is not Job")
- return
- }
-
- oldJob, ok := oldObj.(*batch.Job)
- if !ok {
- klog.Errorf("oldJob is not Job")
- return
- }
-
- // No need to update if ResourceVersion is not changed
- if newJob.ResourceVersion == oldJob.ResourceVersion {
- klog.V(6).Infof("No need to update because job is not modified.")
- return
- }
-
- if err := cc.cache.Update(newJob); err != nil {
- klog.Errorf("UpdateJob - Failed to update job <%s/%s>: %v in cache",
- newJob.Namespace, newJob.Name, err)
- }
-
-	// NOTE: Since we only reconcile the job based on its Spec, other attributes are ignored.
-	// The Job status is used internally and is always updated by this controller.
- if equality.Semantic.DeepEqual(newJob.Spec, oldJob.Spec) && newJob.Status.State.Phase == oldJob.Status.State.Phase {
- klog.V(6).Infof("Job update event is ignored since no update in 'Spec'.")
- return
- }
-
- req := apis.Request{
- Namespace: newJob.Namespace,
- JobName: newJob.Name,
- Event: bus.OutOfSyncEvent,
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) deleteJob(obj interface{}) {
- job, ok := obj.(*batch.Job)
- if !ok {
- // If we reached here it means the Job was deleted but its final state is unrecorded.
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v", obj)
- return
- }
- job, ok = tombstone.Obj.(*batch.Job)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a volcano Job: %#v", obj)
- return
- }
- }
-
- if err := cc.cache.Delete(job); err != nil {
- klog.Errorf("Failed to delete job <%s/%s>: %v in cache",
- job.Namespace, job.Name, err)
- }
-}
-
-func (cc *jobcontroller) addPod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", obj)
- return
- }
- // Filter out pods that are not created from volcano job
- if !isControlledBy(pod, helpers.JobKind) {
- return
- }
-
- jobName, found := pod.Annotations[batch.JobNameKey]
- if !found {
- klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- version, found := pod.Annotations[batch.JobVersion]
- if !found {
- klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- dVersion, err := strconv.Atoi(version)
- if err != nil {
- klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- if pod.DeletionTimestamp != nil {
- cc.deletePod(pod)
- return
- }
-
- req := apis.Request{
- Namespace: pod.Namespace,
- JobName: jobName,
-
- Event: bus.OutOfSyncEvent,
- JobVersion: int32(dVersion),
- }
-
- if err := cc.cache.AddPod(pod); err != nil {
- klog.Errorf("Failed to add Pod <%s/%s>: %v to cache",
- pod.Namespace, pod.Name, err)
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) updatePod(oldObj, newObj interface{}) {
- oldPod, ok := oldObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", oldObj)
- return
- }
-
- newPod, ok := newObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", newObj)
- return
- }
-
- // Filter out pods that are not created from volcano job
- if !isControlledBy(newPod, helpers.JobKind) {
- return
- }
-
- if newPod.ResourceVersion == oldPod.ResourceVersion {
- return
- }
-
- if newPod.DeletionTimestamp != nil {
- cc.deletePod(newObj)
- return
- }
-
- taskName, found := newPod.Annotations[batch.TaskSpecKey]
- if !found {
- klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- jobName, found := newPod.Annotations[batch.JobNameKey]
- if !found {
- klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- version, found := newPod.Annotations[batch.JobVersion]
- if !found {
- klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- dVersion, err := strconv.Atoi(version)
- if err != nil {
- klog.Infof("Failed to convert jobVersion of Pod into number <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- if err := cc.cache.UpdatePod(newPod); err != nil {
- klog.Errorf("Failed to update Pod <%s/%s>: %v in cache",
- newPod.Namespace, newPod.Name, err)
- }
-
- event := bus.OutOfSyncEvent
- var exitCode int32
-
- switch newPod.Status.Phase {
- case v1.PodFailed:
- if oldPod.Status.Phase != v1.PodFailed {
- event = bus.PodFailedEvent
- // TODO: currently only one container pod is supported by volcano
- // Once multi containers pod is supported, update accordingly.
- if len(newPod.Status.ContainerStatuses) > 0 && newPod.Status.ContainerStatuses[0].State.Terminated != nil {
- exitCode = newPod.Status.ContainerStatuses[0].State.Terminated.ExitCode
- }
- }
- case v1.PodSucceeded:
- if oldPod.Status.Phase != v1.PodSucceeded &&
- cc.cache.TaskCompleted(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
- event = bus.TaskCompletedEvent
- }
- case v1.PodPending, v1.PodRunning:
- if cc.cache.TaskFailed(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
- event = bus.TaskFailedEvent
- }
- }
-
- req := apis.Request{
- Namespace: newPod.Namespace,
- JobName: jobName,
- TaskName: taskName,
-
- Event: event,
- ExitCode: exitCode,
- JobVersion: int32(dVersion),
- }
-
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) deletePod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- // If we reached here it means the pod was deleted but its final state is unrecorded.
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v", obj)
- return
- }
- pod, ok = tombstone.Obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a Pod: %#v", obj)
- return
- }
- }
-
- // Filter out pods that are not created from volcano job
- if !isControlledBy(pod, helpers.JobKind) {
- return
- }
-
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- jobName, found := pod.Annotations[batch.JobNameKey]
- if !found {
- klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- version, found := pod.Annotations[batch.JobVersion]
- if !found {
- klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- dVersion, err := strconv.Atoi(version)
- if err != nil {
- klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- req := apis.Request{
- Namespace: pod.Namespace,
- JobName: jobName,
- TaskName: taskName,
-
- Event: bus.PodEvictedEvent,
- JobVersion: int32(dVersion),
- }
-
- if err := cc.cache.DeletePod(pod); err != nil {
- klog.Errorf("Failed to delete Pod <%s/%s>: %v in cache",
- pod.Namespace, pod.Name, err)
- }
-
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) recordJobEvent(namespace, name string, event batch.JobEvent, message string) {
- job, err := cc.cache.Get(jobcache.JobKeyByName(namespace, name))
- if err != nil {
- klog.Warningf("Failed to find job in cache when reporting job event <%s/%s>: %v",
- namespace, name, err)
- return
- }
- cc.recorder.Event(job.Job, v1.EventTypeNormal, string(event), message)
-}
-
-func (cc *jobcontroller) handleCommands() {
- for cc.processNextCommand() {
- }
-}
-
-func (cc *jobcontroller) processNextCommand() bool {
- obj, shutdown := cc.commandQueue.Get()
- if shutdown {
- return false
- }
- cmd := obj.(*bus.Command)
- defer cc.commandQueue.Done(cmd)
-
- if err := cc.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{}); err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to delete Command <%s/%s>.", cmd.Namespace, cmd.Name)
- cc.commandQueue.AddRateLimited(cmd)
- }
- return true
- }
- cc.recordJobEvent(cmd.Namespace, cmd.TargetObject.Name,
- batch.CommandIssued,
- fmt.Sprintf(
- "Start to execute command %s, and clean it up to make sure executed not more than once.", cmd.Action))
- req := apis.Request{
- Namespace: cmd.Namespace,
- JobName: cmd.TargetObject.Name,
- Event: bus.CommandIssuedEvent,
- Action: bus.Action(cmd.Action),
- }
-
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-
- return true
-}
-
-func (cc *jobcontroller) updatePodGroup(oldObj, newObj interface{}) {
- oldPG, ok := oldObj.(*scheduling.PodGroup)
- if !ok {
- klog.Errorf("Failed to convert %v to PodGroup", newObj)
- return
- }
-
- newPG, ok := newObj.(*scheduling.PodGroup)
- if !ok {
- klog.Errorf("Failed to convert %v to PodGroup", newObj)
- return
- }
-
- _, err := cc.cache.Get(jobcache.JobKeyByName(newPG.Namespace, newPG.Name))
- if err != nil && newPG.Annotations != nil {
- klog.Warningf(
- "Failed to find job in cache by PodGroup, this may not be a PodGroup for volcano job.")
- }
-
- if newPG.Status.Phase != oldPG.Status.Phase {
- req := apis.Request{
- Namespace: newPG.Namespace,
- JobName: newPG.Name,
- }
- switch newPG.Status.Phase {
- case scheduling.PodGroupUnknown:
- req.Event = bus.JobUnknownEvent
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
- }
-}
-
-// TODO(k82cn): add handler for PodGroup unschedulable event.
-
-
-
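The handlers above all follow the same shape: decode the volcano annotations on the object, build a request, and push it onto a per-job worker queue. A small sketch of the decoding step, assuming simplified annotation keys and a cut-down request struct rather than the actual Volcano types:

```go
// Sketch only: turning a Pod's annotations into a work-queue request,
// as the add/update/delete pod handlers above do.
package main

import (
	"fmt"
	"strconv"
)

const (
	jobNameKey    = "volcano.sh/job-name"    // assumed key, for illustration
	jobVersionKey = "volcano.sh/job-version" // assumed key, for illustration
	taskSpecKey   = "volcano.sh/task-spec"   // assumed key, for illustration
)

// request mirrors the fields the controller cares about when enqueuing work.
type request struct {
	Namespace  string
	JobName    string
	TaskName   string
	JobVersion int32
}

// buildRequest extracts the job name, task name and job version from pod
// annotations, returning an error when any required key is missing or malformed.
func buildRequest(namespace string, annotations map[string]string) (request, error) {
	jobName, ok := annotations[jobNameKey]
	if !ok {
		return request{}, fmt.Errorf("missing %s", jobNameKey)
	}
	taskName, ok := annotations[taskSpecKey]
	if !ok {
		return request{}, fmt.Errorf("missing %s", taskSpecKey)
	}
	raw, ok := annotations[jobVersionKey]
	if !ok {
		return request{}, fmt.Errorf("missing %s", jobVersionKey)
	}
	version, err := strconv.Atoi(raw)
	if err != nil {
		return request{}, fmt.Errorf("bad %s %q: %v", jobVersionKey, raw, err)
	}
	return request{Namespace: namespace, JobName: jobName, TaskName: taskName, JobVersion: int32(version)}, nil
}

func main() {
	req, err := buildRequest("default", map[string]string{
		jobNameKey:    "demo-job",
		taskSpecKey:   "worker",
		jobVersionKey: "3",
	})
	fmt.Println(req, err)
}
```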
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/job/plugins"
- pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
-)
-
-func (cc *jobcontroller) pluginOnPodCreate(job *batch.Job, pod *v1.Pod) error {
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnPodCreate>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnPodCreate(pod, job); err != nil {
- klog.Errorf("Failed to process on pod create plugin %s, err %v.", name, err)
- return err
- }
- }
- return nil
-}
-
-func (cc *jobcontroller) pluginOnJobAdd(job *batch.Job) error {
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnJobAdd>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnJobAdd(job); err != nil {
- klog.Errorf("Failed to process on job add plugin %s, err %v.", name, err)
- return err
- }
- }
-
- return nil
-}
-
-func (cc *jobcontroller) pluginOnJobDelete(job *batch.Job) error {
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnJobDelete>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnJobDelete(job); err != nil {
- klog.Errorf("failed to process on job delete plugin %s, err %v.", name, err)
- return err
- }
- }
-
- return nil
-}
-
-func (cc *jobcontroller) pluginOnJobUpdate(job *batch.Job) error {
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnJobUpdate>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnJobUpdate(job); err != nil {
- klog.Errorf("Failed to process on job update plugin %s, err %v.", name, err)
- return err
- }
- }
-
- return nil
-}
-
-
-
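Each hook above resolves a plugin builder by name, constructs it with the job's arguments, runs one lifecycle callback, and stops at the first error. A sketch of that dispatch pattern with a made-up registry and a no-op plugin, not Volcano's real plugin interface:

```go
// Sketch only: name -> builder lookup followed by a fail-fast lifecycle call,
// the pattern the pluginOnJobAdd/OnJobDelete/OnJobUpdate hooks above share.
package main

import "fmt"

// plugin is a minimal lifecycle interface.
type plugin interface {
	OnJobAdd(jobName string) error
}

// builder constructs a plugin from its raw arguments.
type builder func(args []string) plugin

// registry maps plugin names to builders, like GetPluginBuilder above.
var registry = map[string]builder{}

type noopPlugin struct{ args []string }

func (n *noopPlugin) OnJobAdd(jobName string) error {
	fmt.Printf("noop plugin handling job %s with args %v\n", jobName, n.args)
	return nil
}

// runOnJobAdd walks the plugins configured on a job and fails fast on the
// first missing builder or hook error.
func runOnJobAdd(jobName string, plugins map[string][]string) error {
	for name, args := range plugins {
		b, found := registry[name]
		if !found {
			return fmt.Errorf("failed to get plugin %s", name)
		}
		if err := b(args).OnJobAdd(jobName); err != nil {
			return fmt.Errorf("plugin %s failed: %v", name, err)
		}
	}
	return nil
}

func main() {
	registry["noop"] = func(args []string) plugin { return &noopPlugin{args: args} }
	fmt.Println(runOnJobAdd("demo-job", map[string][]string{"noop": {"--flag=true"}}))
}
```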
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "time"
-
- "golang.org/x/time/rate"
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-)
-
-func newRateLimitingQueue() workqueue.RateLimitingInterface {
- return workqueue.NewRateLimitingQueue(workqueue.NewMaxOfRateLimiter(
- workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
- // 10 qps, 100 bucket size. This is only for retry speed and its only the overall factor (not per item)
- &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
- ))
-}
-
-func (cc *jobcontroller) processResyncTask() {
- obj, shutdown := cc.errTasks.Get()
- if shutdown {
- return
- }
-
- // one task only resync 10 times
- if cc.errTasks.NumRequeues(obj) > 10 {
- cc.errTasks.Forget(obj)
- return
- }
-
- defer cc.errTasks.Done(obj)
-
- task, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("failed to convert %v to *v1.Pod", obj)
- return
- }
-
- if err := cc.syncTask(task); err != nil {
- klog.Errorf("Failed to sync pod <%v/%v>, retry it, err %v", task.Namespace, task.Name, err)
- cc.resyncTask(task)
- }
-}
-
-func (cc *jobcontroller) syncTask(oldTask *v1.Pod) error {
- newPod, err := cc.kubeClient.CoreV1().Pods(oldTask.Namespace).Get(context.TODO(), oldTask.Name, metav1.GetOptions{})
- if err != nil {
- if errors.IsNotFound(err) {
- if err := cc.cache.DeletePod(oldTask); err != nil {
- klog.Errorf("failed to delete cache pod <%v/%v>, err %v.", oldTask.Namespace, oldTask.Name, err)
- return err
- }
- klog.V(3).Infof("Pod <%v/%v> was deleted, removed from cache.", oldTask.Namespace, oldTask.Name)
-
- return nil
- }
- return fmt.Errorf("failed to get Pod <%v/%v>: err %v", oldTask.Namespace, oldTask.Name, err)
- }
-
- return cc.cache.UpdatePod(newPod)
-}
-
-func (cc *jobcontroller) resyncTask(task *v1.Pod) {
- cc.errTasks.AddRateLimited(task)
-}
-
-
-
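The resync path above pairs a per-item exponential backoff with an overall token bucket and gives up after ten requeues. A standalone sketch of that queue wiring using the same client-go and x/time/rate calls; the processing function is a placeholder:

```go
// Sketch only: retry machinery with per-item backoff, a shared rate limit,
// and a requeue cap, mirroring newRateLimitingQueue/processResyncTask above.
package main

import (
	"fmt"
	"time"

	"golang.org/x/time/rate"
	"k8s.io/client-go/util/workqueue"
)

const maxRetries = 10 // mirrors "one task only resync 10 times" above

func newQueue() workqueue.RateLimitingInterface {
	return workqueue.NewRateLimitingQueue(workqueue.NewMaxOfRateLimiter(
		// per-item exponential backoff: 5ms, 10ms, ... capped at 180s
		workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
		// overall limit: 10 qps with a burst of 100, shared by all items
		&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
	))
}

// process pretends to sync an item and reports whether it succeeded.
func process(item interface{}) error {
	fmt.Println("syncing", item)
	return nil
}

func handleOne(q workqueue.RateLimitingInterface) bool {
	item, shutdown := q.Get()
	if shutdown {
		return false
	}
	defer q.Done(item)

	if err := process(item); err != nil {
		if q.NumRequeues(item) < maxRetries {
			q.AddRateLimited(item) // retry with backoff
		} else {
			q.Forget(item) // give up after too many attempts
		}
		return true
	}
	q.Forget(item) // success: reset the per-item failure counter
	return true
}

func main() {
	q := newQueue()
	q.Add("pod/default/demo")
	handleOne(q)
	q.ShutDown()
}
```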
/*
-Copyright 2017 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime/schema"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
-)
-
-// MakePodName concatenates the job name, task name and index into a pod name and returns the string.
-func MakePodName(jobName string, taskName string, index int) string {
- return fmt.Sprintf(jobhelpers.PodNameFmt, jobName, taskName, index)
-}
-
-func createJobPod(job *batch.Job, template *v1.PodTemplateSpec, topologyPolicy batch.NumaPolicy, ix int, jobForwarding bool) *v1.Pod {
- templateCopy := template.DeepCopy()
-
- pod := &v1.Pod{
- ObjectMeta: metav1.ObjectMeta{
- Name: jobhelpers.MakePodName(job.Name, template.Name, ix),
- Namespace: job.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *metav1.NewControllerRef(job, helpers.JobKind),
- },
- Labels: templateCopy.Labels,
- Annotations: templateCopy.Annotations,
- },
- Spec: templateCopy.Spec,
- }
-
- // If no scheduler name in Pod, use scheduler name from Job.
- if len(pod.Spec.SchedulerName) == 0 {
- pod.Spec.SchedulerName = job.Spec.SchedulerName
- }
-
- volumeMap := make(map[string]string)
- for _, volume := range job.Spec.Volumes {
- vcName := volume.VolumeClaimName
- name := fmt.Sprintf("%s-%s", job.Name, jobhelpers.GenRandomStr(12))
- if _, ok := volumeMap[vcName]; !ok {
- volume := v1.Volume{
- Name: name,
- VolumeSource: v1.VolumeSource{
- PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
- ClaimName: vcName,
- },
- },
- }
- pod.Spec.Volumes = append(pod.Spec.Volumes, volume)
- volumeMap[vcName] = name
- } else {
- // duplicate volumes, should be prevented
- continue
- }
-
- for i, c := range pod.Spec.Containers {
- vm := v1.VolumeMount{
- MountPath: volume.MountPath,
- Name: name,
- }
- pod.Spec.Containers[i].VolumeMounts = append(c.VolumeMounts, vm)
- }
- }
-
- tsKey := templateCopy.Name
- if len(tsKey) == 0 {
- tsKey = batch.DefaultTaskSpec
- }
-
- if len(pod.Annotations) == 0 {
- pod.Annotations = make(map[string]string)
- }
-
- pod.Annotations[batch.TaskSpecKey] = tsKey
- pod.Annotations[schedulingv2.KubeGroupNameAnnotationKey] = job.Name
- pod.Annotations[batch.JobNameKey] = job.Name
- pod.Annotations[batch.QueueNameKey] = job.Spec.Queue
- pod.Annotations[batch.JobVersion] = fmt.Sprintf("%d", job.Status.Version)
- pod.Annotations[batch.PodTemplateKey] = fmt.Sprintf("%s-%s", job.Name, template.Name)
-
- if topologyPolicy != "" {
- pod.Annotations[schedulingv2.NumaPolicyKey] = string(topologyPolicy)
- }
-
- if len(job.Annotations) > 0 {
- if value, found := job.Annotations[schedulingv2.PodPreemptable]; found {
- pod.Annotations[schedulingv2.PodPreemptable] = value
- }
- if value, found := job.Annotations[schedulingv2.RevocableZone]; found {
- pod.Annotations[schedulingv2.RevocableZone] = value
- }
-
- if value, found := job.Annotations[schedulingv2.JDBMinAvailable]; found {
- pod.Annotations[schedulingv2.JDBMinAvailable] = value
- } else if value, found := job.Annotations[schedulingv2.JDBMaxUnavailable]; found {
- pod.Annotations[schedulingv2.JDBMaxUnavailable] = value
- }
- }
-
- if len(pod.Labels) == 0 {
- pod.Labels = make(map[string]string)
- }
-
- // Set pod labels for Service.
- pod.Labels[batch.JobNameKey] = job.Name
- pod.Labels[batch.TaskSpecKey] = tsKey
- pod.Labels[batch.JobNamespaceKey] = job.Namespace
- pod.Labels[batch.QueueNameKey] = job.Spec.Queue
- if len(job.Labels) > 0 {
- if value, found := job.Labels[schedulingv2.PodPreemptable]; found {
- pod.Labels[schedulingv2.PodPreemptable] = value
- }
- }
-
- if jobForwarding {
- pod.Annotations[batch.JobForwardingKey] = "true"
- pod.Labels[batch.JobForwardingKey] = "true"
- }
-
- return pod
-}
-
-func applyPolicies(job *batch.Job, req *apis.Request) v1alpha1.Action {
- if len(req.Action) != 0 {
- return req.Action
- }
-
- if req.Event == v1alpha1.OutOfSyncEvent {
- return v1alpha1.SyncJobAction
- }
-
-	// All requests triggered by discarded job resources perform the sync action instead.
- if req.JobVersion < job.Status.Version {
- klog.Infof("Request %s is outdated, will perform sync instead.", req)
- return v1alpha1.SyncJobAction
- }
-
- // Overwrite Job level policies
- if len(req.TaskName) != 0 {
- // Parse task level policies
- for _, task := range job.Spec.Tasks {
- if task.Name == req.TaskName {
- for _, policy := range task.Policies {
- policyEvents := getEventlist(policy)
-
- if len(policyEvents) > 0 && len(req.Event) > 0 {
- if checkEventExist(policyEvents, req.Event) || checkEventExist(policyEvents, v1alpha1.AnyEvent) {
- return policy.Action
- }
- }
-
-					// Exit code 0 is not an error code; it is rejected by the validating admission controller.
- if policy.ExitCode != nil && *policy.ExitCode == req.ExitCode {
- return policy.Action
- }
- }
- break
- }
- }
- }
-
- // Parse Job level policies
- for _, policy := range job.Spec.Policies {
- policyEvents := getEventlist(policy)
-
- if len(policyEvents) > 0 && len(req.Event) > 0 {
- if checkEventExist(policyEvents, req.Event) || checkEventExist(policyEvents, v1alpha1.AnyEvent) {
- return policy.Action
- }
- }
-
-		// Exit code 0 is not an error code; it is rejected by the validating admission controller.
- if policy.ExitCode != nil && *policy.ExitCode == req.ExitCode {
- return policy.Action
- }
- }
-
- return v1alpha1.SyncJobAction
-}
-
-func getEventlist(policy batch.LifecyclePolicy) []v1alpha1.Event {
- policyEventsList := policy.Events
- if len(policy.Event) > 0 {
- policyEventsList = append(policyEventsList, policy.Event)
- }
- return policyEventsList
-}
-
-func checkEventExist(policyEvents []v1alpha1.Event, reqEvent v1alpha1.Event) bool {
- for _, event := range policyEvents {
- if event == reqEvent {
- return true
- }
- }
- return false
-}
-
-func addResourceList(list, req, limit v1.ResourceList) {
- for name, quantity := range req {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-
- if req != nil {
- return
- }
-
- // If Requests is omitted for a container,
- // it defaults to Limits if that is explicitly specified.
- for name, quantity := range limit {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-}
-
-// TaskPriority structure.
-type TaskPriority struct {
- priority int32
-
- batch.TaskSpec
-}
-
-// TasksPriority is a slice of TaskPriority.
-type TasksPriority []TaskPriority
-
-func (p TasksPriority) Len() int { return len(p) }
-
-func (p TasksPriority) Less(i, j int) bool {
- return p[i].priority > p[j].priority
-}
-
-func (p TasksPriority) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
-
-func isControlledBy(obj metav1.Object, gvk schema.GroupVersionKind) bool {
- controllerRef := metav1.GetControllerOf(obj)
- if controllerRef == nil {
- return false
- }
- if controllerRef.APIVersion == gvk.GroupVersion().String() && controllerRef.Kind == gvk.Kind {
- return true
- }
- return false
-}
-
-
-
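applyPolicies above resolves an action with a fixed precedence: an explicit request action, then task-level policies, then job-level policies, matching either the event (or AnyEvent) or the exit code, and finally falling back to syncing the job. A simplified sketch of that precedence with stand-in types, not the Volcano API:

```go
// Sketch only: policy resolution precedence (task policies before job
// policies, default to sync), matching either an event or an exit code.
package main

import "fmt"

type (
	event  string
	action string
)

const (
	anyEvent      event  = "*"
	syncJobAction action = "SyncJob"
)

type policy struct {
	Events   []event
	ExitCode *int32
	Action   action
}

// match reports whether a policy applies to the given event or exit code.
func match(p policy, ev event, exitCode int32) bool {
	for _, e := range p.Events {
		if e == ev || e == anyEvent {
			return true
		}
	}
	return p.ExitCode != nil && *p.ExitCode == exitCode
}

// resolve returns the action to take for an event, trying task policies first.
func resolve(taskPolicies, jobPolicies []policy, ev event, exitCode int32) action {
	for _, p := range taskPolicies {
		if match(p, ev, exitCode) {
			return p.Action
		}
	}
	for _, p := range jobPolicies {
		if match(p, ev, exitCode) {
			return p.Action
		}
	}
	return syncJobAction
}

func main() {
	code := int32(137)
	task := []policy{{ExitCode: &code, Action: "RestartTask"}}
	job := []policy{{Events: []event{"PodFailed"}, Action: "RestartJob"}}
	fmt.Println(resolve(task, job, "PodFailed", 137)) // task policy wins: RestartTask
}
```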
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package ssh
-
-import (
- "crypto/rand"
- "crypto/rsa"
- "crypto/x509"
- "encoding/pem"
- "flag"
- "fmt"
-
- "golang.org/x/crypto/ssh"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
-)
-
-type sshPlugin struct {
- // Arguments given for the plugin
- pluginArguments []string
-
- client pluginsinterface.PluginClientset
-
- // flag parse args
- sshKeyFilePath string
-
- // private key string
- sshPrivateKey string
-
- // public key string
- sshPublicKey string
-}
-
-// New creates ssh plugin
-func New(client pluginsinterface.PluginClientset, arguments []string) pluginsinterface.PluginInterface {
- p := sshPlugin{
- pluginArguments: arguments,
- client: client,
- sshKeyFilePath: SSHAbsolutePath,
- }
-
- p.addFlags()
-
- return &p
-}
-
-func (sp *sshPlugin) Name() string {
- return "ssh"
-}
-
-func (sp *sshPlugin) OnPodCreate(pod *v1.Pod, job *batch.Job) error {
- sp.mountRsaKey(pod, job)
-
- return nil
-}
-
-func (sp *sshPlugin) OnJobAdd(job *batch.Job) error {
- if job.Status.ControlledResources["plugin-"+sp.Name()] == sp.Name() {
- return nil
- }
-
- var data map[string][]byte
- var err error
- if len(sp.sshPrivateKey) > 0 {
- data, err = withUserProvidedRsaKey(job, sp.sshPrivateKey, sp.sshPublicKey)
- } else {
- data, err = generateRsaKey(job)
- }
- if err != nil {
- return err
- }
-
- if err := helpers.CreateOrUpdateSecret(job, sp.client.KubeClients, data, sp.secretName(job)); err != nil {
- return fmt.Errorf("create secret for job <%s/%s> with ssh plugin failed for %v",
- job.Namespace, job.Name, err)
- }
-
- job.Status.ControlledResources["plugin-"+sp.Name()] = sp.Name()
-
- return nil
-}
-
-func (sp *sshPlugin) OnJobDelete(job *batch.Job) error {
- if job.Status.ControlledResources["plugin-"+sp.Name()] != sp.Name() {
- return nil
- }
- if err := helpers.DeleteSecret(job, sp.client.KubeClients, sp.secretName(job)); err != nil {
- return err
- }
- delete(job.Status.ControlledResources, "plugin-"+sp.Name())
-
- return nil
-}
-
-// TODO: currently a container using a Secret as a subPath volume mount will not receive Secret updates.
-// For that reason the job secret is not updated here for now.
-// related issue: https://github.com/volcano-sh/volcano/issues/1420
-func (sp *sshPlugin) OnJobUpdate(job *batch.Job) error {
- //data, err := generateRsaKey(job)
- //if err != nil {
- // return err
- //}
- //
- //if err := helpers.CreateOrUpdateSecret(job, sp.client.KubeClients, data, sp.secretName(job)); err != nil {
- // return fmt.Errorf("update secret for job <%s/%s> with ssh plugin failed for %v",
- // job.Namespace, job.Name, err)
- //}
-
- return nil
-}
-
-func (sp *sshPlugin) mountRsaKey(pod *v1.Pod, job *batch.Job) {
- secretName := sp.secretName(job)
-
- sshVolume := v1.Volume{
- Name: secretName,
- }
-
- var mode int32 = 0600
- sshVolume.Secret = &v1.SecretVolumeSource{
- SecretName: secretName,
- Items: []v1.KeyToPath{
- {
- Key: SSHPrivateKey,
- Path: SSHRelativePath + "/" + SSHPrivateKey,
- },
- {
- Key: SSHPublicKey,
- Path: SSHRelativePath + "/" + SSHPublicKey,
- },
- {
- Key: SSHAuthorizedKeys,
- Path: SSHRelativePath + "/" + SSHAuthorizedKeys,
- },
- {
- Key: SSHConfig,
- Path: SSHRelativePath + "/" + SSHConfig,
- },
- },
- DefaultMode: &mode,
- }
-
- if sp.sshKeyFilePath != SSHAbsolutePath {
- var noRootMode int32 = 0600
- sshVolume.Secret.DefaultMode = &noRootMode
- }
-
- pod.Spec.Volumes = append(pod.Spec.Volumes, sshVolume)
-
- for i, c := range pod.Spec.Containers {
- vm := v1.VolumeMount{
- MountPath: sp.sshKeyFilePath,
- SubPath: SSHRelativePath,
- Name: secretName,
- }
-
- pod.Spec.Containers[i].VolumeMounts = append(c.VolumeMounts, vm)
- }
- for i, c := range pod.Spec.InitContainers {
- vm := v1.VolumeMount{
- MountPath: sp.sshKeyFilePath,
- SubPath: SSHRelativePath,
- Name: secretName,
- }
-
- pod.Spec.InitContainers[i].VolumeMounts = append(c.VolumeMounts, vm)
- }
-}
-
-func generateRsaKey(job *batch.Job) (map[string][]byte, error) {
- bitSize := 1024
-
- privateKey, err := rsa.GenerateKey(rand.Reader, bitSize)
- if err != nil {
- klog.Errorf("rsa generateKey err: %v", err)
- return nil, err
- }
-
- // id_rsa
- privBlock := pem.Block{
- Type: "RSA PRIVATE KEY",
- Bytes: x509.MarshalPKCS1PrivateKey(privateKey),
- }
- privateKeyBytes := pem.EncodeToMemory(&privBlock)
-
- // id_rsa.pub
- publicRsaKey, err := ssh.NewPublicKey(&privateKey.PublicKey)
- if err != nil {
- klog.Errorf("ssh newPublicKey err: %v", err)
- return nil, err
- }
- publicKeyBytes := ssh.MarshalAuthorizedKey(publicRsaKey)
-
- data := make(map[string][]byte)
- data[SSHPrivateKey] = privateKeyBytes
- data[SSHPublicKey] = publicKeyBytes
- data[SSHAuthorizedKeys] = publicKeyBytes
- data[SSHConfig] = []byte(generateSSHConfig(job))
-
- return data, nil
-}
-
-func withUserProvidedRsaKey(job *batch.Job, sshPrivateKey string, sshPublicKey string) (map[string][]byte, error) {
- data := make(map[string][]byte)
- data[SSHPrivateKey] = []byte(sshPrivateKey)
- data[SSHPublicKey] = []byte(sshPublicKey)
- data[SSHAuthorizedKeys] = []byte(sshPublicKey)
- data[SSHConfig] = []byte(generateSSHConfig(job))
-
- return data, nil
-}
-
-func (sp *sshPlugin) secretName(job *batch.Job) string {
- return fmt.Sprintf("%s-%s", job.Name, sp.Name())
-}
-
-func (sp *sshPlugin) addFlags() {
- flagSet := flag.NewFlagSet(sp.Name(), flag.ContinueOnError)
- flagSet.StringVar(&sp.sshKeyFilePath, "ssh-key-file-path", sp.sshKeyFilePath, "The path used to store "+
- "ssh private and public keys, it is `/root/.ssh` by default.")
- flagSet.StringVar(&sp.sshPrivateKey, "ssh-private-key", sp.sshPrivateKey, "The input string of the private key")
- flagSet.StringVar(&sp.sshPublicKey, "ssh-public-key", sp.sshPublicKey, "The input string of the public key")
-
- if err := flagSet.Parse(sp.pluginArguments); err != nil {
- klog.Errorf("plugin %s flagset parse failed, err: %v", sp.Name(), err)
- }
-}
-
-func generateSSHConfig(job *batch.Job) string {
- config := "StrictHostKeyChecking no\nUserKnownHostsFile /dev/null\n"
-
- for _, ts := range job.Spec.Tasks {
- for i := 0; i < int(ts.Replicas); i++ {
- hostName := ts.Template.Spec.Hostname
- subdomain := ts.Template.Spec.Subdomain
- if len(hostName) == 0 {
- hostName = jobhelpers.MakePodName(job.Name, ts.Name, i)
- }
- if len(subdomain) == 0 {
- subdomain = job.Name
- }
-
- config += "Host " + hostName + "\n"
- config += " HostName " + hostName + "." + subdomain + "\n"
- if len(ts.Template.Spec.Hostname) != 0 {
- break
- }
- }
- }
-
- return config
-}
-
-
-
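The ssh plugin above generates an RSA key pair, stores it in a per-job Secret, and renders an SSH config that maps each pod's hostname to its headless-service FQDN. A sketch using the same crypto calls; the host names and subdomain below are invented:

```go
// Sketch only: key generation and SSH config rendering in the style of
// generateRsaKey/generateSSHConfig above.
package main

import (
	"crypto/rand"
	"crypto/rsa"
	"crypto/x509"
	"encoding/pem"
	"fmt"

	"golang.org/x/crypto/ssh"
)

// generateKeyPair returns a PEM-encoded private key and an authorized_keys-style public key.
func generateKeyPair(bits int) (privatePEM, publicAuthorized []byte, err error) {
	key, err := rsa.GenerateKey(rand.Reader, bits)
	if err != nil {
		return nil, nil, err
	}
	privatePEM = pem.EncodeToMemory(&pem.Block{
		Type:  "RSA PRIVATE KEY",
		Bytes: x509.MarshalPKCS1PrivateKey(key),
	})
	pub, err := ssh.NewPublicKey(&key.PublicKey)
	if err != nil {
		return nil, nil, err
	}
	return privatePEM, ssh.MarshalAuthorizedKey(pub), nil
}

// sshConfig disables host key checking and maps short host names to their
// headless-service FQDNs, mirroring generateSSHConfig above.
func sshConfig(hosts []string, subdomain string) string {
	cfg := "StrictHostKeyChecking no\nUserKnownHostsFile /dev/null\n"
	for _, h := range hosts {
		cfg += "Host " + h + "\n"
		cfg += "  HostName " + h + "." + subdomain + "\n"
	}
	return cfg
}

func main() {
	priv, pub, err := generateKeyPair(2048)
	if err != nil {
		panic(err)
	}
	fmt.Printf("private key: %d bytes, public key: %d bytes\n", len(priv), len(pub))
	fmt.Print(sshConfig([]string{"demo-job-master-0", "demo-job-worker-0"}, "demo-job"))
}
```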
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package podgroup
-
-import (
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- coreinformers "k8s.io/client-go/informers/core/v1"
- "k8s.io/client-go/kubernetes"
- corelisters "k8s.io/client-go/listers/core/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/framework"
-)
-
-func init() {
- framework.RegisterController(&pgcontroller{})
-}
-
-// pgcontroller is the PodGroup controller type.
-type pgcontroller struct {
- kubeClient kubernetes.Interface
- vcClient vcclientset.Interface
-
- podInformer coreinformers.PodInformer
- pgInformer schedulinginformer.PodGroupInformer
-
- // A store of pods
- podLister corelisters.PodLister
- podSynced func() bool
-
- // A store of podgroups
- pgLister schedulinglister.PodGroupLister
- pgSynced func() bool
-
- queue workqueue.RateLimitingInterface
-}
-
-func (pg *pgcontroller) Name() string {
- return "pg-controller"
-}
-
-// Initialize creates a new PodGroup controller.
-func (pg *pgcontroller) Initialize(opt *framework.ControllerOption) error {
- pg.kubeClient = opt.KubeClient
- pg.vcClient = opt.VolcanoClient
-
- pg.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
-
- pg.podInformer = opt.SharedInformerFactory.Core().V1().Pods()
- pg.podLister = pg.podInformer.Lister()
- pg.podSynced = pg.podInformer.Informer().HasSynced
- pg.podInformer.Informer().AddEventHandler(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *v1.Pod:
- if v.Spec.SchedulerName == opt.SchedulerName &&
- (v.Annotations == nil || v.Annotations[scheduling.KubeGroupNameAnnotationKey] == "") {
- return true
- }
- return false
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: pg.addPod,
- },
- })
-
- pg.pgInformer = informerfactory.NewSharedInformerFactory(pg.vcClient, 0).Scheduling().V1beta1().PodGroups()
- pg.pgLister = pg.pgInformer.Lister()
- pg.pgSynced = pg.pgInformer.Informer().HasSynced
-
- return nil
-}
-
-// Run starts the PodGroup controller.
-func (pg *pgcontroller) Run(stopCh <-chan struct{}) {
- go pg.podInformer.Informer().Run(stopCh)
- go pg.pgInformer.Informer().Run(stopCh)
-
- cache.WaitForCacheSync(stopCh, pg.podSynced, pg.pgSynced)
-
- go wait.Until(pg.worker, 0, stopCh)
-
- klog.Infof("PodgroupController is running ...... ")
-}
-
-func (pg *pgcontroller) worker() {
- for pg.processNextReq() {
- }
-}
-
-func (pg *pgcontroller) processNextReq() bool {
- obj, shutdown := pg.queue.Get()
- if shutdown {
- klog.Errorf("Fail to pop item from queue")
- return false
- }
-
- req := obj.(podRequest)
- defer pg.queue.Done(req)
-
- pod, err := pg.podLister.Pods(req.podNamespace).Get(req.podName)
- if err != nil {
- klog.Errorf("Failed to get pod by <%v> from cache: %v", req, err)
- return true
- }
-
-	// normal pod that uses the volcano scheduler
- if err := pg.createNormalPodPGIfNotExist(pod); err != nil {
- klog.Errorf("Failed to handle Pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
- pg.queue.AddRateLimited(req)
- return true
- }
-
- // If no error, forget it.
- pg.queue.Forget(req)
-
- return true
-}
-
-
-
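The pod informer above only admits pods that ask for the configured scheduler and do not yet carry a podgroup annotation. A sketch of that filter as a plain predicate; the annotation key value here is an assumption for illustration, not taken from the Volcano API package:

```go
// Sketch only: the admission filter the PodGroup controller's pod informer applies.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const kubeGroupNameAnnotationKey = "scheduling.k8s.io/group-name" // assumed value, for illustration only

// needsPodGroup reports whether the controller should create/attach a PodGroup for this pod.
func needsPodGroup(pod *v1.Pod, schedulerName string) bool {
	if pod.Spec.SchedulerName != schedulerName {
		return false
	}
	return pod.Annotations == nil || pod.Annotations[kubeGroupNameAnnotationKey] == ""
}

func main() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"},
		Spec:       v1.PodSpec{SchedulerName: "volcano"},
	}
	fmt.Println(needsPodGroup(pod, "volcano")) // true: no podgroup annotation yet

	pod.Annotations = map[string]string{kubeGroupNameAnnotationKey: "pg-demo"}
	fmt.Println(needsPodGroup(pod, "volcano")) // false: already grouped
}
```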
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package podgroup
-
-import (
- "context"
-
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime/schema"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/helpers"
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-type podRequest struct {
- podName string
- podNamespace string
-}
-
-func (pg *pgcontroller) addPod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", obj)
- return
- }
-
- req := podRequest{
- podName: pod.Name,
- podNamespace: pod.Namespace,
- }
-
- pg.queue.Add(req)
-}
-
-func (pg *pgcontroller) updatePodAnnotations(pod *v1.Pod, pgName string) error {
- if pod.Annotations == nil {
- pod.Annotations = make(map[string]string)
- }
- if pod.Annotations[scheduling.KubeGroupNameAnnotationKey] == "" {
- pod.Annotations[scheduling.KubeGroupNameAnnotationKey] = pgName
- } else {
- if pod.Annotations[scheduling.KubeGroupNameAnnotationKey] != pgName {
- klog.Errorf("normal pod %s/%s annotations %s value is not %s, but %s", pod.Namespace, pod.Name,
- scheduling.KubeGroupNameAnnotationKey, pgName, pod.Annotations[scheduling.KubeGroupNameAnnotationKey])
- }
- return nil
- }
-
- if _, err := pg.kubeClient.CoreV1().Pods(pod.Namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
- klog.Errorf("Failed to update pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
- return err
- }
-
- return nil
-}
-
-func (pg *pgcontroller) createNormalPodPGIfNotExist(pod *v1.Pod) error {
- pgName := helpers.GeneratePodgroupName(pod)
-
- if _, err := pg.pgLister.PodGroups(pod.Namespace).Get(pgName); err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to get normal PodGroup for Pod <%s/%s>: %v",
- pod.Namespace, pod.Name, err)
- return err
- }
-
- obj := &scheduling.PodGroup{
- ObjectMeta: metav1.ObjectMeta{
- Namespace: pod.Namespace,
- Name: pgName,
- OwnerReferences: newPGOwnerReferences(pod),
- Annotations: map[string]string{},
- Labels: map[string]string{},
- },
- Spec: scheduling.PodGroupSpec{
- MinMember: 1,
- PriorityClassName: pod.Spec.PriorityClassName,
- MinResources: calcPGMinResources(pod),
- },
- }
- if queueName, ok := pod.Annotations[scheduling.QueueNameAnnotationKey]; ok {
- obj.Spec.Queue = queueName
- }
-
- if value, ok := pod.Annotations[scheduling.PodPreemptable]; ok {
- obj.Annotations[scheduling.PodPreemptable] = value
- }
- if value, ok := pod.Annotations[scheduling.RevocableZone]; ok {
- obj.Annotations[scheduling.RevocableZone] = value
- }
- if value, ok := pod.Labels[scheduling.PodPreemptable]; ok {
- obj.Labels[scheduling.PodPreemptable] = value
- }
-
- if value, found := pod.Annotations[scheduling.JDBMinAvailable]; found {
- obj.Annotations[scheduling.JDBMinAvailable] = value
- } else if value, found := pod.Annotations[scheduling.JDBMaxUnavailable]; found {
- obj.Annotations[scheduling.JDBMaxUnavailable] = value
- }
-
- if _, err := pg.vcClient.SchedulingV1beta1().PodGroups(pod.Namespace).Create(context.TODO(), obj, metav1.CreateOptions{}); err != nil {
- klog.Errorf("Failed to create normal PodGroup for Pod <%s/%s>: %v",
- pod.Namespace, pod.Name, err)
- return err
- }
- }
-
- return pg.updatePodAnnotations(pod, pgName)
-}
-
-func newPGOwnerReferences(pod *v1.Pod) []metav1.OwnerReference {
- if len(pod.OwnerReferences) != 0 {
- for _, ownerReference := range pod.OwnerReferences {
- if ownerReference.Controller != nil && *ownerReference.Controller {
- return pod.OwnerReferences
- }
- }
- }
-
- gvk := schema.GroupVersionKind{
- Group: v1.SchemeGroupVersion.Group,
- Version: v1.SchemeGroupVersion.Version,
- Kind: "Pod",
- }
- ref := metav1.NewControllerRef(pod, gvk)
- return []metav1.OwnerReference{*ref}
-}
-
-// addResourceList accumulates the given request (or limit) quantities into list.
-func addResourceList(list, req, limit v1.ResourceList) {
- for name, quantity := range req {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-
- if req != nil {
- return
- }
-
- // If Requests is omitted for a container,
- // it defaults to Limits if that is explicitly specified.
- for name, quantity := range limit {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-}
-
-// calcPGMinResources calculates the podgroup's minimum resources.
-func calcPGMinResources(pod *v1.Pod) *v1.ResourceList {
- pgMinRes := v1.ResourceList{}
-
- for _, c := range pod.Spec.Containers {
- addResourceList(pgMinRes, c.Resources.Requests, c.Resources.Limits)
- }
-
- return &pgMinRes
-}
-
-
-
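addResourceList above counts a container's requests whenever any are declared and only falls back to its limits when requests are omitted entirely. A small sketch of that fallback; the quantities are arbitrary:

```go
// Sketch only: requests-or-limits accumulation, the behaviour of addResourceList above.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// accumulate adds req into total, or limit instead when req is nil.
func accumulate(total, req, limit v1.ResourceList) {
	src := req
	if src == nil {
		// Requests omitted: Kubernetes defaults them to the explicit limits.
		src = limit
	}
	for name, q := range src {
		if cur, ok := total[name]; ok {
			cur.Add(q)
			total[name] = cur
		} else {
			total[name] = q.DeepCopy()
		}
	}
}

func main() {
	total := v1.ResourceList{}
	// container 1: explicit requests
	accumulate(total, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m")}, nil)
	// container 2: only limits, so they count as requests
	accumulate(total, nil, v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")})
	cpu := total[v1.ResourceCPU]
	fmt.Println("podgroup min cpu:", cpu.String())
}
```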
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "sync"
- "time"
-
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- utilruntime "k8s.io/apimachinery/pkg/util/runtime"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/kubernetes"
- corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/tools/record"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- versionedscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- busv1alpha1informer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
- schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- busv1alpha1lister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
- schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- "volcano.sh/volcano/pkg/controllers/framework"
- queuestate "volcano.sh/volcano/pkg/controllers/queue/state"
-)
-
-func init() {
- framework.RegisterController(&queuecontroller{})
-}
-
-// queuecontroller manages queue status.
-type queuecontroller struct {
- kubeClient kubernetes.Interface
- vcClient vcclientset.Interface
-
- // informer
- queueInformer schedulinginformer.QueueInformer
- pgInformer schedulinginformer.PodGroupInformer
-
- // queueLister
- queueLister schedulinglister.QueueLister
- queueSynced cache.InformerSynced
-
- // podGroup lister
- pgLister schedulinglister.PodGroupLister
- pgSynced cache.InformerSynced
-
- cmdInformer busv1alpha1informer.CommandInformer
- cmdLister busv1alpha1lister.CommandLister
- cmdSynced cache.InformerSynced
-
- // queues that need to be updated.
- queue workqueue.RateLimitingInterface
- commandQueue workqueue.RateLimitingInterface
-
- pgMutex sync.RWMutex
- // queue name -> podgroup namespace/name
- podGroups map[string]map[string]struct{}
-
- syncHandler func(req *apis.Request) error
- syncCommandHandler func(cmd *busv1alpha1.Command) error
-
- enqueueQueue func(req *apis.Request)
-
- recorder record.EventRecorder
- maxRequeueNum int
-}
-
-func (c *queuecontroller) Name() string {
- return "queue-controller"
-}
-
-// Initialize creates a new queue controller from the given options.
-func (c *queuecontroller) Initialize(opt *framework.ControllerOption) error {
- c.vcClient = opt.VolcanoClient
- c.kubeClient = opt.KubeClient
-
- factory := informerfactory.NewSharedInformerFactory(c.vcClient, 0)
- queueInformer := factory.Scheduling().V1beta1().Queues()
- pgInformer := factory.Scheduling().V1beta1().PodGroups()
-
- eventBroadcaster := record.NewBroadcaster()
- eventBroadcaster.StartLogging(klog.Infof)
- eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: c.kubeClient.CoreV1().Events("")})
-
- c.queueInformer = queueInformer
- c.pgInformer = pgInformer
- c.queueLister = queueInformer.Lister()
- c.queueSynced = queueInformer.Informer().HasSynced
- c.pgLister = pgInformer.Lister()
- c.pgSynced = pgInformer.Informer().HasSynced
- c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- c.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- c.podGroups = make(map[string]map[string]struct{})
- c.recorder = eventBroadcaster.NewRecorder(versionedscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
- c.maxRequeueNum = opt.MaxRequeueNum
- if c.maxRequeueNum < 0 {
- c.maxRequeueNum = -1
- }
-
- queueInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: c.addQueue,
- UpdateFunc: c.updateQueue,
- DeleteFunc: c.deleteQueue,
- })
-
- pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: c.addPodGroup,
- UpdateFunc: c.updatePodGroup,
- DeleteFunc: c.deletePodGroup,
- })
-
- c.cmdInformer = informerfactory.NewSharedInformerFactory(c.vcClient, 0).Bus().V1alpha1().Commands()
- c.cmdInformer.Informer().AddEventHandler(cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *busv1alpha1.Command:
- return IsQueueReference(v.TargetObject)
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: c.addCommand,
- },
- })
- c.cmdLister = c.cmdInformer.Lister()
- c.cmdSynced = c.cmdInformer.Informer().HasSynced
-
- queuestate.SyncQueue = c.syncQueue
- queuestate.OpenQueue = c.openQueue
- queuestate.CloseQueue = c.closeQueue
-
- c.syncHandler = c.handleQueue
- c.syncCommandHandler = c.handleCommand
-
- c.enqueueQueue = c.enqueue
-
- return nil
-}
-
-// Run starts QueueController.
-func (c *queuecontroller) Run(stopCh <-chan struct{}) {
- defer utilruntime.HandleCrash()
- defer c.queue.ShutDown()
- defer c.commandQueue.ShutDown()
-
- klog.Infof("Starting queue controller.")
- defer klog.Infof("Shutting down queue controller.")
-
- go c.queueInformer.Informer().Run(stopCh)
- go c.pgInformer.Informer().Run(stopCh)
- go c.cmdInformer.Informer().Run(stopCh)
-
- if !cache.WaitForCacheSync(stopCh, c.queueSynced, c.pgSynced, c.cmdSynced) {
- klog.Errorf("unable to sync caches for queue controller.")
- return
- }
-
- go wait.Until(c.worker, 0, stopCh)
- go wait.Until(c.commandWorker, 0, stopCh)
-
- <-stopCh
-}
-
-// worker runs a worker thread that just dequeues items, processes them, and
-// marks them done. You may run as many of these in parallel as you wish; the
-// workqueue guarantees that they will not end up processing the same `queue`
-// at the same time.
-func (c *queuecontroller) worker() {
- for c.processNextWorkItem() {
- }
-}
-
-func (c *queuecontroller) processNextWorkItem() bool {
- obj, shutdown := c.queue.Get()
- if shutdown {
- return false
- }
- defer c.queue.Done(obj)
-
- req, ok := obj.(*apis.Request)
- if !ok {
- klog.Errorf("%v is not a valid queue request struct.", obj)
- return true
- }
-
- err := c.syncHandler(req)
- c.handleQueueErr(err, obj)
-
- return true
-}
-
-func (c *queuecontroller) handleQueue(req *apis.Request) error {
- startTime := time.Now()
- defer func() {
- klog.V(4).Infof("Finished syncing queue %s (%v).", req.QueueName, time.Since(startTime))
- }()
-
- queue, err := c.queueLister.Get(req.QueueName)
- if err != nil {
- if apierrors.IsNotFound(err) {
- klog.V(4).Infof("Queue %s has been deleted.", req.QueueName)
- return nil
- }
-
- return fmt.Errorf("get queue %s failed for %v", req.QueueName, err)
- }
-
- queueState := queuestate.NewState(queue)
- if queueState == nil {
- return fmt.Errorf("queue %s state %s is invalid", queue.Name, queue.Status.State)
- }
-
- klog.V(4).Infof("Begin execute %s action for queue %s, current status %s", req.Action, req.QueueName, queue.Status.State)
- if err := queueState.Execute(req.Action); err != nil {
- return fmt.Errorf("sync queue %s failed for %v, event is %v, action is %s",
- req.QueueName, err, req.Event, req.Action)
- }
-
- return nil
-}
-
-func (c *queuecontroller) handleQueueErr(err error, obj interface{}) {
- if err == nil {
- c.queue.Forget(obj)
- return
- }
-
- if c.maxRequeueNum == -1 || c.queue.NumRequeues(obj) < c.maxRequeueNum {
- klog.V(4).Infof("Error syncing queue request %v for %v.", obj, err)
- c.queue.AddRateLimited(obj)
- return
- }
-
- req, _ := obj.(*apis.Request)
- c.recordEventsForQueue(req.QueueName, v1.EventTypeWarning, string(req.Action),
- fmt.Sprintf("%v queue failed for %v", req.Action, err))
- klog.V(2).Infof("Dropping queue request %v out of the queue for %v.", obj, err)
- c.queue.Forget(obj)
-}
-
-func (c *queuecontroller) commandWorker() {
- for c.processNextCommand() {
- }
-}
-
-func (c *queuecontroller) processNextCommand() bool {
- obj, shutdown := c.commandQueue.Get()
- if shutdown {
- return false
- }
- defer c.commandQueue.Done(obj)
-
- cmd, ok := obj.(*busv1alpha1.Command)
- if !ok {
- klog.Errorf("%v is not a valid Command struct.", obj)
- return true
- }
-
- err := c.syncCommandHandler(cmd)
- c.handleCommandErr(err, obj)
-
- return true
-}
-
-func (c *queuecontroller) handleCommand(cmd *busv1alpha1.Command) error {
- startTime := time.Now()
- defer func() {
- klog.V(4).Infof("Finished syncing command %s/%s (%v).", cmd.Namespace, cmd.Name, time.Since(startTime))
- }()
-
- err := c.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{})
- if err != nil {
- if apierrors.IsNotFound(err) {
- return nil
- }
-
- return fmt.Errorf("failed to delete command <%s/%s> for %v", cmd.Namespace, cmd.Name, err)
- }
-
- req := &apis.Request{
- QueueName: cmd.TargetObject.Name,
- Event: busv1alpha1.CommandIssuedEvent,
- Action: busv1alpha1.Action(cmd.Action),
- }
-
- c.enqueueQueue(req)
-
- return nil
-}
-
-func (c *queuecontroller) handleCommandErr(err error, obj interface{}) {
- if err == nil {
- c.commandQueue.Forget(obj)
- return
- }
-
- if c.maxRequeueNum == -1 || c.commandQueue.NumRequeues(obj) < c.maxRequeueNum {
- klog.V(4).Infof("Error syncing command %v for %v.", obj, err)
- c.commandQueue.AddRateLimited(obj)
- return
- }
-
- klog.V(2).Infof("Dropping command %v out of the queue for %v.", obj, err)
- c.commandQueue.Forget(obj)
-}
-
-
-
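handleQueue above looks up the queue, builds a state object for its current phase, and asks that state to execute the requested action. A rough sketch of that state-machine shape with invented states and transitions, not the actual volcano queue state package:

```go
// Sketch only: a tiny state machine in the style of queuestate.NewState +
// Execute(action) used by handleQueue above.
package main

import "fmt"

type action string

const (
	openAction  action = "OpenQueue"
	closeAction action = "CloseQueue"
	syncAction  action = "SyncQueue"
)

// state is the per-state behaviour; Execute performs one action.
type state interface {
	Execute(a action) (next string, err error)
}

type openState struct{}

func (openState) Execute(a action) (string, error) {
	switch a {
	case closeAction:
		return "Closed", nil
	case openAction, syncAction:
		return "Open", nil
	default:
		return "", fmt.Errorf("unknown action %s", a)
	}
}

type closedState struct{}

func (closedState) Execute(a action) (string, error) {
	switch a {
	case openAction:
		return "Open", nil
	case closeAction, syncAction:
		return "Closed", nil
	default:
		return "", fmt.Errorf("unknown action %s", a)
	}
}

// newState picks the behaviour for the queue's current status, like
// queuestate.NewState above.
func newState(current string) state {
	switch current {
	case "Open":
		return openState{}
	case "Closed":
		return closedState{}
	default:
		return nil
	}
}

func main() {
	s := newState("Open")
	next, err := s.Execute(closeAction)
	fmt.Println(next, err) // Closed <nil>
}
```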
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
-	"context"
-	"fmt"
-
-	"volcano.sh/apis/pkg/apis/bus/v1alpha1"
-	schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-	"volcano.sh/volcano/pkg/controllers/queue/state"
-
-	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/equality"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/tools/cache"
-
- "k8s.io/klog"
-)
-
-func (c *queuecontroller) syncQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
- klog.V(4).Infof("Begin to sync queue %s.", queue.Name)
- defer klog.V(4).Infof("End sync queue %s.", queue.Name)
-
- podGroups := c.getPodGroups(queue.Name)
- queueStatus := schedulingv1beta1.QueueStatus{}
-
- for _, pgKey := range podGroups {
-		// Ignore the error here; it cannot occur.
- ns, name, _ := cache.SplitMetaNamespaceKey(pgKey)
-
- // TODO: check NotFound error and sync local cache.
- pg, err := c.pgLister.PodGroups(ns).Get(name)
- if err != nil {
- return err
- }
-
- switch pg.Status.Phase {
- case schedulingv1beta1.PodGroupPending:
- queueStatus.Pending++
- case schedulingv1beta1.PodGroupRunning:
- queueStatus.Running++
- case schedulingv1beta1.PodGroupUnknown:
- queueStatus.Unknown++
- case schedulingv1beta1.PodGroupInqueue:
- queueStatus.Inqueue++
- }
- }
-
- if updateStateFn != nil {
- updateStateFn(&queueStatus, podGroups)
- } else {
- queueStatus.State = queue.Status.State
- }
-
- // ignore update when status does not change
- if equality.Semantic.DeepEqual(queueStatus, queue.Status) {
- return nil
- }
-
- newQueue := queue.DeepCopy()
- newQueue.Status = queueStatus
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- klog.Errorf("Failed to update status of Queue %s: %v.", newQueue.Name, err)
- return err
- }
-
- return nil
-}
-
-func (c *queuecontroller) openQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
- klog.V(4).Infof("Begin to open queue %s.", queue.Name)
-
- newQueue := queue.DeepCopy()
- newQueue.Status.State = schedulingv1beta1.QueueStateOpen
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
- fmt.Sprintf("Open queue failed for %v", err))
- return err
- }
-
- c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.OpenQueueAction), "Open queue succeed")
- } else {
- return nil
- }
-
- q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- newQueue = q.DeepCopy()
- if updateStateFn != nil {
- updateStateFn(&newQueue.Status, nil)
- } else {
- return fmt.Errorf("internal error, update state function should be provided")
- }
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
- fmt.Sprintf("Update queue status from %s to %s failed for %v",
- queue.Status.State, newQueue.Status.State, err))
- return err
- }
- }
-
- return nil
-}
-
-func (c *queuecontroller) closeQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
- klog.V(4).Infof("Begin to close queue %s.", queue.Name)
-
- newQueue := queue.DeepCopy()
- newQueue.Status.State = schedulingv1beta1.QueueStateClosed
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
- fmt.Sprintf("Close queue failed for %v", err))
- return err
- }
-
- c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.CloseQueueAction), "Close queue succeed")
- } else {
- return nil
- }
-
- q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- newQueue = q.DeepCopy()
- podGroups := c.getPodGroups(newQueue.Name)
- if updateStateFn != nil {
- updateStateFn(&newQueue.Status, podGroups)
- } else {
- return fmt.Errorf("internal error, update state function should be provided")
- }
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
- fmt.Sprintf("Update queue status from %s to %s failed for %v",
- queue.Status.State, newQueue.Status.State, err))
- return err
- }
- }
-
- return nil
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
-)
-
-func (c *queuecontroller) enqueue(req *apis.Request) {
- c.queue.Add(req)
-}
-
-func (c *queuecontroller) addQueue(obj interface{}) {
- queue := obj.(*schedulingv1beta1.Queue)
-
- req := &apis.Request{
- QueueName: queue.Name,
-
- Event: busv1alpha1.OutOfSyncEvent,
- Action: busv1alpha1.SyncQueueAction,
- }
-
- c.enqueue(req)
-}
-
-func (c *queuecontroller) deleteQueue(obj interface{}) {
- queue, ok := obj.(*schedulingv1beta1.Queue)
- if !ok {
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v.", obj)
- return
- }
- queue, ok = tombstone.Obj.(*schedulingv1beta1.Queue)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a Queue: %#v.", obj)
- return
- }
- }
-
- c.pgMutex.Lock()
- defer c.pgMutex.Unlock()
- delete(c.podGroups, queue.Name)
-}
-
-func (c *queuecontroller) updateQueue(_, _ interface{}) {
- // currently do not care about queue update
-}
-
-func (c *queuecontroller) addPodGroup(obj interface{}) {
- pg := obj.(*schedulingv1beta1.PodGroup)
- key, _ := cache.MetaNamespaceKeyFunc(obj)
-
- c.pgMutex.Lock()
- defer c.pgMutex.Unlock()
-
- if c.podGroups[pg.Spec.Queue] == nil {
- c.podGroups[pg.Spec.Queue] = make(map[string]struct{})
- }
- c.podGroups[pg.Spec.Queue][key] = struct{}{}
-
- req := &apis.Request{
- QueueName: pg.Spec.Queue,
-
- Event: busv1alpha1.OutOfSyncEvent,
- Action: busv1alpha1.SyncQueueAction,
- }
-
- c.enqueue(req)
-}
-
-func (c *queuecontroller) updatePodGroup(old, new interface{}) {
- oldPG := old.(*schedulingv1beta1.PodGroup)
- newPG := new.(*schedulingv1beta1.PodGroup)
-
-	// Note: we have no use case for updating PodGroup.Spec.Queue,
-	// so it is not considered here.
- if oldPG.Status.Phase != newPG.Status.Phase {
- c.addPodGroup(newPG)
- }
-}
-
-func (c *queuecontroller) deletePodGroup(obj interface{}) {
- pg, ok := obj.(*schedulingv1beta1.PodGroup)
- if !ok {
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v.", obj)
- return
- }
- pg, ok = tombstone.Obj.(*schedulingv1beta1.PodGroup)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a PodGroup: %#v.", obj)
- return
- }
- }
-
- key, _ := cache.MetaNamespaceKeyFunc(obj)
-
- c.pgMutex.Lock()
- defer c.pgMutex.Unlock()
-
- delete(c.podGroups[pg.Spec.Queue], key)
-
- req := &apis.Request{
- QueueName: pg.Spec.Queue,
-
- Event: busv1alpha1.OutOfSyncEvent,
- Action: busv1alpha1.SyncQueueAction,
- }
-
- c.enqueue(req)
-}
-
-func (c *queuecontroller) addCommand(obj interface{}) {
- cmd, ok := obj.(*busv1alpha1.Command)
- if !ok {
- klog.Errorf("Obj %v is not command.", obj)
- return
- }
-
- c.commandQueue.Add(cmd)
-}
-
-func (c *queuecontroller) getPodGroups(key string) []string {
- c.pgMutex.RLock()
- defer c.pgMutex.RUnlock()
-
- if c.podGroups[key] == nil {
- return nil
- }
- podGroups := make([]string, 0, len(c.podGroups[key]))
- for pgKey := range c.podGroups[key] {
- podGroups = append(podGroups, pgKey)
- }
-
- return podGroups
-}
-
-func (c *queuecontroller) recordEventsForQueue(name, eventType, reason, message string) {
- queue, err := c.queueLister.Get(name)
- if err != nil {
- klog.Errorf("Get queue %s failed for %v.", name, err)
- return
- }
-
- c.recorder.Event(queue, eventType, reason, message)
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-)
-
-// IsQueueReference returns whether the ownerReference refers to a Queue.
-func IsQueueReference(ref *metav1.OwnerReference) bool {
- if ref == nil {
- return false
- }
-
- if ref.APIVersion != schedulingv1beta1.SchemeGroupVersion.String() {
- return false
- }
-
- if ref.Kind != "Queue" {
- return false
- }
-
- return true
-}
-
-
-
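
For reference, `IsQueueReference` above accepts only owner references whose APIVersion is the scheduling v1beta1 group/version and whose Kind is `Queue`. A minimal usage sketch follows; the `volcano.sh/volcano/pkg/controllers/queue` import path is assumed from the package name and is not shown in this diff.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
	queuecontroller "volcano.sh/volcano/pkg/controllers/queue" // assumed import path, not confirmed by this diff
)

func main() {
	// An owner reference pointing at a scheduling v1beta1 Queue object.
	ref := &metav1.OwnerReference{
		APIVersion: schedulingv1beta1.SchemeGroupVersion.String(),
		Kind:       "Queue",
		Name:       "default",
	}
	// True: both the API group/version and the Kind match.
	fmt.Println(queuecontroller.IsQueueReference(ref))

	// False: a Deployment owner is not a Queue.
	fmt.Println(queuecontroller.IsQueueReference(&metav1.OwnerReference{
		APIVersion: "apps/v1",
		Kind:       "Deployment",
		Name:       "web",
	}))
}
```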
/*
- Copyright 2021 The Volcano Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package allocate
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-var targetJob = util.Reservation.TargetJob
-
-type Action struct{}
-
-func New() *Action {
- return &Action{}
-}
-
-func (alloc *Action) Name() string {
- return "allocate"
-}
-
-func (alloc *Action) Initialize() {}
-
-func (alloc *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Allocate ...")
- defer klog.V(3).Infof("Leaving Allocate ...")
-
-	// The allocation for a pod may go through several stages:
-	// 1. pick a namespace named N (using ssn.NamespaceOrderFn)
-	// 2. pick a queue named Q from N (using ssn.QueueOrderFn)
-	// 3. pick a job named J from Q (using ssn.JobOrderFn)
-	// 4. pick a task T from J (using ssn.TaskOrderFn)
-	// 5. use predicateFn to filter out nodes that T cannot be allocated on
-	// 6. use ssn.NodeOrderFn to score candidate nodes and assign the best one to T
-
- namespaces := util.NewPriorityQueue(ssn.NamespaceOrderFn)
-
- // jobsMap is map[api.NamespaceName]map[api.QueueID]PriorityQueue(*api.JobInfo)
- // used to find job with highest priority in given queue and namespace
- jobsMap := map[api.NamespaceName]map[api.QueueID]*util.PriorityQueue{}
-
- for _, job := range ssn.Jobs {
- if job.IsPending() {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: job status is pending.",
- job.Namespace, job.Name, job.Queue)
- continue
- }
- if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
- continue
- }
-
- if _, found := ssn.Queues[job.Queue]; !found {
- klog.Warningf("Skip adding Job <%s/%s> because its queue %s is not found",
- job.Namespace, job.Name, job.Queue)
- continue
- }
-
- namespace := api.NamespaceName(job.Namespace)
- queueMap, found := jobsMap[namespace]
- if !found {
- namespaces.Push(namespace)
-
- queueMap = make(map[api.QueueID]*util.PriorityQueue)
- jobsMap[namespace] = queueMap
- }
-
- jobs, found := queueMap[job.Queue]
- if !found {
- jobs = util.NewPriorityQueue(ssn.JobOrderFn)
- queueMap[job.Queue] = jobs
- }
-
- klog.V(4).Infof("Added Job <%s/%s> into Queue <%s>", job.Namespace, job.Name, job.Queue)
- jobs.Push(job)
- }
-
- klog.V(3).Infof("Try to allocate resource to %d Namespaces", len(jobsMap))
-
- pendingTasks := map[api.JobID]*util.PriorityQueue{}
-
- allNodes := ssn.NodeList
- unlockedNodes := allNodes
- if targetJob != nil && len(util.Reservation.LockedNodes) != 0 {
-		// Use a fresh slice so that the appends below do not overwrite the backing array of allNodes.
-		unlockedNodes = nil
- for _, node := range allNodes {
- if _, exist := util.Reservation.LockedNodes[node.Name]; !exist {
- unlockedNodes = append(unlockedNodes, node)
- }
- }
- }
- for _, unlockedNode := range unlockedNodes {
- klog.V(4).Infof("unlockedNode ID: %s, Name: %s", unlockedNode.Node.UID, unlockedNode.Node.Name)
- }
- predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) error {
- // Check for Resource Predicate
- if !task.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- return api.NewFitError(task, node, api.NodeResourceFitFailed)
- }
-
- return ssn.PredicateFn(task, node)
- }
-
-	// To pick the <namespace, queue> tuple for a job, pick the namespace first,
-	// because the number of queues is usually smaller than the number of namespaces.
-	// This also keeps resource usage balanced across namespaces.
- for {
- if namespaces.Empty() {
- break
- }
-
- // pick namespace from namespaces PriorityQueue
- namespace := namespaces.Pop().(api.NamespaceName)
-
- queueInNamespace := jobsMap[namespace]
-
-		// pick a queue for the given namespace
-		//
-		// This block uses an algorithm with O(n) time complexity.
-		// A PriorityQueue cannot be used here, because allocating a job changes the
-		// priority of its queue across all namespaces, and the PriorityQueue cannot
-		// update the priority of a specific queue in place.
- var queue *api.QueueInfo
- for queueID := range queueInNamespace {
- currentQueue := ssn.Queues[queueID]
- if ssn.Overused(currentQueue) {
- klog.V(3).Infof("Namespace <%s> Queue <%s> is overused, ignore it.", namespace, currentQueue.Name)
- delete(queueInNamespace, queueID)
- continue
- }
- if jobs, found := queueInNamespace[currentQueue.UID]; found && jobs.Empty() {
- continue
- }
-
- if queue == nil || ssn.QueueOrderFn(currentQueue, queue) {
- queue = currentQueue
- }
- }
-
- if queue == nil {
- klog.V(3).Infof("Namespace <%s> have no queue, skip it", namespace)
- continue
- }
-
- klog.V(3).Infof("Try to allocate resource to Jobs in Namespace <%s> Queue <%v>", namespace, queue.Name)
-
- jobs, found := queueInNamespace[queue.UID]
- if !found || jobs.Empty() {
- delete(queueInNamespace, queue.UID)
- namespaces.Push(namespace)
- klog.V(4).Infof("Can not find jobs for queue %s.", queue.Name)
- continue
- }
-
- job := jobs.Pop().(*api.JobInfo)
- var nodes []*api.NodeInfo
- if targetJob != nil && job.UID == targetJob.UID {
- klog.V(4).Infof("Try to allocate resource to target job: %s", job.Name)
- nodes = allNodes
- } else {
- nodes = unlockedNodes
- }
- if _, found = pendingTasks[job.UID]; !found {
- tasks := util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- // Skip BestEffort task in 'allocate' action.
- if task.Resreq.IsEmpty() {
- klog.V(4).Infof("Task <%v/%v> is BestEffort task, skip it.",
- task.Namespace, task.Name)
- continue
- }
-
- tasks.Push(task)
- }
- pendingTasks[job.UID] = tasks
- }
- tasks := pendingTasks[job.UID]
-
- klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>",
- tasks.Len(), job.Namespace, job.Name)
-
- stmt := framework.NewStatement(ssn)
-
- for !tasks.Empty() {
- task := tasks.Pop().(*api.TaskInfo)
-
-			// Check whether the queue is overused on the dimensions that the task requested
- taskRequest := task.Resreq.ResourceNames()
- if underusedResources := ssn.UnderusedResources(queue); underusedResources != nil && !underusedResources.Contains(taskRequest) {
- klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
- continue
- }
-
- klog.V(3).Infof("There are <%d> nodes for Job <%v/%v>", len(nodes), job.Namespace, job.Name)
-
- predicateNodes, fitErrors := util.PredicateNodes(task, nodes, predicateFn)
- if len(predicateNodes) == 0 {
- job.NodesFitErrors[task.UID] = fitErrors
- break
- }
-
- var candidateNodes []*api.NodeInfo
- for _, n := range predicateNodes {
- if task.InitResreq.LessEqual(n.Idle, api.Zero) || task.InitResreq.LessEqual(n.FutureIdle(), api.Zero) {
- candidateNodes = append(candidateNodes, n)
- }
- }
-
-			// If there are no candidate nodes for this task, skip it.
- if len(candidateNodes) == 0 {
- continue
- }
-
- nodeScores := util.PrioritizeNodes(task, candidateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
-
- node := ssn.BestNodeFn(task, nodeScores)
- if node == nil {
- node = util.SelectBestNode(nodeScores)
- }
-
- // Allocate idle resource to the task.
- if task.InitResreq.LessEqual(node.Idle, api.Zero) {
- klog.V(3).Infof("Binding Task <%v/%v> to node <%v>",
- task.Namespace, task.Name, node.Name)
- if err := stmt.Allocate(task, node); err != nil {
- klog.Errorf("Failed to bind Task %v on %v in Session %v, err: %v",
- task.UID, node.Name, ssn.UID, err)
- } else {
- metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
- }
- } else {
- klog.V(3).Infof("Predicates failed for task <%s/%s> on node <%s> with limited resources",
- task.Namespace, task.Name, node.Name)
-
- // Allocate releasing resource to the task if any.
- if task.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- klog.V(3).Infof("Pipelining Task <%v/%v> to node <%v> for <%v> on <%v>",
- task.Namespace, task.Name, node.Name, task.InitResreq, node.Releasing)
- if err := stmt.Pipeline(task, node.Name); err != nil {
- klog.Errorf("Failed to pipeline Task %v on %v in Session %v for %v.",
- task.UID, node.Name, ssn.UID, err)
- } else {
- metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
- }
- }
- }
-
- if ssn.JobReady(job) && !tasks.Empty() {
- jobs.Push(job)
- break
- }
- }
-
- if ssn.JobReady(job) {
- stmt.Commit()
- } else {
- if !ssn.JobPipelined(job) {
- stmt.Discard()
- }
- }
-
-		// Push the namespace back until it has no jobs left.
- namespaces.Push(namespace)
- }
-}
-
-func (alloc *Action) UnInitialize() {}
-
-
-
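
The allocate action above repeatedly pops namespaces, jobs, and tasks from `util.PriorityQueue` instances ordered by the session's order functions. Below is a minimal sketch of that ordering pattern, assuming only the PriorityQueue API exercised above (`NewPriorityQueue`, `Push`, `Pop`, `Empty`) and a toy comparator standing in for `ssn.JobOrderFn`.

```go
package main

import (
	"fmt"

	"volcano.sh/volcano/pkg/scheduler/util"
)

// item is a toy element ordered by priority, standing in for JobInfo/TaskInfo.
type item struct {
	name     string
	priority int
}

func main() {
	// Higher priority pops first, mirroring how JobOrderFn/TaskOrderFn are used above.
	less := func(l, r interface{}) bool {
		return l.(*item).priority > r.(*item).priority
	}

	pq := util.NewPriorityQueue(less)
	pq.Push(&item{name: "job-a", priority: 1})
	pq.Push(&item{name: "job-b", priority: 5})
	pq.Push(&item{name: "job-c", priority: 3})

	// Pops in priority order: job-b, job-c, job-a.
	for !pq.Empty() {
		fmt.Println(pq.Pop().(*item).name)
	}
}
```

The same inverted-comparator trick appears in the preempt action below, where `!ssn.TaskOrderFn(l, r)` is used so that the lowest-priority victim is popped first.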
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package preempt
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-type Action struct{}
-
-func New() *Action {
- return &Action{}
-}
-
-func (alloc *Action) Name() string {
- return "preempt"
-}
-
-func (alloc *Action) Initialize() {}
-
-func (alloc *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Preempt ...")
- defer klog.V(3).Infof("Leaving Preempt ...")
-
- preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
- preemptorTasks := map[api.JobID]*util.PriorityQueue{}
-
- var underRequest []*api.JobInfo
- queues := map[api.QueueID]*api.QueueInfo{}
-
- for _, job := range ssn.Jobs {
- if job.IsPending() {
- continue
- }
-
- if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip preemption, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
- continue
- }
-
- if queue, found := ssn.Queues[job.Queue]; !found {
- continue
- } else if _, existed := queues[queue.UID]; !existed {
- klog.V(3).Infof("Added Queue <%s> for Job <%s/%s>",
- queue.Name, job.Namespace, job.Name)
- queues[queue.UID] = queue
- }
-
-		// Check whether the job is starving for more resources.
- if ssn.JobStarving(job) {
- if _, found := preemptorsMap[job.Queue]; !found {
- preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
- }
- preemptorsMap[job.Queue].Push(job)
- underRequest = append(underRequest, job)
- preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- preemptorTasks[job.UID].Push(task)
- }
- }
- }
-
- // Preemption between Jobs within Queue.
- for _, queue := range queues {
- for {
- preemptors := preemptorsMap[queue.UID]
-
- // If no preemptors, no preemption.
- if preemptors == nil || preemptors.Empty() {
- klog.V(4).Infof("No preemptors in Queue <%s>, break.", queue.Name)
- break
- }
-
- preemptorJob := preemptors.Pop().(*api.JobInfo)
-
- stmt := framework.NewStatement(ssn)
- assigned := false
- for {
-				// If the job no longer requests more resources, stop preempting.
- if !ssn.JobStarving(preemptorJob) {
- break
- }
-
-				// If there are no preemptor tasks, move to the next job.
- if preemptorTasks[preemptorJob.UID].Empty() {
- klog.V(3).Infof("No preemptor task in job <%s/%s>.",
- preemptorJob.Namespace, preemptorJob.Name)
- break
- }
-
- preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo)
-
- if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
-					// Ignore non-running tasks.
- if task.Status != api.Running {
- return false
- }
- // Ignore task with empty resource request.
- if task.Resreq.IsEmpty() {
- return false
- }
- job, found := ssn.Jobs[task.Job]
- if !found {
- return false
- }
- // Preempt other jobs within queue
- return job.Queue == preemptorJob.Queue && preemptor.Job != task.Job
- }); preempted {
- assigned = true
- }
- }
-
- // Commit changes only if job is pipelined, otherwise try next job.
- if ssn.JobPipelined(preemptorJob) {
- stmt.Commit()
- } else {
- stmt.Discard()
- continue
- }
-
- if assigned {
- preemptors.Push(preemptorJob)
- }
- }
-
- // Preemption between Task within Job.
- for _, job := range underRequest {
-			// Fix: preemptor task counts were lost when preempting within the same job.
- preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- preemptorTasks[job.UID].Push(task)
- }
- for {
- if _, found := preemptorTasks[job.UID]; !found {
- break
- }
-
- if preemptorTasks[job.UID].Empty() {
- break
- }
-
- preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo)
-
- stmt := framework.NewStatement(ssn)
- assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
-					// Ignore non-running tasks.
- if task.Status != api.Running {
- return false
- }
- // Ignore task with empty resource request.
- if task.Resreq.IsEmpty() {
- return false
- }
- // Preempt tasks within job.
- return preemptor.Job == task.Job
- })
- stmt.Commit()
-
- // If no preemption, next job.
- if !assigned {
- break
- }
- }
- }
- }
-
- // call victimTasksFn to evict tasks
- victimTasks(ssn)
-}
-
-func (alloc *Action) UnInitialize() {}
-
-func preempt(
- ssn *framework.Session,
- stmt *framework.Statement,
- preemptor *api.TaskInfo,
- filter func(*api.TaskInfo) bool,
-) (bool, error) {
- assigned := false
-
- allNodes := ssn.NodeList
-
- predicateNodes, _ := util.PredicateNodes(preemptor, allNodes, ssn.PredicateFn)
-
- nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
-
- selectedNodes := util.SortNodes(nodeScores)
- for _, node := range selectedNodes {
- klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
- preemptor.Namespace, preemptor.Name, node.Name)
-
- var preemptees []*api.TaskInfo
- for _, task := range node.Tasks {
- if filter == nil {
- preemptees = append(preemptees, task.Clone())
- } else if filter(task) {
- preemptees = append(preemptees, task.Clone())
- }
- }
- victims := ssn.Preemptable(preemptor, preemptees)
- metrics.UpdatePreemptionVictimsCount(len(victims))
-
- if err := util.ValidateVictims(preemptor, node, victims); err != nil {
- klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err)
- continue
- }
-
- victimsQueue := util.NewPriorityQueue(func(l, r interface{}) bool {
- return !ssn.TaskOrderFn(l, r)
- })
- for _, victim := range victims {
- victimsQueue.Push(victim)
- }
- // Preempt victims for tasks, pick lowest priority task first.
- preempted := api.EmptyResource()
-
- for !victimsQueue.Empty() {
- // If reclaimed enough resources, break loop to avoid Sub panic.
- if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- break
- }
- preemptee := victimsQueue.Pop().(*api.TaskInfo)
- klog.V(3).Infof("Try to preempt Task <%s/%s> for Tasks <%s/%s>",
- preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name)
- if err := stmt.Evict(preemptee, "preempt"); err != nil {
- klog.Errorf("Failed to preempt Task <%s/%s> for Tasks <%s/%s>: %v",
- preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err)
- continue
- }
- preempted.Add(preemptee.Resreq)
- }
-
- metrics.RegisterPreemptionAttempts()
- klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.",
- preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq)
-
- if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- if err := stmt.Pipeline(preemptor, node.Name); err != nil {
- klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
- preemptor.Namespace, preemptor.Name, node.Name)
- }
-
- // Ignore pipeline error, will be corrected in next scheduling loop.
- assigned = true
-
- break
- }
- }
-
- return assigned, nil
-}
-
-func victimTasks(ssn *framework.Session) {
- stmt := framework.NewStatement(ssn)
- victimTasks := ssn.VictimTasks()
- for _, victim := range victimTasks {
- if err := stmt.Evict(victim.Clone(), "evict"); err != nil {
- klog.Errorf("Failed to evict Task <%s/%s>: %v",
- victim.Namespace, victim.Name, err)
- continue
- }
- }
- stmt.Commit()
-}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package reclaim
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-type Action struct{}
-
-func New() *Action {
- return &Action{}
-}
-
-func (ra *Action) Name() string {
- return "reclaim"
-}
-
-func (ra *Action) Initialize() {}
-
-func (ra *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Reclaim ...")
- defer klog.V(3).Infof("Leaving Reclaim ...")
-
- queues := util.NewPriorityQueue(ssn.QueueOrderFn)
- queueMap := map[api.QueueID]*api.QueueInfo{}
-
- preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
- preemptorTasks := map[api.JobID]*util.PriorityQueue{}
-
- klog.V(3).Infof("There are <%d> Jobs and <%d> Queues in total for scheduling.",
- len(ssn.Jobs), len(ssn.Queues))
-
- for _, job := range ssn.Jobs {
- if job.IsPending() {
- continue
- }
-
- if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip reclaim, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
- continue
- }
-
- if queue, found := ssn.Queues[job.Queue]; !found {
- klog.Errorf("Failed to find Queue <%s> for Job <%s/%s>",
- job.Queue, job.Namespace, job.Name)
- continue
- } else if _, existed := queueMap[queue.UID]; !existed {
- klog.V(4).Infof("Added Queue <%s> for Job <%s/%s>", queue.Name, job.Namespace, job.Name)
- queueMap[queue.UID] = queue
- queues.Push(queue)
- }
-
- if len(job.TaskStatusIndex[api.Pending]) != 0 {
- if _, found := preemptorsMap[job.Queue]; !found {
- preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
- }
- preemptorsMap[job.Queue].Push(job)
- preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- preemptorTasks[job.UID].Push(task)
- }
- }
- }
-
- for {
- // If no queues, break
- if queues.Empty() {
- break
- }
-
- var job *api.JobInfo
- var task *api.TaskInfo
-
- queue := queues.Pop().(*api.QueueInfo)
- if ssn.Overused(queue) {
- klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name)
- continue
- }
-
- // Found "high" priority job
- jobs, found := preemptorsMap[queue.UID]
- if !found || jobs.Empty() {
- continue
- } else {
- job = jobs.Pop().(*api.JobInfo)
- }
-
- // Found "high" priority task to reclaim others
- if tasks, found := preemptorTasks[job.UID]; !found || tasks.Empty() {
- continue
- } else {
- task = tasks.Pop().(*api.TaskInfo)
- }
-
-		// Check whether the queue is overused on the dimensions that the task requested
- taskRequest := task.Resreq.ResourceNames()
- if underusedResources := ssn.UnderusedResources(queue); underusedResources != nil && !underusedResources.Contains(taskRequest) {
- klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
- continue
- }
-
- assigned := false
- for _, n := range ssn.Nodes {
- // If predicates failed, next node.
- if err := ssn.PredicateFn(task, n); err != nil {
- continue
- }
-
- klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
- task.Namespace, task.Name, n.Name)
-
- var reclaimees []*api.TaskInfo
- for _, task := range n.Tasks {
-				// Ignore non-running tasks.
- if task.Status != api.Running {
- continue
- }
-
- if j, found := ssn.Jobs[task.Job]; !found {
- continue
- } else if j.Queue != job.Queue {
- q := ssn.Queues[j.Queue]
- if !q.Reclaimable() {
- continue
- }
-					// Clone the task to avoid modifying its status on the node.
- reclaimees = append(reclaimees, task.Clone())
- }
- }
- victims := ssn.Reclaimable(task, reclaimees)
-
- if err := util.ValidateVictims(task, n, victims); err != nil {
- klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err)
- continue
- }
-
- resreq := task.InitResreq.Clone()
- reclaimed := api.EmptyResource()
-
- // Reclaim victims for tasks.
- for _, reclaimee := range victims {
-				klog.V(3).Infof("Try to reclaim Task <%s/%s> for Tasks <%s/%s>",
- reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name)
- if err := ssn.Evict(reclaimee, "reclaim"); err != nil {
- klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v",
- reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err)
- continue
- }
- reclaimed.Add(reclaimee.Resreq)
- // If reclaimed enough resources, break loop to avoid Sub panic.
- if resreq.LessEqual(reclaimed, api.Zero) {
- break
- }
- }
-
- klog.V(3).Infof("Reclaimed <%v> for task <%s/%s> requested <%v>.",
- reclaimed, task.Namespace, task.Name, task.InitResreq)
-
- if task.InitResreq.LessEqual(reclaimed, api.Zero) {
- if err := ssn.Pipeline(task, n.Name); err != nil {
- klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
- task.Namespace, task.Name, n.Name)
- }
-
- // Ignore error of pipeline, will be corrected in next scheduling loop.
- assigned = true
-
- break
- }
- }
-
- if assigned {
- jobs.Push(job)
- }
- queues.Push(queue)
- }
-}
-
-func (ra *Action) UnInitialize() {
-}
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-)
-
-// ClusterInfo is a snapshot of the cluster taken by the scheduler cache.
-type ClusterInfo struct {
- Jobs map[JobID]*JobInfo
- Nodes map[string]*NodeInfo
- Queues map[QueueID]*QueueInfo
- NamespaceInfo map[NamespaceName]*NamespaceInfo
- RevocableNodes map[string]*NodeInfo
- NodeList []string
-}
-
-func (ci ClusterInfo) String() string {
- str := "Cache:\n"
-
- if len(ci.Nodes) != 0 {
- str += "Nodes:\n"
- for _, n := range ci.Nodes {
- str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
- n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
-
- i := 0
- for _, p := range n.Tasks {
- str += fmt.Sprintf("\t\t %d: %v\n", i, p)
- i++
- }
- }
- }
-
- if len(ci.Jobs) != 0 {
- str += "Jobs:\n"
- for _, job := range ci.Jobs {
- str += fmt.Sprintf("\t Job(%s) name(%s) minAvailable(%v)\n",
- job.UID, job.Name, job.MinAvailable)
-
- i := 0
- for _, task := range job.Tasks {
- str += fmt.Sprintf("\t\t %d: %v\n", i, task)
- i++
- }
- }
- }
-
- if len(ci.NamespaceInfo) != 0 {
- str += "Namespaces:\n"
- for _, ns := range ci.NamespaceInfo {
- str += fmt.Sprintf("\t Namespace(%s) Weight(%v)\n",
- ns.Name, ns.Weight)
- }
- }
-
- if len(ci.NodeList) != 0 {
- str += fmt.Sprintf("NodeList: %v\n", ci.NodeList)
- }
-
- return str
-}
-
-
-
/*
-Copyright 2020 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- v1 "k8s.io/api/core/v1"
-)
-
-// GPUDevice includes the GPU id, its memory, and the pods that are sharing it.
-type GPUDevice struct {
- // GPU ID
- ID int
- // The pods that are sharing this GPU
- PodMap map[string]*v1.Pod
- // memory per card
- Memory uint
-}
-
-// NewGPUDevice creates a device
-func NewGPUDevice(id int, mem uint) *GPUDevice {
- return &GPUDevice{
- ID: id,
- Memory: mem,
- PodMap: map[string]*v1.Pod{},
- }
-}
-
-// getUsedGPUMemory calculates the used memory of the device.
-func (g *GPUDevice) getUsedGPUMemory() uint {
- res := uint(0)
- for _, pod := range g.PodMap {
- if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
- continue
- } else {
- gpuRequest := GetGPUResourceOfPod(pod)
- res += gpuRequest
- }
- }
- return res
-}
-
-// GetGPUResourceOfPod returns the GPU resource required by the pod.
-func GetGPUResourceOfPod(pod *v1.Pod) uint {
- var mem uint
- for _, container := range pod.Spec.Containers {
- mem += getGPUResourceOfContainer(&container)
- }
- return mem
-}
-
-// getGPUResourceOfContainer returns the GPU resource required by the container.
-func getGPUResourceOfContainer(container *v1.Container) uint {
- var mem uint
- if val, ok := container.Resources.Limits[VolcanoGPUResource]; ok {
- mem = uint(val.Value())
- }
- return mem
-}
-
-
-
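
`GetGPUResourceOfPod` above simply sums the `VolcanoGPUResource` limit over all containers. A minimal sketch follows, assuming the `volcano.sh/volcano/pkg/scheduler/api` import path used elsewhere in this code and that the `VolcanoGPUResource` constant is exported from the same package.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"volcano.sh/volcano/pkg/scheduler/api"
)

func main() {
	// A pod with two containers; only one sets a limit on the Volcano GPU resource.
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name: "trainer",
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							api.VolcanoGPUResource: resource.MustParse("1024"),
						},
					},
				},
				{Name: "sidecar"},
			},
		},
	}

	// Prints 1024: the GPU limit summed over all containers.
	fmt.Println(api.GetGPUResourceOfPod(pod))
}
```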
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- clientcache "k8s.io/client-go/tools/cache"
-)
-
-// PodKey returns the string key of a pod.
-func PodKey(pod *v1.Pod) TaskID {
- key, err := clientcache.MetaNamespaceKeyFunc(pod)
- if err != nil {
- return TaskID(fmt.Sprintf("%v/%v", pod.Namespace, pod.Name))
- }
- return TaskID(key)
-}
-
-func getTaskStatus(pod *v1.Pod) TaskStatus {
- switch pod.Status.Phase {
- case v1.PodRunning:
- if pod.DeletionTimestamp != nil {
- return Releasing
- }
-
- return Running
- case v1.PodPending:
- if pod.DeletionTimestamp != nil {
- return Releasing
- }
-
- if len(pod.Spec.NodeName) == 0 {
- return Pending
- }
- return Bound
- case v1.PodUnknown:
- return Unknown
- case v1.PodSucceeded:
- return Succeeded
- case v1.PodFailed:
- return Failed
- }
-
- return Unknown
-}
-
-// AllocatedStatus checks whether the task status is an allocated status.
-func AllocatedStatus(status TaskStatus) bool {
- switch status {
- case Bound, Binding, Running, Allocated:
- return true
- default:
- return false
- }
-}
-
-// MergeErrors is used to merge multiple errors into a single error
-func MergeErrors(errs ...error) error {
- msg := "errors: "
-
- foundErr := false
- i := 1
-
- for _, e := range errs {
- if e != nil {
- if foundErr {
- msg = fmt.Sprintf("%s, %d: ", msg, i)
- } else {
- msg = fmt.Sprintf("%s %d: ", msg, i)
- }
-
- msg = fmt.Sprintf("%s%v", msg, e)
- foundErr = true
- i++
- }
- }
-
- if foundErr {
- return fmt.Errorf("%s", msg)
- }
-
- return nil
-}
-
-// JobTerminated checks whether job was terminated.
-func JobTerminated(job *JobInfo) bool {
- return job.PodGroup == nil && len(job.Tasks) == 0
-}
-
-
-
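
`MergeErrors` above folds several errors into one message, skipping nil entries and returning nil when every input is nil. A small sketch of that behavior, assuming the `volcano.sh/volcano/pkg/scheduler/api` import path:

```go
package main

import (
	"errors"
	"fmt"

	"volcano.sh/volcano/pkg/scheduler/api"
)

func main() {
	err := api.MergeErrors(
		errors.New("node n1: insufficient cpu"),
		nil, // nil entries are skipped
		errors.New("node n2: insufficient memory"),
	)
	// A single error whose message enumerates the non-nil inputs.
	fmt.Println(err)

	// All-nil input collapses to nil.
	fmt.Println(api.MergeErrors(nil, nil) == nil)
}
```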
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "errors"
- "fmt"
- "sort"
- "strconv"
- "strings"
- "time"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// DisruptionBudget defines the job's minimum-available and maximum-unavailable pod values.
-type DisruptionBudget struct {
- MinAvailable string
- MaxUnavilable string
-}
-
-// NewDisruptionBudget create disruption budget for job
-func NewDisruptionBudget(minAvailable, maxUnavilable string) *DisruptionBudget {
- disruptionBudget := &DisruptionBudget{
- MinAvailable: minAvailable,
- MaxUnavilable: maxUnavilable,
- }
- return disruptionBudget
-}
-
-// Clone return a clone of DisruptionBudget
-func (db *DisruptionBudget) Clone() *DisruptionBudget {
- return &DisruptionBudget{
- MinAvailable: db.MinAvailable,
- MaxUnavilable: db.MaxUnavilable,
- }
-}
-
-// JobWaitingTime is the maximum time a job may stay Pending under its service level agreement;
-// when a job waits longer than this, it should be enqueued at once and the cluster should reserve resources for it.
-const JobWaitingTime = "sla-waiting-time"
-
-// TaskID is UID type for Task
-type TaskID types.UID
-
-// TransactionContext holds all the fields that needed by scheduling transaction
-type TransactionContext struct {
- NodeName string
- Status TaskStatus
-}
-
-// Clone return a clone of TransactionContext
-func (ctx *TransactionContext) Clone() *TransactionContext {
- if ctx == nil {
- return nil
- }
- clone := *ctx
- return &clone
-}
-
-// TaskInfo will have all infos about the task
-type TaskInfo struct {
- UID TaskID
- Job JobID
-
- Name string
- Namespace string
-
-	// Resreq is the resource used while the task is running.
- Resreq *Resource
-	// InitResreq is the resource used to launch the task.
- InitResreq *Resource
-
- TransactionContext
- // LastTransaction holds the context of last scheduling transaction
- LastTransaction *TransactionContext
-
- Priority int32
- VolumeReady bool
- Preemptable bool
- BestEffort bool
-
-	// RevocableZone supports the volcano.sh/revocable-zone annotation or label on a pod/podgroup.
-	// Only an empty value or "*" is supported in this version; specifying a revocable zone name will be supported in a future release.
-	// An empty value means the workload cannot use revocable nodes.
-	// "*" means the workload can use all revocable nodes during the nodes' active revocable time.
- RevocableZone string
-
- TopologyPolicy string
- PodVolumes *volumescheduling.PodVolumes
- Pod *v1.Pod
-}
-
-func getJobID(pod *v1.Pod) JobID {
- if gn, found := pod.Annotations[v1beta1.KubeGroupNameAnnotationKey]; found && len(gn) != 0 {
- // Make sure Pod and PodGroup belong to the same namespace.
- jobID := fmt.Sprintf("%s/%s", pod.Namespace, gn)
- return JobID(jobID)
- }
-
- return ""
-}
-
-func getTaskID(pod *v1.Pod) TaskID {
- if ts, found := pod.Annotations[batch.TaskSpecKey]; found && len(ts) != 0 {
- return TaskID(ts)
- }
-
- return ""
-}
-
-// NewTaskInfo creates new taskInfo object for a Pod
-func NewTaskInfo(pod *v1.Pod) *TaskInfo {
- initResReq := GetPodResourceRequest(pod)
- resReq := initResReq
- bestEffort := initResReq.IsEmpty()
- preemptable := GetPodPreemptable(pod)
- revocableZone := GetPodRevocableZone(pod)
- topologyPolicy := GetPodTopologyPolicy(pod)
-
- jobID := getJobID(pod)
-
- ti := &TaskInfo{
- UID: TaskID(pod.UID),
- Job: jobID,
- Name: pod.Name,
- Namespace: pod.Namespace,
- Priority: 1,
- Pod: pod,
- Resreq: resReq,
- InitResreq: initResReq,
- Preemptable: preemptable,
- BestEffort: bestEffort,
- RevocableZone: revocableZone,
- TopologyPolicy: topologyPolicy,
-
- TransactionContext: TransactionContext{
- NodeName: pod.Spec.NodeName,
- Status: getTaskStatus(pod),
- },
- }
-
- if pod.Spec.Priority != nil {
- ti.Priority = *pod.Spec.Priority
- }
-
- return ti
-}
-
-// GetTransactionContext get transaction context of a task
-func (ti *TaskInfo) GetTransactionContext() TransactionContext {
- return ti.TransactionContext
-}
-
-// GenerateLastTxContext generate and set context of last transaction for a task
-func (ti *TaskInfo) GenerateLastTxContext() {
- ctx := ti.GetTransactionContext()
- ti.LastTransaction = &ctx
-}
-
-// ClearLastTxContext clear context of last transaction for a task
-func (ti *TaskInfo) ClearLastTxContext() {
- ti.LastTransaction = nil
-}
-
-// Clone is used for cloning a task
-func (ti *TaskInfo) Clone() *TaskInfo {
- return &TaskInfo{
- UID: ti.UID,
- Job: ti.Job,
- Name: ti.Name,
- Namespace: ti.Namespace,
- Priority: ti.Priority,
- PodVolumes: ti.PodVolumes,
- Pod: ti.Pod,
- Resreq: ti.Resreq.Clone(),
- InitResreq: ti.InitResreq.Clone(),
- VolumeReady: ti.VolumeReady,
- Preemptable: ti.Preemptable,
- BestEffort: ti.BestEffort,
- RevocableZone: ti.RevocableZone,
- TopologyPolicy: ti.TopologyPolicy,
-
- TransactionContext: TransactionContext{
- NodeName: ti.NodeName,
- Status: ti.Status,
- },
- LastTransaction: ti.LastTransaction.Clone(),
- }
-}
-
-// String returns the taskInfo details in a string
-func (ti TaskInfo) String() string {
-	return fmt.Sprintf("Task (%v:%v/%v): job %v, status %v, pri %v, "+
- "resreq %v, preemptable %v, revocableZone %v, TopologyPolicy %v",
- ti.UID, ti.Namespace, ti.Name, ti.Job, ti.Status, ti.Priority,
- ti.Resreq, ti.Preemptable, ti.RevocableZone, ti.TopologyPolicy)
-}
-
-// JobID is the type of JobInfo's ID.
-type JobID types.UID
-
-type tasksMap map[TaskID]*TaskInfo
-
-// NodeResourceMap stores resource in a node
-type NodeResourceMap map[string]*Resource
-
-// JobInfo will have all info of a Job
-type JobInfo struct {
- UID JobID
-
- Name string
- Namespace string
-
- Queue QueueID
-
- Priority int32
-
- MinAvailable int32
-
- WaitingTime *time.Duration
-
- JobFitErrors string
- NodesFitErrors map[TaskID]*FitErrors
-
- // All tasks of the Job.
- TaskStatusIndex map[TaskStatus]tasksMap
- Tasks tasksMap
- TaskMinAvailable map[TaskID]int32
- TaskMinAvailableTotal int32
-
- Allocated *Resource
- TotalRequest *Resource
-
- CreationTimestamp metav1.Time
- PodGroup *PodGroup
-
- ScheduleStartTimestamp metav1.Time
-
- Preemptable bool
-
-	// RevocableZone supports the volcano.sh/revocable-zone annotation or label on a pod/podgroup.
-	// Only an empty value or "*" is supported in this version; specifying a revocable zone name will be supported in a future release.
-	// An empty value means the workload cannot use revocable nodes.
-	// "*" means the workload can use all revocable nodes during the nodes' active revocable time.
- RevocableZone string
- Budget *DisruptionBudget
-}
-
-// NewJobInfo creates a new jobInfo for set of tasks
-func NewJobInfo(uid JobID, tasks ...*TaskInfo) *JobInfo {
- job := &JobInfo{
- UID: uid,
- MinAvailable: 0,
- NodesFitErrors: make(map[TaskID]*FitErrors),
- Allocated: EmptyResource(),
- TotalRequest: EmptyResource(),
- TaskStatusIndex: map[TaskStatus]tasksMap{},
- Tasks: tasksMap{},
- TaskMinAvailable: map[TaskID]int32{},
- }
-
- for _, task := range tasks {
- job.AddTaskInfo(task)
- }
-
- return job
-}
-
-// UnsetPodGroup removes podGroup details from a job
-func (ji *JobInfo) UnsetPodGroup() {
- ji.PodGroup = nil
-}
-
-// SetPodGroup sets podGroup details to a job
-func (ji *JobInfo) SetPodGroup(pg *PodGroup) {
- ji.Name = pg.Name
- ji.Namespace = pg.Namespace
- ji.MinAvailable = pg.Spec.MinMember
- ji.Queue = QueueID(pg.Spec.Queue)
- ji.CreationTimestamp = pg.GetCreationTimestamp()
-
- var err error
- ji.WaitingTime, err = ji.extractWaitingTime(pg)
- if err != nil {
- klog.Warningf("Error occurs in parsing waiting time for job <%s/%s>, err: %s.",
- pg.Namespace, pg.Name, err.Error())
- ji.WaitingTime = nil
- }
-
- ji.Preemptable = ji.extractPreemptable(pg)
- ji.RevocableZone = ji.extractRevocableZone(pg)
- ji.Budget = ji.extractBudget(pg)
-
- taskMinAvailableTotal := int32(0)
- for task, member := range pg.Spec.MinTaskMember {
- ji.TaskMinAvailable[TaskID(task)] = member
- taskMinAvailableTotal += member
- }
- ji.TaskMinAvailableTotal = taskMinAvailableTotal
-
- ji.PodGroup = pg
-}
-
-// extractWaitingTime reads sla waiting time for job from podgroup annotations
-// TODO: should also read from given field in volcano job spec
-func (ji *JobInfo) extractWaitingTime(pg *PodGroup) (*time.Duration, error) {
- if _, exist := pg.Annotations[JobWaitingTime]; !exist {
- return nil, nil
- }
-
- jobWaitingTime, err := time.ParseDuration(pg.Annotations[JobWaitingTime])
- if err != nil {
- return nil, err
- }
-
- if jobWaitingTime <= 0 {
- return nil, errors.New("invalid sla waiting time")
- }
-
- return &jobWaitingTime, nil
-}
-
-// extractPreemptable returns the volcano.sh/preemptable value for the job
-func (ji *JobInfo) extractPreemptable(pg *PodGroup) bool {
-	// check the annotation first
- if len(pg.Annotations) > 0 {
- if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
-	// if the annotation does not exist, check the label
- if len(pg.Labels) > 0 {
- if value, found := pg.Labels[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
- return false
-}
-
-// extractRevocableZone returns the volcano.sh/revocable-zone value for the pod/podgroup
-func (ji *JobInfo) extractRevocableZone(pg *PodGroup) string {
-	// check the annotation first
- if len(pg.Annotations) > 0 {
- if value, found := pg.Annotations[v1beta1.RevocableZone]; found {
- if value != "*" {
- return ""
- }
- return value
- }
-
- if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
- if b, err := strconv.ParseBool(value); err == nil && b {
- return "*"
- }
- }
- }
-
- return ""
-}
-
-// extractBudget returns the disruption budget value for the job
-func (ji *JobInfo) extractBudget(pg *PodGroup) *DisruptionBudget {
- if len(pg.Annotations) > 0 {
- if value, found := pg.Annotations[v1beta1.JDBMinAvailable]; found {
- return NewDisruptionBudget(value, "")
- } else if value, found := pg.Annotations[v1beta1.JDBMaxUnavailable]; found {
- return NewDisruptionBudget("", value)
- }
- }
-
- return NewDisruptionBudget("", "")
-}
-
-// GetMinResources returns the min resources of the podgroup.
-func (ji *JobInfo) GetMinResources() *Resource {
- if ji.PodGroup.Spec.MinResources == nil {
- return EmptyResource()
- }
-
- return NewResource(*ji.PodGroup.Spec.MinResources)
-}
-
-func (ji *JobInfo) addTaskIndex(ti *TaskInfo) {
- if _, found := ji.TaskStatusIndex[ti.Status]; !found {
- ji.TaskStatusIndex[ti.Status] = tasksMap{}
- }
- ji.TaskStatusIndex[ti.Status][ti.UID] = ti
-}
-
-// AddTaskInfo is used to add a task to a job
-func (ji *JobInfo) AddTaskInfo(ti *TaskInfo) {
- ji.Tasks[ti.UID] = ti
- ji.addTaskIndex(ti)
- ji.TotalRequest.Add(ti.Resreq)
- if AllocatedStatus(ti.Status) {
- ji.Allocated.Add(ti.Resreq)
- }
-}
-
-// UpdateTaskStatus is used to update a task's status within a job.
-// If an error occurs, both the task and the job are guaranteed to remain in their original state.
-func (ji *JobInfo) UpdateTaskStatus(task *TaskInfo, status TaskStatus) error {
- if err := validateStatusUpdate(task.Status, status); err != nil {
- return err
- }
-
-	// First remove the task (if it exists) from the task list.
- if _, found := ji.Tasks[task.UID]; found {
- if err := ji.DeleteTaskInfo(task); err != nil {
- return err
- }
- }
-
- // Update task's status to the target status once task addition is guaranteed to succeed.
- task.Status = status
- ji.AddTaskInfo(task)
-
- return nil
-}
-
-func (ji *JobInfo) deleteTaskIndex(ti *TaskInfo) {
- if tasks, found := ji.TaskStatusIndex[ti.Status]; found {
- delete(tasks, ti.UID)
-
- if len(tasks) == 0 {
- delete(ji.TaskStatusIndex, ti.Status)
- }
- }
-}
-
-// DeleteTaskInfo is used to delete a task from a job
-func (ji *JobInfo) DeleteTaskInfo(ti *TaskInfo) error {
- if task, found := ji.Tasks[ti.UID]; found {
- ji.TotalRequest.Sub(task.Resreq)
- if AllocatedStatus(task.Status) {
- ji.Allocated.Sub(task.Resreq)
- }
- delete(ji.Tasks, task.UID)
- ji.deleteTaskIndex(task)
- return nil
- }
-
- return fmt.Errorf("failed to find task <%v/%v> in job <%v/%v>",
- ti.Namespace, ti.Name, ji.Namespace, ji.Name)
-}
-
-// Clone is used to clone a jobInfo object
-func (ji *JobInfo) Clone() *JobInfo {
- info := &JobInfo{
- UID: ji.UID,
- Name: ji.Name,
- Namespace: ji.Namespace,
- Queue: ji.Queue,
- Priority: ji.Priority,
-
- MinAvailable: ji.MinAvailable,
- WaitingTime: ji.WaitingTime,
- JobFitErrors: ji.JobFitErrors,
- NodesFitErrors: make(map[TaskID]*FitErrors),
- Allocated: EmptyResource(),
- TotalRequest: EmptyResource(),
-
- PodGroup: ji.PodGroup,
-
- TaskStatusIndex: map[TaskStatus]tasksMap{},
- TaskMinAvailable: ji.TaskMinAvailable,
- TaskMinAvailableTotal: ji.TaskMinAvailableTotal,
- Tasks: tasksMap{},
- Preemptable: ji.Preemptable,
- RevocableZone: ji.RevocableZone,
- Budget: ji.Budget.Clone(),
- }
-
- ji.CreationTimestamp.DeepCopyInto(&info.CreationTimestamp)
-
- for _, task := range ji.Tasks {
- info.AddTaskInfo(task.Clone())
- }
-
- return info
-}
-
-// String returns a jobInfo object in string format
-func (ji JobInfo) String() string {
- res := ""
-
- i := 0
- for _, task := range ji.Tasks {
- res += fmt.Sprintf("\n\t %d: %v", i, task)
- i++
- }
-
-	return fmt.Sprintf("Job (%v): namespace %v (%v), name %v, minAvailable %d, podGroup %+v, preemptable %+v, revocableZone %+v, budgetMinAvailable %+v, budgetMaxUnavailable %+v",
- ji.UID, ji.Namespace, ji.Queue, ji.Name, ji.MinAvailable, ji.PodGroup, ji.Preemptable, ji.RevocableZone, ji.Budget.MinAvailable, ji.Budget.MaxUnavilable) + res
-}
-
-// FitError returns detailed information on why a job's task failed to fit on
-// each available node
-func (ji *JobInfo) FitError() string {
- sortReasonsHistogram := func(reasons map[string]int) []string {
- reasonStrings := []string{}
- for k, v := range reasons {
- reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
- }
- sort.Strings(reasonStrings)
- return reasonStrings
- }
-
- // Stat histogram for all tasks of the job
- reasons := make(map[string]int)
- for status, taskMap := range ji.TaskStatusIndex {
- reasons[status.String()] += len(taskMap)
- }
- reasons["minAvailable"] = int(ji.MinAvailable)
- reasonMsg := fmt.Sprintf("%v, %v", scheduling.PodGroupNotReady, strings.Join(sortReasonsHistogram(reasons), ", "))
-
- // Stat histogram for pending tasks only
- reasons = make(map[string]int)
- for uid := range ji.TaskStatusIndex[Pending] {
- reason, _ := ji.TaskSchedulingReason(uid)
- reasons[reason]++
- }
- if len(reasons) > 0 {
- reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "))
- }
- return reasonMsg
-}
-
-// TaskSchedulingReason get detailed reason and message of the given task
-// It returns detailed reason and message for tasks based on last scheduling transaction.
-func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string) {
- taskInfo, exists := ji.Tasks[tid]
- if !exists {
- return "", ""
- }
-
- // Get detailed scheduling reason based on LastTransaction
- ctx := taskInfo.GetTransactionContext()
- if taskInfo.LastTransaction != nil {
- ctx = *taskInfo.LastTransaction
- }
-
- msg = ji.JobFitErrors
- switch status := ctx.Status; status {
- case Allocated, Pipelined:
- // Pod is schedulable
- msg = fmt.Sprintf("Pod %s/%s can possibly be assigned to %s", taskInfo.Namespace, taskInfo.Name, ctx.NodeName)
- if status == Pipelined {
- msg += " once resource is released"
- }
- return PodReasonSchedulable, msg
- case Pending:
- if fe := ji.NodesFitErrors[tid]; fe != nil {
- // Pod is not schedulable
- return PodReasonUnschedulable, fe.Error()
- }
- // Pod is not scheduled yet
- return PodReasonUndetermined, msg
- default:
- return status.String(), msg
- }
-}
-
-// ReadyTaskNum returns the number of tasks that are ready or best-effort.
-func (ji *JobInfo) ReadyTaskNum() int32 {
- occupied := 0
- occupied += len(ji.TaskStatusIndex[Bound])
- occupied += len(ji.TaskStatusIndex[Binding])
- occupied += len(ji.TaskStatusIndex[Running])
- occupied += len(ji.TaskStatusIndex[Allocated])
- occupied += len(ji.TaskStatusIndex[Succeeded])
-
- if tasks, found := ji.TaskStatusIndex[Pending]; found {
- for _, task := range tasks {
- if task.BestEffort {
- occupied++
- }
- }
- }
-
- return int32(occupied)
-}
-
-// WaitingTaskNum returns the number of tasks that are pipelined.
-func (ji *JobInfo) WaitingTaskNum() int32 {
- return int32(len(ji.TaskStatusIndex[Pipelined]))
-}
-
-// CheckTaskMinAvailable returns whether each task of the job satisfies its minAvailable requirement.
-func (ji *JobInfo) CheckTaskMinAvailable() bool {
-	// if the job's minAvailable is less than the sum of the tasks' minAvailable, skip this check
- if ji.MinAvailable < ji.TaskMinAvailableTotal {
- return true
- }
-
- actual := map[TaskID]int32{}
- for status, tasks := range ji.TaskStatusIndex {
- if AllocatedStatus(status) ||
- status == Succeeded ||
- status == Pipelined ||
- status == Pending {
- for _, task := range tasks {
- actual[getTaskID(task.Pod)]++
- }
- }
- }
-
- klog.V(4).Infof("job %s/%s actual: %+v, ji.TaskMinAvailable: %+v", ji.Name, ji.Namespace, actual, ji.TaskMinAvailable)
- for task, minAvailable := range ji.TaskMinAvailable {
- if act, ok := actual[task]; !ok || act < minAvailable {
- return false
- }
- }
-
- return true
-}
-
-// ValidTaskNum returns the number of tasks that are valid.
-func (ji *JobInfo) ValidTaskNum() int32 {
- occupied := 0
- for status, tasks := range ji.TaskStatusIndex {
- if AllocatedStatus(status) ||
- status == Succeeded ||
- status == Pipelined ||
- status == Pending {
- occupied += len(tasks)
- }
- }
-
- return int32(occupied)
-}
-
-// Ready returns whether job is ready for run
-func (ji *JobInfo) Ready() bool {
- occupied := ji.ReadyTaskNum()
-
- return occupied >= ji.MinAvailable
-}
-
-// IsPending returns whether job is in pending status
-func (ji *JobInfo) IsPending() bool {
- if ji.PodGroup == nil || ji.PodGroup.Status.Phase == scheduling.PodGroupPending || ji.PodGroup.Status.Phase == "" {
- return true
- }
-
- return false
-}
-
-
-
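
To illustrate how task status feeds the JobInfo counters above, the sketch below fills in only the TaskInfo fields that `AddTaskInfo`, `ReadyTaskNum`, and `ValidTaskNum` read; it is a toy construction, since real tasks are built by `NewTaskInfo` from a Pod, and it assumes the `volcano.sh/volcano/pkg/scheduler/api` import path used elsewhere in this scheduler code.

```go
package main

import (
	"fmt"

	"volcano.sh/volcano/pkg/scheduler/api"
)

// toyTask builds a TaskInfo with just the fields the JobInfo bookkeeping reads.
func toyTask(name string, status api.TaskStatus) *api.TaskInfo {
	return &api.TaskInfo{
		UID:        api.TaskID(name),
		Name:       name,
		Namespace:  "default",
		Resreq:     api.EmptyResource(),
		InitResreq: api.EmptyResource(),
		TransactionContext: api.TransactionContext{
			Status: status,
		},
	}
}

func main() {
	job := api.NewJobInfo(
		api.JobID("default/demo"),
		toyTask("worker-0", api.Running),
		toyTask("worker-1", api.Running),
		toyTask("worker-2", api.Pending),
	)

	// Running tasks count as ready; the Pending task does not,
	// because its BestEffort flag is left false here.
	fmt.Println(job.ReadyTaskNum()) // 2
	fmt.Println(job.ValidTaskNum()) // 3: Pending still counts as a valid task
	fmt.Println(job.Ready())        // true: NewJobInfo leaves MinAvailable at 0
}
```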
/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-)
-
-// NamespaceName is name of namespace
-type NamespaceName string
-
-const (
- // NamespaceWeightKey is the key in ResourceQuota.spec.hard indicating the weight of this namespace
- NamespaceWeightKey = "volcano.sh/namespace.weight"
- // DefaultNamespaceWeight is the default weight of namespace
- DefaultNamespaceWeight = 1
-)
-
-// NamespaceInfo records information of namespace
-type NamespaceInfo struct {
- // Name is the name of this namespace
- Name NamespaceName
- // Weight is the highest weight among many ResourceQuota.
- Weight int64
-}
-
-// GetWeight returns the weight of a namespace; any invalid case gets the default value.
-func (n *NamespaceInfo) GetWeight() int64 {
- if n == nil || n.Weight == 0 {
- return DefaultNamespaceWeight
- }
- return n.Weight
-}
-
-type quotaItem struct {
- name string
- weight int64
-}
-
-func quotaItemKeyFunc(obj interface{}) (string, error) {
- item, ok := obj.(*quotaItem)
- if !ok {
- return "", fmt.Errorf("obj with type %T could not parse", obj)
- }
- return item.name, nil
-}
-
-// less function for a max-heap (the largest weight sits at the root)
-func quotaItemLessFunc(a interface{}, b interface{}) bool {
- A := a.(*quotaItem)
- B := b.(*quotaItem)
- return A.weight > B.weight
-}
-
-// NamespaceCollection will record all details about namespace
-type NamespaceCollection struct {
- Name string
-
- quotaWeight *cache.Heap
-}
-
-// NewNamespaceCollection creates new NamespaceCollection object to record all information about a namespace
-func NewNamespaceCollection(name string) *NamespaceCollection {
- n := &NamespaceCollection{
- Name: name,
- quotaWeight: cache.NewHeap(quotaItemKeyFunc, quotaItemLessFunc),
- }
-	// Add at least one item into quotaWeight, because cache.Heap.Pop
-	// blocks until the queue is non-empty.
- n.updateWeight("aItem{
- name: NamespaceWeightKey,
- weight: DefaultNamespaceWeight,
- })
- return n
-}
-
-func (n *NamespaceCollection) deleteWeight(q *quotaItem) {
- n.quotaWeight.Delete(q)
-}
-
-func (n *NamespaceCollection) updateWeight(q *quotaItem) {
- n.quotaWeight.Update(q)
-}
-
-func itemFromQuota(quota *v1.ResourceQuota) *quotaItem {
- var weight int64 = DefaultNamespaceWeight
-
- quotaWeight, ok := quota.Spec.Hard[NamespaceWeightKey]
- if ok {
- weight = quotaWeight.Value()
- }
-
-	item := &quotaItem{
- name: quota.Name,
- weight: weight,
- }
- return item
-}
-
-// Update modifies the registered information according to the quota object
-func (n *NamespaceCollection) Update(quota *v1.ResourceQuota) {
- n.updateWeight(itemFromQuota(quota))
-}
-
-// Delete removes the registered information according to the quota object
-func (n *NamespaceCollection) Delete(quota *v1.ResourceQuota) {
- n.deleteWeight(itemFromQuota(quota))
-}
-
-// Snapshot clones a NamespaceInfo (without the heap) from the NamespaceCollection
-func (n *NamespaceCollection) Snapshot() *NamespaceInfo {
- var weight int64 = DefaultNamespaceWeight
-
- obj, err := n.quotaWeight.Pop()
- if err != nil {
- klog.Warningf("namespace %s, quota weight meets error %v when pop", n.Name, err)
- } else {
- item := obj.(*quotaItem)
- weight = item.weight
- n.quotaWeight.Add(item)
- }
-
- return &NamespaceInfo{
- Name: NamespaceName(n.Name),
- Weight: weight,
- }
-}
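The NamespaceCollection above keeps every quota's weight in a max-heap, and `Snapshot` peeks the largest one. Below is a minimal, hypothetical sketch of that behaviour (not part of the deleted file; it assumes it compiles beside the `api` package shown above, and the quota and namespace names are invented):

```go
// Hypothetical sketch, assuming it lives beside the api package shown above.
package api

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func ExampleNamespaceCollection() {
	// quota builds a ResourceQuota carrying a namespace weight (names invented).
	quota := func(name, weight string) *v1.ResourceQuota {
		return &v1.ResourceQuota{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "team-a"},
			Spec: v1.ResourceQuotaSpec{
				Hard: v1.ResourceList{NamespaceWeightKey: resource.MustParse(weight)},
			},
		}
	}

	c := NewNamespaceCollection("team-a")
	c.Update(quota("q1", "3"))
	c.Update(quota("q2", "5")) // the largest weight wins

	info := c.Snapshot()
	fmt.Println(info.Name, info.GetWeight())
	// Output: team-a 5
}
```

Popping and immediately re-adding the top item is how `Snapshot` reads the maximum weight without mutating the heap.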
-
-
-
-/*
- Copyright 2021 The Volcano Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
- "strconv"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// NodeInfo is node level aggregated information.
-type NodeInfo struct {
- Name string
- Node *v1.Node
-
- // The state of node
- State NodeState
-
- // The releasing resource on that node
- Releasing *Resource
- // The pipelined resource on that node
- Pipelined *Resource
- // The idle resource on that node
- Idle *Resource
- // The used resource on that node, including running and terminating
- // pods
- Used *Resource
-
- Allocatable *Resource
- Capability *Resource
-
- Tasks map[TaskID]*TaskInfo
- NumaInfo *NumatopoInfo
- NumaChgFlag NumaChgFlag
- NumaSchedulerInfo *NumatopoInfo
- RevocableZone string
-
- // Used to store custom information
- Others map[string]interface{}
- GPUDevices map[int]*GPUDevice
-
- // enable node resource oversubscription
- OversubscriptionNode bool
-	// OfflineJobEvicting true means the node's resource usage is too high, so newly dispatched pods can not use oversubscription resources
- OfflineJobEvicting bool
-
- // Resource Oversubscription feature: the Oversubscription Resource reported in annotation
- OversubscriptionResource *Resource
-}
-
-// FutureIdle returns resources that will be idle in the future:
-//
-// That is current idle resources plus released resources minus pipelined resources.
-func (ni *NodeInfo) FutureIdle() *Resource {
- return ni.Idle.Clone().Add(ni.Releasing).Sub(ni.Pipelined)
-}
-
-// GetNodeAllocatable returns the node Allocatable without the OversubscriptionResource
-func (ni *NodeInfo) GetNodeAllocatable() *Resource {
- return NewResource(ni.Node.Status.Allocatable)
-}
-
-// NodeState defines the current state of node.
-type NodeState struct {
- Phase NodePhase
- Reason string
-}
-
-// NewNodeInfo is used to create new nodeInfo object
-func NewNodeInfo(node *v1.Node) *NodeInfo {
- nodeInfo := &NodeInfo{
- Releasing: EmptyResource(),
- Pipelined: EmptyResource(),
- Idle: EmptyResource(),
- Used: EmptyResource(),
-
- Allocatable: EmptyResource(),
- Capability: EmptyResource(),
-
- OversubscriptionResource: EmptyResource(),
- Tasks: make(map[TaskID]*TaskInfo),
-
- GPUDevices: make(map[int]*GPUDevice),
- }
-
- nodeInfo.setOversubscription(node)
-
- if node != nil {
- nodeInfo.Name = node.Name
- nodeInfo.Node = node
- nodeInfo.Idle = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
- nodeInfo.Allocatable = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
- nodeInfo.Capability = NewResource(node.Status.Capacity).Add(nodeInfo.OversubscriptionResource)
- }
- nodeInfo.setNodeGPUInfo(node)
- nodeInfo.setNodeState(node)
- nodeInfo.setRevocableZone(node)
-
- return nodeInfo
-}
-
-// RefreshNumaSchedulerInfoByCrd is used to update scheduler NUMA information based on the CRD numatopo
-func (ni *NodeInfo) RefreshNumaSchedulerInfoByCrd() {
- if ni.NumaInfo == nil {
- ni.NumaSchedulerInfo = nil
- return
- }
-
- tmp := ni.NumaInfo.DeepCopy()
- if ni.NumaChgFlag == NumaInfoMoreFlag {
- ni.NumaSchedulerInfo = tmp
- } else if ni.NumaChgFlag == NumaInfoLessFlag {
- numaResMap := ni.NumaSchedulerInfo.NumaResMap
- for resName, resInfo := range tmp.NumaResMap {
- klog.V(5).Infof("resource %s Allocatable : current %v new %v on node %s",
- resName, numaResMap[resName], resInfo, ni.Name)
- if numaResMap[resName].Allocatable.Size() >= resInfo.Allocatable.Size() {
- numaResMap[resName].Allocatable = resInfo.Allocatable.Clone()
- numaResMap[resName].Capacity = resInfo.Capacity
- }
- }
- }
-
- ni.NumaChgFlag = NumaInfoResetFlag
-}
-
-// Clone used to clone nodeInfo Object
-func (ni *NodeInfo) Clone() *NodeInfo {
- res := NewNodeInfo(ni.Node)
-
- for _, p := range ni.Tasks {
- res.AddTask(p)
- }
-
- if ni.NumaSchedulerInfo != nil {
- res.NumaSchedulerInfo = ni.NumaSchedulerInfo.DeepCopy()
- klog.V(5).Infof("node[%s]", ni.Name)
- for resName, resInfo := range res.NumaSchedulerInfo.NumaResMap {
- klog.V(5).Infof("current resource %s : %v", resName, resInfo)
- }
-
- klog.V(5).Infof("current Policies : %v", res.NumaSchedulerInfo.Policies)
- }
-
- res.Others = ni.Others
- return res
-}
-
-// Ready returns whether node is ready for scheduling
-func (ni *NodeInfo) Ready() bool {
- return ni.State.Phase == Ready
-}
-
-func (ni *NodeInfo) setRevocableZone(node *v1.Node) {
- if node == nil {
- klog.Warningf("the argument node is null.")
- return
- }
-
- revocableZone := ""
- if len(node.Labels) > 0 {
- if value, found := node.Labels[v1beta1.RevocableZone]; found {
- revocableZone = value
- }
- }
- ni.RevocableZone = revocableZone
-}
-
-// setOversubscription checks whether the node enables oversubscription and sets the oversubscription resources.
-// Only CPU and memory oversubscription are supported in this version.
-func (ni *NodeInfo) setOversubscription(node *v1.Node) {
- if node == nil {
- return
- }
-
- ni.OversubscriptionNode = false
- ni.OfflineJobEvicting = false
- if len(node.Labels) > 0 {
- if value, found := node.Labels[OversubscriptionNode]; found {
- b, err := strconv.ParseBool(value)
- if err == nil {
- ni.OversubscriptionNode = b
- } else {
- ni.OversubscriptionNode = false
- }
- klog.V(5).Infof("Set node %s Oversubscription to %v", node.Name, ni.OversubscriptionNode)
- }
- }
-
- if len(node.Annotations) > 0 {
- if value, found := node.Annotations[OfflineJobEvicting]; found {
- b, err := strconv.ParseBool(value)
- if err == nil {
- ni.OfflineJobEvicting = b
- } else {
- ni.OfflineJobEvicting = false
- }
- klog.V(5).Infof("Set node %s OfflineJobEvicting to %v", node.Name, ni.OfflineJobEvicting)
- }
- if value, found := node.Annotations[OversubscriptionCPU]; found {
- ni.OversubscriptionResource.MilliCPU, _ = strconv.ParseFloat(value, 64)
- klog.V(5).Infof("Set node %s Oversubscription CPU to %v", node.Name, ni.OversubscriptionResource.MilliCPU)
- }
- if value, found := node.Annotations[OversubscriptionMemory]; found {
- ni.OversubscriptionResource.Memory, _ = strconv.ParseFloat(value, 64)
- klog.V(5).Infof("Set node %s Oversubscription Memory to %v", node.Name, ni.OversubscriptionResource.Memory)
- }
- }
-}
-
-func (ni *NodeInfo) setNodeState(node *v1.Node) {
- // If node is nil, the node is un-initialized in cache
- if node == nil {
- ni.State = NodeState{
- Phase: NotReady,
- Reason: "UnInitialized",
- }
-		klog.Warningf("set the node %s status to %s for the reason UnInitialized.", ni.Name, NotReady.String())
- return
- }
-
- // set NodeState according to resources
- if !ni.Used.LessEqual(ni.Allocatable, Zero) {
- ni.State = NodeState{
- Phase: NotReady,
- Reason: "OutOfSync",
- }
- return
- }
-
- // If node not ready, e.g. power off
- for _, cond := range node.Status.Conditions {
- if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
- ni.State = NodeState{
- Phase: NotReady,
- Reason: "NotReady",
- }
- klog.Warningf("set the node %s status to %s.", node.Name, NotReady.String())
- return
- }
- }
-
- // Node is ready (ignore node conditions because of taint/toleration)
- ni.State = NodeState{
- Phase: Ready,
- Reason: "",
- }
-
- klog.V(4).Infof("set the node %s status to %s.", node.Name, Ready.String())
-}
-
-func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) {
- if node == nil {
- return
- }
- memory, ok := node.Status.Capacity[VolcanoGPUResource]
- if !ok {
- return
- }
- totalMemory := memory.Value()
-
- res, ok := node.Status.Capacity[VolcanoGPUNumber]
- if !ok {
- return
- }
- gpuNumber := res.Value()
- if gpuNumber == 0 {
- klog.Warningf("invalid %s=%s", VolcanoGPUNumber, res.String())
- return
- }
-
- memoryPerCard := uint(totalMemory / gpuNumber)
- for i := 0; i < int(gpuNumber); i++ {
- ni.GPUDevices[i] = NewGPUDevice(i, memoryPerCard)
- }
-}
-
-// SetNode sets kubernetes node object to nodeInfo object
-func (ni *NodeInfo) SetNode(node *v1.Node) {
- ni.setOversubscription(node)
- ni.setNodeState(node)
- ni.setNodeGPUInfo(node)
- ni.setRevocableZone(node)
-
- if !ni.Ready() {
- klog.Warningf("Failed to set node info, phase: %s, reason: %s",
- ni.State.Phase, ni.State.Reason)
- return
- }
-
- ni.Name = node.Name
- ni.Node = node
-
- ni.Allocatable = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
- ni.Capability = NewResource(node.Status.Capacity).Add(ni.OversubscriptionResource)
- ni.Releasing = EmptyResource()
- ni.Pipelined = EmptyResource()
- ni.Idle = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
- ni.Used = EmptyResource()
-
- for _, ti := range ni.Tasks {
- switch ti.Status {
- case Releasing:
- ni.Idle.Sub(ti.Resreq)
- ni.Releasing.Add(ti.Resreq)
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- case Pipelined:
- ni.Pipelined.Add(ti.Resreq)
- default:
- ni.Idle.Sub(ti.Resreq)
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- }
- }
-}
-
-func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) error {
- if ti.Resreq.LessEqual(ni.Idle, Zero) {
- ni.Idle.Sub(ti.Resreq)
- return nil
- }
-
- return fmt.Errorf("selected node NotReady")
-}
-
-// AddTask is used to add a task in nodeInfo object
-//
-// If error occurs both task and node are guaranteed to be in the original state.
-func (ni *NodeInfo) AddTask(task *TaskInfo) error {
- if len(task.NodeName) > 0 && len(ni.Name) > 0 && task.NodeName != ni.Name {
- return fmt.Errorf("task <%v/%v> already on different node <%v>",
- task.Namespace, task.Name, task.NodeName)
- }
-
- key := PodKey(task.Pod)
- if _, found := ni.Tasks[key]; found {
- return fmt.Errorf("task <%v/%v> already on node <%v>",
- task.Namespace, task.Name, ni.Name)
- }
-
- // Node will hold a copy of task to make sure the status
- // change will not impact resource in node.
- ti := task.Clone()
-
- if ni.Node != nil {
- switch ti.Status {
- case Releasing:
- if err := ni.allocateIdleResource(ti); err != nil {
- return err
- }
- ni.Releasing.Add(ti.Resreq)
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- case Pipelined:
- ni.Pipelined.Add(ti.Resreq)
- default:
- if err := ni.allocateIdleResource(ti); err != nil {
- return err
- }
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- }
- }
-
- // Update task node name upon successful task addition.
- task.NodeName = ni.Name
- ti.NodeName = ni.Name
- ni.Tasks[key] = ti
-
- return nil
-}
-
-// RemoveTask used to remove a task from nodeInfo object.
-//
-// If error occurs both task and node are guaranteed to be in the original state.
-func (ni *NodeInfo) RemoveTask(ti *TaskInfo) error {
- key := PodKey(ti.Pod)
-
- task, found := ni.Tasks[key]
- if !found {
- klog.Warningf("failed to find task <%v/%v> on host <%v>",
- ti.Namespace, ti.Name, ni.Name)
- return nil
- }
-
- if ni.Node != nil {
- switch task.Status {
- case Releasing:
- ni.Releasing.Sub(task.Resreq)
- ni.Idle.Add(task.Resreq)
- ni.Used.Sub(task.Resreq)
- ni.SubGPUResource(ti.Pod)
- case Pipelined:
- ni.Pipelined.Sub(task.Resreq)
- default:
- ni.Idle.Add(task.Resreq)
- ni.Used.Sub(task.Resreq)
- ni.SubGPUResource(ti.Pod)
- }
- }
-
- delete(ni.Tasks, key)
-
- return nil
-}
-
-// UpdateTask is used to update a task in nodeInfo object.
-//
-// If error occurs both task and node are guaranteed to be in the original state.
-func (ni *NodeInfo) UpdateTask(ti *TaskInfo) error {
- if err := ni.RemoveTask(ti); err != nil {
- return err
- }
-
- if err := ni.AddTask(ti); err != nil {
- // This should never happen if task removal was successful,
- // because only possible error during task addition is when task is still on a node.
- klog.Fatalf("Failed to add Task <%s,%s> to Node <%s> during task update",
- ti.Namespace, ti.Name, ni.Name)
- }
- return nil
-}
-
-// String returns nodeInfo details in string format
-func (ni NodeInfo) String() string {
- tasks := ""
-
- i := 0
- for _, task := range ni.Tasks {
- tasks += fmt.Sprintf("\n\t %d: %v", i, task)
- i++
- }
-
-	return fmt.Sprintf("Node (%s): allocatable <%v> idle <%v>, used <%v>, releasing <%v>, oversubscription <%v>, "+
-		"state <phase %s, reason %s>, oversubscriptionNode <%v>, offlineJobEvicting <%v>, taints <%v>%s",
- ni.Name, ni.Allocatable, ni.Idle, ni.Used, ni.Releasing, ni.OversubscriptionResource, ni.State.Phase, ni.State.Reason, ni.OversubscriptionNode, ni.OfflineJobEvicting, ni.Node.Spec.Taints, tasks)
-}
-
-// Pods returns all pods running in that node
-func (ni *NodeInfo) Pods() (pods []*v1.Pod) {
- for _, t := range ni.Tasks {
- pods = append(pods, t.Pod)
- }
-
- return
-}
-
-// GetDevicesIdleGPUMemory returns all the idle GPU memory by gpu card.
-func (ni *NodeInfo) GetDevicesIdleGPUMemory() map[int]uint {
- devicesAllGPUMemory := ni.getDevicesAllGPUMemory()
- devicesUsedGPUMemory := ni.getDevicesUsedGPUMemory()
- res := map[int]uint{}
- for id, allMemory := range devicesAllGPUMemory {
- if usedMemory, found := devicesUsedGPUMemory[id]; found {
- res[id] = allMemory - usedMemory
- } else {
- res[id] = allMemory
- }
- }
- return res
-}
-
-func (ni *NodeInfo) getDevicesUsedGPUMemory() map[int]uint {
- res := map[int]uint{}
- for _, device := range ni.GPUDevices {
- res[device.ID] = device.getUsedGPUMemory()
- }
- return res
-}
-
-func (ni *NodeInfo) getDevicesAllGPUMemory() map[int]uint {
- res := map[int]uint{}
- for _, device := range ni.GPUDevices {
- res[device.ID] = device.Memory
- }
- return res
-}
-
-// AddGPUResource adds the pod to GPU pool if it is assigned
-func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
- gpuRes := GetGPUResourceOfPod(pod)
- if gpuRes > 0 {
- id := GetGPUIndex(pod)
- if dev := ni.GPUDevices[id]; dev != nil {
- dev.PodMap[string(pod.UID)] = pod
- }
- }
-}
-
-// SubGPUResource frees the gpu hold by the pod
-func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) {
- gpuRes := GetGPUResourceOfPod(pod)
- if gpuRes > 0 {
- id := GetGPUIndex(pod)
- if dev := ni.GPUDevices[id]; dev != nil {
- delete(dev.PodMap, string(pod.UID))
- }
- }
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- v1 "k8s.io/api/core/v1"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
-
- nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
-)
-
-// NumaChgFlag indicate node numainfo changed status
-type NumaChgFlag int
-
-const (
- // NumaInfoResetFlag indicate reset operate
- NumaInfoResetFlag NumaChgFlag = 0b00
- // NumaInfoMoreFlag indicate the received allocatable resource is getting more
- NumaInfoMoreFlag NumaChgFlag = 0b11
- // NumaInfoLessFlag indicate the received allocatable resource is getting less
- NumaInfoLessFlag NumaChgFlag = 0b10
-)
-
-// ResourceInfo is the allocatable information for the resource
-type ResourceInfo struct {
- Allocatable cpuset.CPUSet
- Capacity int
-}
-
-// NumatopoInfo is the information about topology manager on the node
-type NumatopoInfo struct {
- Namespace string
- Name string
- Policies map[nodeinfov1alpha1.PolicyName]string
- NumaResMap map[string]*ResourceInfo
- CPUDetail topology.CPUDetails
- ResReserved v1.ResourceList
-}
-
-// DeepCopy used to copy NumatopoInfo
-func (info *NumatopoInfo) DeepCopy() *NumatopoInfo {
- numaInfo := &NumatopoInfo{
- Namespace: info.Namespace,
- Name: info.Name,
- Policies: make(map[nodeinfov1alpha1.PolicyName]string),
- NumaResMap: make(map[string]*ResourceInfo),
- CPUDetail: topology.CPUDetails{},
- ResReserved: make(v1.ResourceList),
- }
-
- policies := info.Policies
- for name, policy := range policies {
- numaInfo.Policies[name] = policy
- }
-
- for resName, resInfo := range info.NumaResMap {
- var tmpInfo ResourceInfo
- tmpInfo.Capacity = resInfo.Capacity
- tmpInfo.Allocatable = resInfo.Allocatable.Clone()
- numaInfo.NumaResMap[resName] = &tmpInfo
- }
-
- cpuDetail := info.CPUDetail
- for cpuID, detail := range cpuDetail {
- numaInfo.CPUDetail[cpuID] = detail
- }
-
- resReserved := info.ResReserved
- for resName, res := range resReserved {
- numaInfo.ResReserved[resName] = res
- }
-
- return numaInfo
-}
-
-// Compare is the function to show the change of the resource on kubelet
-// return val:
-// - true : the resource on kubelet is getting more or no change
-// - false : the resource on kubelet is getting less
-func (info *NumatopoInfo) Compare(newInfo *NumatopoInfo) bool {
- for resName := range info.NumaResMap {
- oldSize := info.NumaResMap[resName].Allocatable.Size()
- newSize := newInfo.NumaResMap[resName].Allocatable.Size()
- if oldSize <= newSize {
- return true
- }
- }
-
- return false
-}
-
-// Allocate is the function to remove the allocated resource
-func (info *NumatopoInfo) Allocate(resSets ResNumaSets) {
- for resName := range resSets {
- info.NumaResMap[resName].Allocatable = info.NumaResMap[resName].Allocatable.Difference(resSets[resName])
- }
-}
-
-// Release is the function to reclaim the allocated resource
-func (info *NumatopoInfo) Release(resSets ResNumaSets) {
- for resName := range resSets {
- info.NumaResMap[resName].Allocatable = info.NumaResMap[resName].Allocatable.Union(resSets[resName])
- }
-}
-
-// GenerateNodeResNumaSets returns the idle resource sets of all nodes
-func GenerateNodeResNumaSets(nodes map[string]*NodeInfo) map[string]ResNumaSets {
- nodeSlice := make(map[string]ResNumaSets)
- for _, node := range nodes {
- if node.NumaSchedulerInfo == nil {
- continue
- }
-
- resMaps := make(ResNumaSets)
- for resName, resMap := range node.NumaSchedulerInfo.NumaResMap {
- resMaps[resName] = resMap.Allocatable.Clone()
- }
-
- nodeSlice[node.Name] = resMaps
- }
-
- return nodeSlice
-}
-
-// GenerateNumaNodes returns the NUMA IDs of all nodes
-func GenerateNumaNodes(nodes map[string]*NodeInfo) map[string][]int {
- nodeNumaMap := make(map[string][]int)
-
- for _, node := range nodes {
- if node.NumaSchedulerInfo == nil {
- continue
- }
-
- nodeNumaMap[node.Name] = node.NumaSchedulerInfo.CPUDetail.NUMANodes().ToSlice()
- }
-
- return nodeNumaMap
-}
-
-// ResNumaSets is the set map of the resource
-type ResNumaSets map[string]cpuset.CPUSet
-
-// Allocate is to remove the allocated resource which is assigned to task
-func (resSets ResNumaSets) Allocate(taskSets ResNumaSets) {
- for resName := range taskSets {
- if _, ok := resSets[resName]; !ok {
- continue
- }
- resSets[resName] = resSets[resName].Difference(taskSets[resName])
- }
-}
-
-// Release is to reclaim the allocated resource which is assigned to task
-func (resSets ResNumaSets) Release(taskSets ResNumaSets) {
- for resName := range taskSets {
- if _, ok := resSets[resName]; !ok {
- continue
- }
- resSets[resName] = resSets[resName].Union(taskSets[resName])
- }
-}
-
-// Clone is the copy action
-func (resSets ResNumaSets) Clone() ResNumaSets {
- newSets := make(ResNumaSets)
- for resName := range resSets {
- newSets[resName] = resSets[resName].Clone()
- }
-
- return newSets
-}
-
-
-
-/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
- "strconv"
- "strings"
- "time"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// Refer k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/predicates.go#GetResourceRequest.
-//
-// GetResourceRequest returns a *Resource that covers the largest width in each resource dimension.
-// Because init-containers run sequentially, we collect the max in each dimension iteratively.
-// In contrast, we sum the resource vectors for regular containers since they run simultaneously.
-//
-// To be consistent with kubernetes default scheduler, it is only used for predicates of actions(e.g.
-// allocate, backfill, preempt, reclaim), please use GetPodResourceWithoutInitContainers for other cases.
-//
-// Example:
-//
-// Pod:
-// InitContainers
-// IC1:
-// CPU: 2
-// Memory: 1G
-// IC2:
-// CPU: 2
-// Memory: 3G
-// Containers
-// C1:
-// CPU: 2
-// Memory: 1G
-// C2:
-// CPU: 1
-// Memory: 1G
-//
-// Result: CPU: 3, Memory: 3G
-
-// GetPodResourceRequest returns all the resource required for that pod
-func GetPodResourceRequest(pod *v1.Pod) *Resource {
- result := GetPodResourceWithoutInitContainers(pod)
-
- // take max_resource(sum_pod, any_init_container)
- for _, container := range pod.Spec.InitContainers {
- result.SetMaxResource(NewResource(container.Resources.Requests))
- }
-
- return result
-}
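The comment above walks through the init-container rule with concrete numbers. Here is a minimal, hypothetical sketch (not part of the deleted file; it assumes it compiles beside the `api` package shown above, and the helper closure is invented) reproducing that arithmetic:

```go
// Hypothetical sketch, assuming it lives beside the api package shown above.
package api

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func ExampleGetPodResourceRequest() {
	// requests is a small helper for building container resource requests.
	requests := func(cpu, mem string) v1.ResourceRequirements {
		return v1.ResourceRequirements{Requests: v1.ResourceList{
			v1.ResourceCPU:    resource.MustParse(cpu),
			v1.ResourceMemory: resource.MustParse(mem),
		}}
	}

	pod := &v1.Pod{Spec: v1.PodSpec{
		InitContainers: []v1.Container{
			{Resources: requests("2", "1G")}, // IC1
			{Resources: requests("2", "3G")}, // IC2
		},
		Containers: []v1.Container{
			{Resources: requests("2", "1G")}, // C1
			{Resources: requests("1", "1G")}, // C2
		},
	}}

	// Regular containers are summed (cpu 3, memory 2G), then each init container
	// is folded in with SetMaxResource, which lifts memory to 3G.
	fmt.Println(GetPodResourceRequest(pod))
	// Output: cpu 3000.00, memory 3000000000.00
}
```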
-
-// GetPodPreemptable returns the volcano.sh/preemptable value for a pod
-func GetPodPreemptable(pod *v1.Pod) bool {
-	// check annotation first
- if len(pod.Annotations) > 0 {
- if value, found := pod.Annotations[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
-	// if the annotation does not exist, check the label
- if len(pod.Labels) > 0 {
- if value, found := pod.Labels[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
- return false
-}
-
-// GetPodRevocableZone returns the volcano.sh/revocable-zone value for a pod/podgroup
-func GetPodRevocableZone(pod *v1.Pod) string {
- if len(pod.Annotations) > 0 {
- if value, found := pod.Annotations[v1beta1.RevocableZone]; found {
- if value != "*" {
- return ""
- }
- return value
- }
-
- if value, found := pod.Annotations[v1beta1.PodPreemptable]; found {
- if b, err := strconv.ParseBool(value); err == nil && b {
- return "*"
- }
- }
- }
- return ""
-}
-
-// GetPodTopologyPolicy returns the volcano.sh/numa-topology-policy value for a pod
-func GetPodTopologyPolicy(pod *v1.Pod) string {
- if len(pod.Annotations) > 0 {
- if value, found := pod.Annotations[v1beta1.NumaPolicyKey]; found {
- return value
- }
- }
- return ""
-}
-
-// GetPodResourceWithoutInitContainers returns Pod's resource request, it does not contain
-// init containers' resource request.
-func GetPodResourceWithoutInitContainers(pod *v1.Pod) *Resource {
- result := EmptyResource()
- for _, container := range pod.Spec.Containers {
- result.Add(NewResource(container.Resources.Requests))
- }
-
- return result
-}
-
-// GetGPUIndex returns the ID of the GPU
-func GetGPUIndex(pod *v1.Pod) int {
- if len(pod.Annotations) > 0 {
- value, found := pod.Annotations[GPUIndex]
- if found {
- id, err := strconv.Atoi(value)
- if err != nil {
- klog.Errorf("invalid %s=%s", GPUIndex, value)
- return -1
- }
- return id
- }
- }
-
- return -1
-}
-
-func escapeJSONPointer(p string) string {
- // Escaping reference name using https://tools.ietf.org/html/rfc6901
- p = strings.Replace(p, "~", "~0", -1)
- p = strings.Replace(p, "/", "~1", -1)
- return p
-}
-
-// AddGPUIndexPatch returns the patch adding GPU index
-func AddGPUIndexPatch(id int) string {
- return fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
- `{"op": "add", "path": "/metadata/annotations/%s", "value": "%d"}]`,
- escapeJSONPointer(PredicateTime), time.Now().UnixNano(),
- escapeJSONPointer(GPUIndex), id)
-}
-
-// RemoveGPUIndexPatch returns the patch removing GPU index
-func RemoveGPUIndexPatch() string {
- return fmt.Sprintf(`[{"op": "remove", "path": "/metadata/annotations/%s"},`+
-		`{"op": "remove", "path": "/metadata/annotations/%s"}]`, escapeJSONPointer(PredicateTime), escapeJSONPointer(GPUIndex))
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "k8s.io/apimachinery/pkg/types"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// QueueID is UID type, serves as unique ID for each queue
-type QueueID types.UID
-
-// QueueInfo will have all details about queue
-type QueueInfo struct {
- UID QueueID
- Name string
-
- Weight int32
-
-	// Weights is a list of slash-separated float numbers.
-	// Each of them is a weight corresponding to the
- // hierarchy level.
- Weights string
- // Hierarchy is a list of node name along the
- // path from the root to the node itself.
- Hierarchy string
-
- Queue *scheduling.Queue
-}
-
-// NewQueueInfo creates a new QueueInfo object
-func NewQueueInfo(queue *scheduling.Queue) *QueueInfo {
- return &QueueInfo{
- UID: QueueID(queue.Name),
- Name: queue.Name,
-
- Weight: queue.Spec.Weight,
- Hierarchy: queue.Annotations[v1beta1.KubeHierarchyAnnotationKey],
- Weights: queue.Annotations[v1beta1.KubeHierarchyWeightAnnotationKey],
-
- Queue: queue,
- }
-}
-
-// Clone is used to clone queueInfo object
-func (q *QueueInfo) Clone() *QueueInfo {
- return &QueueInfo{
- UID: q.UID,
- Name: q.Name,
- Weight: q.Weight,
- Hierarchy: q.Hierarchy,
- Weights: q.Weights,
- Queue: q.Queue,
- }
-}
-
-// Reclaimable returns whether the queue is reclaimable
-func (q *QueueInfo) Reclaimable() bool {
- if q == nil {
- return false
- }
-
- if q.Queue == nil {
- return false
- }
-
- if q.Queue.Spec.Reclaimable == nil {
- return true
- }
-
- return *q.Queue.Spec.Reclaimable
-}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
- "math"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
-
- "volcano.sh/volcano/pkg/scheduler/util/assert"
-)
-
-const (
- // GPUResourceName need to follow https://github.com/NVIDIA/k8s-device-plugin/blob/66a35b71ac4b5cbfb04714678b548bd77e5ba719/server.go#L20
- GPUResourceName = "nvidia.com/gpu"
-)
-
-const (
- minResource float64 = 0.1
-)
-
-// DimensionDefaultValue is the default value used for a resource dimension that is left blank (not defined)
-type DimensionDefaultValue string
-
-const (
- // Zero means resource dimension not defined will be treated as zero
- Zero DimensionDefaultValue = "Zero"
- // Infinity means resource dimension not defined will be treated as infinity
- Infinity DimensionDefaultValue = "Infinity"
-)
-
-// Resource struct defines all the resource type
-type Resource struct {
- MilliCPU float64
- Memory float64
-
- // ScalarResources
- ScalarResources map[v1.ResourceName]float64
-
- // MaxTaskNum is only used by predicates; it should NOT
- // be accounted in other operators, e.g. Add.
- MaxTaskNum int
-}
-
-// EmptyResource creates an empty resource object and returns it
-func EmptyResource() *Resource {
- return &Resource{}
-}
-
-// NewResource creates a new resource object from resource list
-func NewResource(rl v1.ResourceList) *Resource {
- r := EmptyResource()
- for rName, rQuant := range rl {
- switch rName {
- case v1.ResourceCPU:
- r.MilliCPU += float64(rQuant.MilliValue())
- case v1.ResourceMemory:
- r.Memory += float64(rQuant.Value())
- case v1.ResourcePods:
- r.MaxTaskNum += int(rQuant.Value())
- default:
- //NOTE: When converting this back to k8s resource, we need record the format as well as / 1000
- if v1helper.IsScalarResourceName(rName) {
- r.AddScalar(rName, float64(rQuant.MilliValue()))
- }
- }
- }
- return r
-}
-
-// Clone is used to clone a resource type, which is a deep copy function.
-func (r *Resource) Clone() *Resource {
- clone := &Resource{
- MilliCPU: r.MilliCPU,
- Memory: r.Memory,
- MaxTaskNum: r.MaxTaskNum,
- }
-
- if r.ScalarResources != nil {
- clone.ScalarResources = make(map[v1.ResourceName]float64)
- for k, v := range r.ScalarResources {
- clone.ScalarResources[k] = v
- }
- }
-
- return clone
-}
-
-// String returns resource details in string format
-func (r *Resource) String() string {
- str := fmt.Sprintf("cpu %0.2f, memory %0.2f", r.MilliCPU, r.Memory)
- for rName, rQuant := range r.ScalarResources {
- str = fmt.Sprintf("%s, %s %0.2f", str, rName, rQuant)
- }
- return str
-}
-
-// ResourceNames returns all resource types
-func (r *Resource) ResourceNames() ResourceNameList {
- resNames := ResourceNameList{}
-
- if r.MilliCPU >= minResource {
- resNames = append(resNames, v1.ResourceCPU)
- }
-
- if r.Memory >= minResource {
- resNames = append(resNames, v1.ResourceMemory)
- }
-
- for rName, rMount := range r.ScalarResources {
- if rMount >= minResource {
- resNames = append(resNames, rName)
- }
- }
-
- return resNames
-}
-
-// Get returns the resource value for that particular resource type
-func (r *Resource) Get(rn v1.ResourceName) float64 {
- switch rn {
- case v1.ResourceCPU:
- return r.MilliCPU
- case v1.ResourceMemory:
- return r.Memory
- default:
- if r.ScalarResources == nil {
- return 0
- }
- return r.ScalarResources[rn]
- }
-}
-
-// IsEmpty returns true only if every resource dimension is less than the min value; otherwise it returns false
-func (r *Resource) IsEmpty() bool {
- if !(r.MilliCPU < minResource && r.Memory < minResource) {
- return false
- }
-
- for _, rQuant := range r.ScalarResources {
- if rQuant >= minResource {
- return false
- }
- }
-
- return true
-}
-
-// IsZero returns true if the given kind of resource is less than the min value, otherwise false
-func (r *Resource) IsZero(rn v1.ResourceName) bool {
- switch rn {
- case v1.ResourceCPU:
- return r.MilliCPU < minResource
- case v1.ResourceMemory:
- return r.Memory < minResource
- default:
- if r.ScalarResources == nil {
- return true
- }
-
- _, found := r.ScalarResources[rn]
- assert.Assertf(found, "unknown resource %s", rn)
-
- return r.ScalarResources[rn] < minResource
- }
-}
-
-// Add is used to add two given resources
-func (r *Resource) Add(rr *Resource) *Resource {
- r.MilliCPU += rr.MilliCPU
- r.Memory += rr.Memory
-
- for rName, rQuant := range rr.ScalarResources {
- if r.ScalarResources == nil {
- r.ScalarResources = map[v1.ResourceName]float64{}
- }
- r.ScalarResources[rName] += rQuant
- }
-
- return r
-}
-
-//Sub subtracts two Resource objects.
-func (r *Resource) Sub(rr *Resource) *Resource {
- assert.Assertf(rr.LessEqual(r, Zero), "resource is not sufficient to do operation: <%v> sub <%v>", r, rr)
-
- r.MilliCPU -= rr.MilliCPU
- r.Memory -= rr.Memory
-
- if r.ScalarResources == nil {
- return r
- }
- for rrName, rrQuant := range rr.ScalarResources {
- r.ScalarResources[rrName] -= rrQuant
- }
-
- return r
-}
-
-// Multi multiples the resource with ratio provided
-func (r *Resource) Multi(ratio float64) *Resource {
- r.MilliCPU *= ratio
- r.Memory *= ratio
- for rName, rQuant := range r.ScalarResources {
- r.ScalarResources[rName] = rQuant * ratio
- }
- return r
-}
-
-// SetMaxResource compares with another Resource and takes the max value for each dimension.
-func (r *Resource) SetMaxResource(rr *Resource) {
- if r == nil || rr == nil {
- return
- }
-
- if rr.MilliCPU > r.MilliCPU {
- r.MilliCPU = rr.MilliCPU
- }
- if rr.Memory > r.Memory {
- r.Memory = rr.Memory
- }
-
- for rrName, rrQuant := range rr.ScalarResources {
- if r.ScalarResources == nil {
- r.ScalarResources = make(map[v1.ResourceName]float64)
- for k, v := range rr.ScalarResources {
- r.ScalarResources[k] = v
- }
- return
- }
- _, ok := r.ScalarResources[rrName]
- if !ok || rrQuant > r.ScalarResources[rrName] {
- r.ScalarResources[rrName] = rrQuant
- }
- }
-}
-
-// FitDelta computes the delta between a resource object representing available
-// resources and an operand representing resources being requested. Any
-// field that is less than 0 after the operation represents an
-// insufficient resource.
-func (r *Resource) FitDelta(rr *Resource) *Resource {
- if rr.MilliCPU > 0 {
- r.MilliCPU -= rr.MilliCPU + minResource
- }
-
- if rr.Memory > 0 {
- r.Memory -= rr.Memory + minResource
- }
-
- if r.ScalarResources == nil {
- r.ScalarResources = make(map[v1.ResourceName]float64)
- }
-
- for rrName, rrQuant := range rr.ScalarResources {
- if rrQuant > 0 {
- _, ok := r.ScalarResources[rrName]
- if !ok {
- r.ScalarResources[rrName] = 0
- }
- r.ScalarResources[rrName] -= rrQuant + minResource
- }
- }
-
- return r
-}
-
-// Less returns true only on condition that all dimensions of resources in r are less than that of rr,
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) Less(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessFunc := func(l, r float64) bool {
- return l < r
- }
-
- leftResource := r.Clone()
- rightResource := rr.Clone()
-
- if !lessFunc(leftResource.MilliCPU, rightResource.MilliCPU) {
- return false
- }
- if !lessFunc(leftResource.Memory, rightResource.Memory) {
- return false
- }
-
- r.setDefaultValue(leftResource, rightResource, defaultValue)
-
- for resourceName, leftValue := range leftResource.ScalarResources {
- rightValue := rightResource.ScalarResources[resourceName]
- if rightValue == -1 {
- continue
- }
- if leftValue == -1 || !lessFunc(leftValue, rightValue) {
- return false
- }
- }
- return true
-}
-
-// LessEqual returns true only on condition that all dimensions of resources in r are less than or equal with that of rr,
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessEqualFunc := func(l, r, diff float64) bool {
- if l < r || math.Abs(l-r) < diff {
- return true
- }
- return false
- }
-
- leftResource := r.Clone()
- rightResource := rr.Clone()
-
- if !lessEqualFunc(leftResource.MilliCPU, rightResource.MilliCPU, minResource) {
- return false
- }
- if !lessEqualFunc(leftResource.Memory, rightResource.Memory, minResource) {
- return false
- }
-
- r.setDefaultValue(leftResource, rightResource, defaultValue)
-
- for resourceName, leftValue := range leftResource.ScalarResources {
- rightValue := rightResource.ScalarResources[resourceName]
- if rightValue == -1 {
- continue
- }
- if leftValue == -1 || !lessEqualFunc(leftValue, rightValue, minResource) {
- return false
- }
- }
- return true
-}
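A minimal, hypothetical sketch (not part of the deleted file; it assumes it compiles beside the `api` package shown above) of how the `defaultValue` parameter treats a scalar dimension that only one side defines; the GPU figures are invented:

```go
// Hypothetical sketch, assuming it lives beside the api package shown above.
package api

import "fmt"

func ExampleResource_LessEqual() {
	request := EmptyResource()
	request.MilliCPU = 1000
	request.Memory = 1e9
	// NewResource stores scalar resources in milli units, so 1 GPU == 1000.
	request.SetScalar(GPUResourceName, 1000)

	idle := EmptyResource()
	idle.MilliCPU = 2000
	idle.Memory = 2e9 // note: no GPU dimension on this side

	// Zero treats the missing GPU dimension on idle as 0, so the request does not fit.
	fmt.Println(request.LessEqual(idle, Zero))
	// Infinity treats the missing dimension as unbounded, so the request fits.
	fmt.Println(request.LessEqual(idle, Infinity))
	// Output:
	// false
	// true
}
```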
-
-// LessPartly returns true if there exists any dimension whose resource amount in r is less than that in rr.
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) LessPartly(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessFunc := func(l, r float64) bool {
- return l < r
- }
-
- leftResource := r.Clone()
- rightResource := rr.Clone()
-
- if lessFunc(leftResource.MilliCPU, rightResource.MilliCPU) || lessFunc(leftResource.Memory, rightResource.Memory) {
- return true
- }
-
- r.setDefaultValue(leftResource, rightResource, defaultValue)
-
- for resourceName, leftValue := range leftResource.ScalarResources {
- rightValue := rightResource.ScalarResources[resourceName]
- if leftValue == -1 {
- continue
- }
- if rightValue == -1 || lessFunc(leftValue, rightValue) {
- return true
- }
- }
- return false
-}
-
-// LessEqualPartly returns true if there exists any dimension whose resource amount in r is less than or equal with that in rr.
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) LessEqualPartly(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessEqualFunc := func(l, r, diff float64) bool {
- if l < r || math.Abs(l-r) < diff {
- return true
- }
- return false
- }
-
- leftResource := r.Clone()
- rightResource := rr.Clone()
-
- if lessEqualFunc(leftResource.MilliCPU, rightResource.MilliCPU, minResource) || lessEqualFunc(leftResource.Memory, rightResource.Memory, minResource) {
- return true
- }
-
- r.setDefaultValue(leftResource, rightResource, defaultValue)
-
- for resourceName, leftValue := range leftResource.ScalarResources {
- rightValue := rightResource.ScalarResources[resourceName]
- if leftValue == -1 {
- continue
- }
- if rightValue == -1 || lessEqualFunc(leftValue, rightValue, minResource) {
- return true
- }
- }
- return false
-}
-
-// Equal returns true only on condition that values in all dimension are equal with each other for r and rr
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) Equal(rr *Resource, defaultValue DimensionDefaultValue) bool {
- equalFunc := func(l, r, diff float64) bool {
- return l == r || math.Abs(l-r) < diff
- }
-
- leftResource := r.Clone()
- rightResource := rr.Clone()
-
- if !equalFunc(leftResource.MilliCPU, rightResource.MilliCPU, minResource) || !equalFunc(leftResource.Memory, rightResource.Memory, minResource) {
- return false
- }
-
- r.setDefaultValue(leftResource, rightResource, defaultValue)
-
- for resourceName, leftValue := range leftResource.ScalarResources {
- rightValue := rightResource.ScalarResources[resourceName]
- if !equalFunc(leftValue, rightValue, minResource) {
- return false
- }
- }
- return true
-}
-
-// Diff calculates the difference between two resource objects
-// Note: if `defaultValue` equals `Infinity`, the difference between two values will be `Infinity`, marked as -1
-func (r *Resource) Diff(rr *Resource, defaultValue DimensionDefaultValue) (*Resource, *Resource) {
- leftRes := r.Clone()
- rightRes := rr.Clone()
- increasedVal := EmptyResource()
- decreasedVal := EmptyResource()
- r.setDefaultValue(leftRes, rightRes, defaultValue)
-
- if leftRes.MilliCPU > rightRes.MilliCPU {
- increasedVal.MilliCPU = leftRes.MilliCPU - rightRes.MilliCPU
- } else {
- decreasedVal.MilliCPU = rightRes.MilliCPU - leftRes.MilliCPU
- }
-
- if leftRes.Memory > rightRes.Memory {
- increasedVal.Memory = leftRes.Memory - rightRes.Memory
- } else {
- decreasedVal.Memory = rightRes.Memory - leftRes.Memory
- }
-
- increasedVal.ScalarResources = make(map[v1.ResourceName]float64, 0)
- decreasedVal.ScalarResources = make(map[v1.ResourceName]float64, 0)
- for lName, lQuant := range leftRes.ScalarResources {
- rQuant, _ := rightRes.ScalarResources[lName]
- if lQuant == -1 {
- increasedVal.ScalarResources[lName] = -1
- continue
- }
- if rQuant == -1 {
- decreasedVal.ScalarResources[lName] = -1
- continue
- }
- if lQuant > rQuant {
- increasedVal.ScalarResources[lName] = lQuant - rQuant
- } else {
- decreasedVal.ScalarResources[lName] = rQuant - lQuant
- }
- }
-
- return increasedVal, decreasedVal
-}
-
-// AddScalar adds a resource by a scalar value of this resource.
-func (r *Resource) AddScalar(name v1.ResourceName, quantity float64) {
- r.SetScalar(name, r.ScalarResources[name]+quantity)
-}
-
-// SetScalar sets a resource by a scalar value of this resource.
-func (r *Resource) SetScalar(name v1.ResourceName, quantity float64) {
- // Lazily allocate scalar resource map.
- if r.ScalarResources == nil {
- r.ScalarResources = map[v1.ResourceName]float64{}
- }
- r.ScalarResources[name] = quantity
-}
-
-// MinDimensionResource is used to reset the r resource dimension which is less than rr
-// e.g r resource is <cpu 2000.00, memory 4047845376.00, hugepages-2Mi 0.00, hugepages-1Gi 0.00>
-// rr resource is <cpu 3000.00, memory 1000.00>
-// return r resource is <cpu 2000.00, memory 1000.00, hugepages-2Mi 0.00, hugepages-1Gi 0.00>
-func (r *Resource) MinDimensionResource(rr *Resource) *Resource {
- if rr.MilliCPU < r.MilliCPU {
- r.MilliCPU = rr.MilliCPU
- }
- if rr.Memory < r.Memory {
- r.Memory = rr.Memory
- }
-
- if rr.ScalarResources == nil {
- if r.ScalarResources != nil {
- for name := range r.ScalarResources {
- r.ScalarResources[name] = 0
- }
- }
- } else {
- if r.ScalarResources != nil {
- for name, quant := range rr.ScalarResources {
- if quant < r.ScalarResources[name] {
- r.ScalarResources[name] = quant
- }
- }
- }
- }
- return r
-}
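A minimal, hypothetical sketch (not part of the deleted file; it assumes it compiles beside the `api` package shown above) mirroring the worked example in the comment above:

```go
// Hypothetical sketch, assuming it lives beside the api package shown above.
package api

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func ExampleResource_MinDimensionResource() {
	r := &Resource{
		MilliCPU: 2000,
		Memory:   4047845376,
		ScalarResources: map[v1.ResourceName]float64{
			"hugepages-2Mi": 0,
			"hugepages-1Gi": 0,
		},
	}
	rr := &Resource{MilliCPU: 3000, Memory: 1000}

	// Each dimension of r is capped by the corresponding dimension of rr.
	res := r.MinDimensionResource(rr)
	fmt.Println(res.MilliCPU, res.Memory, res.ScalarResources["hugepages-2Mi"])
	// Output: 2000 1000 0
}
```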
-
-// setDefaultValue sets default value for resource dimension not defined of ScalarResource in leftResource and rightResource
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. It can only be one of 'Zero' or 'Infinity'"
-func (r *Resource) setDefaultValue(leftResource, rightResource *Resource, defaultValue DimensionDefaultValue) {
- if leftResource.ScalarResources == nil {
- leftResource.ScalarResources = map[v1.ResourceName]float64{}
- }
- if rightResource.ScalarResources == nil {
- rightResource.ScalarResources = map[v1.ResourceName]float64{}
- }
- for resourceName := range leftResource.ScalarResources {
- _, ok := rightResource.ScalarResources[resourceName]
- if !ok {
- if defaultValue == Zero {
- rightResource.ScalarResources[resourceName] = 0
- } else if defaultValue == Infinity {
- rightResource.ScalarResources[resourceName] = -1
- }
- }
- }
-
- for resourceName := range rightResource.ScalarResources {
- _, ok := leftResource.ScalarResources[resourceName]
- if !ok {
- if defaultValue == Zero {
- leftResource.ScalarResources[resourceName] = 0
- } else if defaultValue == Infinity {
- leftResource.ScalarResources[resourceName] = -1
- }
- }
- }
-}
-
-// ParseResourceList parses the given configuration map into an API
-// ResourceList or returns an error.
-func ParseResourceList(m map[string]string) (v1.ResourceList, error) {
- if len(m) == 0 {
- return nil, nil
- }
- rl := make(v1.ResourceList)
- for k, v := range m {
- switch v1.ResourceName(k) {
-		// Only CPU, memory, and ephemeral (local) storage are supported here.
- case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage:
- q, err := resource.ParseQuantity(v)
- if err != nil {
- return nil, err
- }
- if q.Sign() == -1 {
- return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
- }
- rl[v1.ResourceName(k)] = q
- default:
- return nil, fmt.Errorf("cannot reserve %q resource", k)
- }
- }
- return rl, nil
-}
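A minimal, hypothetical usage sketch for `ParseResourceList` (not part of the deleted file; it assumes it compiles beside the `api` package shown above; the map values are invented):

```go
// Hypothetical sketch, assuming it lives beside the api package shown above.
package api

import "fmt"

func ExampleParseResourceList() {
	rl, err := ParseResourceList(map[string]string{
		"cpu":               "500m",
		"memory":            "1Gi",
		"ephemeral-storage": "2Gi",
	})
	cpu, mem := rl["cpu"], rl["memory"]
	fmt.Println(err, cpu.MilliValue(), mem.Value())

	// Any other resource name is rejected.
	_, err = ParseResourceList(map[string]string{"nvidia.com/gpu": "1"})
	fmt.Println(err)
	// Output:
	// <nil> 500 1073741824
	// cannot reserve "nvidia.com/gpu" resource
}
```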
-
-// GetMinResource returns the minResource threshold used in resource comparisons.
-func GetMinResource() float64 {
- return minResource
-}
-
-// ResourceNameList struct defines resource name collection
-type ResourceNameList []v1.ResourceName
-
-// Contains judges whether rr is subset of r
-func (r ResourceNameList) Contains(rr ResourceNameList) bool {
- for _, rrName := range ([]v1.ResourceName)(rr) {
- isResourceExist := false
- for _, rName := range ([]v1.ResourceName)(r) {
- if rName == rrName {
- isResourceExist = true
- break
- }
- }
- if !isResourceExist {
- return false
- }
- }
- return true
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "k8s.io/apimachinery/pkg/types"
-
- "volcano.sh/apis/pkg/apis/scheduling"
-)
-
-// ClusterID is UID type, serves as unique ID for each cluster
-type ClusterID types.UID
-
-// SiloClusterInfo will have all details about the cluster
-type SiloClusterInfo struct {
- UID ClusterID
- Cluster *scheduling.Cluster
-}
-
-// NewSiloClusterInfo creates a new SiloClusterInfo object
-func NewSiloClusterInfo(cluster *scheduling.Cluster) *SiloClusterInfo {
- return &SiloClusterInfo{
- UID: ClusterID(cluster.Name),
- Cluster: cluster,
- }
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
-)
-
-func buildNode(name string, alloc v1.ResourceList) *v1.Node {
- return &v1.Node{
- ObjectMeta: metav1.ObjectMeta{
- Name: name,
- },
- Status: v1.NodeStatus{
- Capacity: alloc,
- Allocatable: alloc,
- },
- }
-}
-
-func buildPod(ns, n, nn string, p v1.PodPhase, req v1.ResourceList, owner []metav1.OwnerReference, labels map[string]string) *v1.Pod {
- return &v1.Pod{
- ObjectMeta: metav1.ObjectMeta{
- UID: types.UID(fmt.Sprintf("%v-%v", ns, n)),
- Name: n,
- Namespace: ns,
- OwnerReferences: owner,
- Labels: labels,
- },
- Status: v1.PodStatus{
- Phase: p,
- },
- Spec: v1.PodSpec{
- NodeName: nn,
- Containers: []v1.Container{
- {
- Resources: v1.ResourceRequirements{
- Requests: req,
- },
- },
- },
- },
- }
-}
-
-func buildResourceList(cpu string, memory string) v1.ResourceList {
- return v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- }
-}
-
-func buildResource(cpu string, memory string) *Resource {
- return NewResource(v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- })
-}
-
-func buildOwnerReference(owner string) metav1.OwnerReference {
- controller := true
- return metav1.OwnerReference{
- Controller: &controller,
- UID: types.UID(owner),
- }
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-)
-
-// TaskStatus defines the status of a task/pod.
-type TaskStatus int
-
-const (
- // Pending means the task is pending in the apiserver.
- Pending TaskStatus = 1 << iota
-
- // Allocated means the scheduler assigns a host to it.
- Allocated
-
- // Pipelined means the scheduler assigns a host to wait for releasing resource.
- Pipelined
-
- // Binding means the scheduler send Bind request to apiserver.
- Binding
-
- // Bound means the task/Pod bounds to a host.
- Bound
-
- // Running means a task is running on the host.
- Running
-
- // Releasing means a task/pod is deleted.
- Releasing
-
- // Succeeded means that all containers in the pod have voluntarily terminated
- // with a container exit code of 0, and the system is not going to restart any of these containers.
- Succeeded
-
- // Failed means that all containers in the pod have terminated, and at least one container has
- // terminated in a failure (exited with a non-zero exit code or was stopped by the system).
- Failed
-
- // Unknown means the status of task/pod is unknown to the scheduler.
- Unknown
-)
-
-func (ts TaskStatus) String() string {
- switch ts {
- case Pending:
- return "Pending"
- case Allocated:
- return "Allocated"
- case Pipelined:
- return "Pipelined"
- case Binding:
- return "Binding"
- case Bound:
- return "Bound"
- case Running:
- return "Running"
- case Releasing:
- return "Releasing"
- case Succeeded:
- return "Succeeded"
- case Failed:
- return "Failed"
- default:
- return "Unknown"
- }
-}
-
-// NodePhase defines the phase of node
-type NodePhase int
-
-const (
- // Ready means the node is ready for scheduling
- Ready NodePhase = 1 << iota
- // NotReady means the node is not ready for scheduling
- NotReady
-)
-
-func (np NodePhase) String() string {
- switch np {
- case Ready:
- return "Ready"
- case NotReady:
- return "NotReady"
- }
-
- return "Unknown"
-}
-
-// validateStatusUpdate validates whether the status transfer is valid.
-func validateStatusUpdate(oldStatus, newStatus TaskStatus) error {
- return nil
-}
-
-// LessFn is the func declaration used by sort or priority queue.
-type LessFn func(interface{}, interface{}) bool
-
-// CompareFn is the func declaration used by sort or priority queue.
-type CompareFn func(interface{}, interface{}) int
-
-// ValidateFn is the func declaration used to check object's status.
-type ValidateFn func(interface{}) bool
-
-// ValidateResult is a struct which can be used to determine the result
-type ValidateResult struct {
- Pass bool
- Reason string
- Message string
-}
-
-// ValidateExFn is the func declaration used to validate the result.
-type ValidateExFn func(interface{}) *ValidateResult
-
-// VoteFn is the func declaration used to check object's complicated status.
-type VoteFn func(interface{}) int
-
-// JobEnqueuedFn is the func declaration used to call after job enqueued.
-type JobEnqueuedFn func(interface{})
-
-// PredicateFn is the func declaration used to predicate node for task.
-type PredicateFn func(*TaskInfo, *NodeInfo) error
-
-// BestNodeFn is the func declaration used to return the nodeScores to plugins.
-type BestNodeFn func(*TaskInfo, map[float64][]*NodeInfo) *NodeInfo
-
-// EvictableFn is the func declaration used to evict tasks.
-type EvictableFn func(*TaskInfo, []*TaskInfo) ([]*TaskInfo, int)
-
-// NodeOrderFn is the func declaration used to get priority score for a node for a particular task.
-type NodeOrderFn func(*TaskInfo, *NodeInfo) (float64, error)
-
-// BatchNodeOrderFn is the func declaration used to get priority score for ALL nodes for a particular task.
-type BatchNodeOrderFn func(*TaskInfo, []*NodeInfo) (map[string]float64, error)
-
-// NodeMapFn is the func declaration used to get priority score for a node for a particular task.
-type NodeMapFn func(*TaskInfo, *NodeInfo) (float64, error)
-
-// NodeReduceFn is the func declaration used to reduce priority score for a node for a particular task.
-type NodeReduceFn func(*TaskInfo, k8sframework.NodeScoreList) error
-
-// NodeOrderMapFn is the func declaration used to get priority score of all plugins for a node for a particular task.
-type NodeOrderMapFn func(*TaskInfo, *NodeInfo) (map[string]float64, float64, error)
-
-// NodeOrderReduceFn is the func declaration used to reduce priority score of all nodes for a plugin for a particular task.
-type NodeOrderReduceFn func(*TaskInfo, map[string]k8sframework.NodeScoreList) (map[string]float64, error)
-
-// TargetJobFn is the func declaration used to select the target job that satisfies some conditions
-type TargetJobFn func([]*JobInfo) *JobInfo
-
-// ReservedNodesFn is the func declaration used to select the reserved nodes
-type ReservedNodesFn func()
-
-// VictimTasksFn is the func declaration used to select victim tasks
-type VictimTasksFn func() []*TaskInfo
-
-// UnderUsedResourceFn is the func declaration used to get under used resource list for queue
-type UnderUsedResourceFn func(*QueueInfo) ResourceNameList
-
-
-
-package api
-
-import (
- "fmt"
- "sort"
- "strings"
-)
-
-const (
- // NodePodNumberExceeded means pods in node exceed the allocatable pod number
- NodePodNumberExceeded = "node(s) pod number exceeded"
- // NodeResourceFitFailed means node could not fit the request of pod
- NodeResourceFitFailed = "node(s) resource fit failed"
-
- // AllNodeUnavailableMsg is the default error message
- AllNodeUnavailableMsg = "all nodes are unavailable"
-)
-
-// These are reasons for a pod's transition to a condition.
-const (
- // PodReasonUnschedulable reason in PodScheduled PodCondition means that the scheduler
- // can't schedule the pod right now, for example due to insufficient resources in the cluster.
- PodReasonUnschedulable = "Unschedulable"
- // PodReasonSchedulable reason in PodScheduled PodCondition means that the scheduler
- // can schedule the pod right now, but not bind yet
- PodReasonSchedulable = "Schedulable"
- // PodReasonUndetermined reason in PodScheduled PodCondition means that the scheduler
-	// skips scheduling the pod, leaving it `Undetermined`, for example because an unschedulable pod has already occurred.
- PodReasonUndetermined = "Undetermined"
-)
-
-// FitErrors is set of FitError on many nodes
-type FitErrors struct {
- nodes map[string]*FitError
- err string
-}
-
-// NewFitErrors returns a FitErrors
-func NewFitErrors() *FitErrors {
- f := new(FitErrors)
- f.nodes = make(map[string]*FitError)
- return f
-}
-
-// SetError sets the common error message in FitErrors
-func (f *FitErrors) SetError(err string) {
- f.err = err
-}
-
-// SetNodeError sets the node error in FitErrors
-func (f *FitErrors) SetNodeError(nodeName string, err error) {
- var fe *FitError
- switch obj := err.(type) {
- case *FitError:
- obj.NodeName = nodeName
- fe = obj
- default:
- fe = &FitError{
- NodeName: nodeName,
- Reasons: []string{obj.Error()},
- }
- }
-
- f.nodes[nodeName] = fe
-}
-
-// Error returns the final error message
-func (f *FitErrors) Error() string {
- reasons := make(map[string]int)
-
- for _, node := range f.nodes {
- for _, reason := range node.Reasons {
- reasons[reason]++
- }
- }
-
- sortReasonsHistogram := func() []string {
- reasonStrings := []string{}
- for k, v := range reasons {
- reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
- }
- sort.Strings(reasonStrings)
- return reasonStrings
- }
- if f.err == "" {
- f.err = AllNodeUnavailableMsg
- }
- reasonMsg := fmt.Sprintf(f.err+": %v.", strings.Join(sortReasonsHistogram(), ", "))
- return reasonMsg
-}
-
-// FitError describes the reason why a task could not fit on a node
-type FitError struct {
- taskNamespace string
- taskName string
- NodeName string
- Reasons []string
-}
-
-// NewFitError returns a FitError built from the given messages
-func NewFitError(task *TaskInfo, node *NodeInfo, message ...string) *FitError {
- fe := &FitError{
- taskName: task.Name,
- taskNamespace: task.Namespace,
- NodeName: node.Name,
- Reasons: message,
- }
- return fe
-}
-
-// Error returns the final error message
-func (f *FitError) Error() string {
- return fmt.Sprintf("task %s/%s on node %s fit failed: %s", f.taskNamespace, f.taskName, f.NodeName, strings.Join(f.Reasons, ", "))
-}
-
-
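As a quick illustration of how the error types above compose, here is a minimal, hypothetical sketch (the import path and node names are assumptions); `Error()` folds the per-node reasons into a single histogram-style message:

```go
package main

import (
	"errors"
	"fmt"

	"volcano.sh/volcano/pkg/scheduler/api"
)

func main() {
	// Record one failure reason per node.
	fitErrs := api.NewFitErrors()
	fitErrs.SetNodeError("node-1", errors.New(api.NodeResourceFitFailed))
	fitErrs.SetNodeError("node-2", errors.New(api.NodeResourceFitFailed))
	fitErrs.SetNodeError("node-3", errors.New(api.NodePodNumberExceeded))

	// Prints roughly:
	//   all nodes are unavailable: 1 node(s) pod number exceeded, 2 node(s) resource fit failed.
	fmt.Println(fitErrs.Error())
}
```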
-
-/*
- Copyright 2021 The Volcano Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package cache
-
-import (
- "context"
- "fmt"
- "sync"
- "time"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/api/scheduling/v1beta1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime"
- utilruntime "k8s.io/apimachinery/pkg/util/runtime"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/informers"
- infov1 "k8s.io/client-go/informers/core/v1"
- schedv1 "k8s.io/client-go/informers/scheduling/v1beta1"
- storagev1 "k8s.io/client-go/informers/storage/v1"
- storagev1alpha1 "k8s.io/client-go/informers/storage/v1alpha1"
- "k8s.io/client-go/kubernetes"
- corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
- "k8s.io/client-go/rest"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/tools/record"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
- podutil "k8s.io/kubernetes/pkg/api/v1/pod"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/scheduling"
- schedulingscheme "volcano.sh/apis/pkg/apis/scheduling/scheme"
- vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- vcclient "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
- vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
- cpuinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/nodeinfo/v1alpha1"
- vcinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
-
- "volcano.sh/volcano/cmd/scheduler/app/options"
- schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-func init() {
- schemeBuilder := runtime.SchemeBuilder{
- v1.AddToScheme,
- }
-
- utilruntime.Must(schemeBuilder.AddToScheme(scheme.Scheme))
-}
-
-// New returns a Cache implementation.
-func New(config *rest.Config, schedulerName string, defaultQueue string) Cache {
- return newSchedulerCache(config, schedulerName, defaultQueue)
-}
-
-// SchedulerCache is the cache for the kube batch scheduler
-type SchedulerCache struct {
- sync.Mutex
-
- kubeClient *kubernetes.Clientset
- vcClient *vcclient.Clientset
- defaultQueue string
- // schedulerName is the name for volcano scheduler
- schedulerName string
-
- podInformer infov1.PodInformer
- nodeInformer infov1.NodeInformer
- podGroupInformerV1beta1 vcinformerv1.PodGroupInformer
- queueInformerV1beta1 vcinformerv1.QueueInformer
- pvInformer infov1.PersistentVolumeInformer
- pvcInformer infov1.PersistentVolumeClaimInformer
- scInformer storagev1.StorageClassInformer
- pcInformer schedv1.PriorityClassInformer
- quotaInformer infov1.ResourceQuotaInformer
- csiNodeInformer storagev1.CSINodeInformer
- csiDriverInformer storagev1.CSIDriverInformer
- csiStorageCapacityInformer storagev1alpha1.CSIStorageCapacityInformer
- cpuInformer cpuinformerv1.NumatopologyInformer
-
- Binder Binder
- Evictor Evictor
- StatusUpdater StatusUpdater
- PodGroupBinder BatchBinder
- VolumeBinder VolumeBinder
-
- Recorder record.EventRecorder
-
- Jobs map[schedulingapi.JobID]*schedulingapi.JobInfo
- Nodes map[string]*schedulingapi.NodeInfo
- Queues map[schedulingapi.QueueID]*schedulingapi.QueueInfo
- PriorityClasses map[string]*v1beta1.PriorityClass
- NodeList []string
- defaultPriorityClass *v1beta1.PriorityClass
- defaultPriority int32
-
- NamespaceCollection map[string]*schedulingapi.NamespaceCollection
-
- errTasks workqueue.RateLimitingInterface
- deletedJobs workqueue.RateLimitingInterface
-
- informerFactory informers.SharedInformerFactory
-}
-
-type defaultBinder struct {
- kubeclient *kubernetes.Clientset
-}
-
-// Bind will send a bind request to the API server
-func (db *defaultBinder) Bind(p *v1.Pod, hostname string) error {
- if err := db.kubeclient.CoreV1().Pods(p.Namespace).Bind(context.TODO(),
- &v1.Binding{
- ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID, Annotations: p.Annotations},
- Target: v1.ObjectReference{
- Kind: "Node",
- Name: hostname,
- },
- },
- metav1.CreateOptions{}); err != nil {
- klog.Errorf("Failed to bind pod <%v/%v>: %#v", p.Namespace, p.Name, err)
- return err
- }
- return nil
-}
-
-type defaultEvictor struct {
- kubeclient *kubernetes.Clientset
- recorder record.EventRecorder
-}
-
-// Evict will send a delete pod request to the API server
-func (de *defaultEvictor) Evict(p *v1.Pod, reason string) error {
- klog.V(3).Infof("Evicting pod %v/%v, because of %v", p.Namespace, p.Name, reason)
-
- evictMsg := fmt.Sprintf("Pod is evicted, because of %v", reason)
- annotations := map[string]string{}
- // record that we are evicting the pod
- de.recorder.AnnotatedEventf(p, annotations, v1.EventTypeWarning, "Evict", evictMsg)
-
- pod := p.DeepCopy()
- condition := &v1.PodCondition{
- Type: v1.PodReady,
- Status: v1.ConditionFalse,
- Reason: "Evict",
- Message: evictMsg,
- }
- if !podutil.UpdatePodCondition(&pod.Status, condition) {
-		klog.V(1).Infof("UpdatePodCondition: condition already exists, skip update")
- klog.V(1).Infof("%+v", pod.Status.Conditions)
- return nil
- }
- if _, err := de.kubeclient.CoreV1().Pods(p.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
- klog.Errorf("Failed to update pod <%v/%v> status: %v", pod.Namespace, pod.Name, err)
- return err
- }
- if err := de.kubeclient.CoreV1().Pods(p.Namespace).Delete(context.TODO(), p.Name, metav1.DeleteOptions{}); err != nil {
- klog.Errorf("Failed to evict pod <%v/%v>: %#v", p.Namespace, p.Name, err)
- return err
- }
-
- return nil
-}
-
-// defaultStatusUpdater is the default implementation of the StatusUpdater interface
-type defaultStatusUpdater struct {
- kubeclient *kubernetes.Clientset
- vcclient *vcclient.Clientset
-}
-
-// following the same logic as podutil.UpdatePodCondition
-func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool {
- lastTransitionTime := metav1.Now()
- // Try to find this pod condition.
- _, oldCondition := podutil.GetPodCondition(status, condition.Type)
-
- if oldCondition == nil {
-		// We are adding a new pod condition.
- return true
- }
- // We are updating an existing condition, so we need to check if it has changed.
- if condition.Status == oldCondition.Status {
- lastTransitionTime = oldCondition.LastTransitionTime
- }
-
- isEqual := condition.Status == oldCondition.Status &&
- condition.Reason == oldCondition.Reason &&
- condition.Message == oldCondition.Message &&
- condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) &&
- lastTransitionTime.Equal(&oldCondition.LastTransitionTime)
-
-	// Return true if one of the fields has changed.
- return !isEqual
-}
-
-// UpdatePodCondition updates the pod with the given podCondition
-func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) {
- klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status)
- if podutil.UpdatePodCondition(&pod.Status, condition) {
- return su.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
- }
- return pod, nil
-}
-
-// UpdatePodGroup updates the given PodGroup through the Volcano API server
-func (su *defaultStatusUpdater) UpdatePodGroup(pg *schedulingapi.PodGroup) (*schedulingapi.PodGroup, error) {
- podgroup := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
-		klog.Errorf("Error while converting PodGroup to v1beta1.PodGroup with error: %v", err)
- return nil, err
- }
-
- updated, err := su.vcclient.SchedulingV1beta1().PodGroups(podgroup.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Error while updating PodGroup with error: %v", err)
- return nil, err
- }
-
- podGroupInfo := &schedulingapi.PodGroup{Version: schedulingapi.PodGroupVersionV1Beta1}
- if err := schedulingscheme.Scheme.Convert(updated, &podGroupInfo.PodGroup, nil); err != nil {
-		klog.Errorf("Error while converting v1beta1.PodGroup to api.PodGroup with error: %v", err)
- return nil, err
- }
-
- return podGroupInfo, nil
-}
-
-type defaultVolumeBinder struct {
- volumeBinder volumescheduling.SchedulerVolumeBinder
-}
-
-// AllocateVolumes allocates volume on the host to the task
-func (dvb *defaultVolumeBinder) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
- allBound, err := dvb.volumeBinder.AssumePodVolumes(task.Pod, hostname, podVolumes)
- task.VolumeReady = allBound
-
- return err
-}
-
-// GetPodVolumes gets the pod volumes on the host
-func (dvb *defaultVolumeBinder) GetPodVolumes(task *schedulingapi.TaskInfo,
- node *v1.Node) (podVolumes *volumescheduling.PodVolumes, err error) {
- boundClaims, claimsToBind, _, err := dvb.volumeBinder.GetPodVolumes(task.Pod)
- if err != nil {
- return nil, err
- }
-
- podVolumes, _, err = dvb.volumeBinder.FindPodVolumes(task.Pod, boundClaims, claimsToBind, node)
- return podVolumes, err
-}
-
-// BindVolumes binds volumes to the task
-func (dvb *defaultVolumeBinder) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
-	// If the task's volumes are already bound, do not bind them again.
- if task.VolumeReady {
- return nil
- }
-
- return dvb.volumeBinder.BindPodVolumes(task.Pod, podVolumes)
-}
-
-type podgroupBinder struct {
- kubeclient *kubernetes.Clientset
- vcclient *vcclient.Clientset
-}
-
-// Bind will add the silo cluster annotation to the pod and podgroup
-func (pgb *podgroupBinder) Bind(job *schedulingapi.JobInfo, cluster string) (*schedulingapi.JobInfo, error) {
- if len(job.Tasks) == 0 {
- klog.V(4).Infof("Job pods have not been created yet")
- return job, nil
- }
- for _, task := range job.Tasks {
- pod := task.Pod
- pod.Annotations[batch.ForwardClusterKey] = cluster
- pod.ResourceVersion = ""
- _, err := pgb.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
- if err != nil {
-			klog.Errorf("Error while updating pod annotation with error: %v", err)
- return nil, err
- }
- }
-
- pg := job.PodGroup
- pg.Annotations[batch.ForwardClusterKey] = cluster
- podgroup := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
-		klog.Errorf("Error while converting PodGroup to v1beta1.PodGroup with error: %v", err)
- return nil, err
- }
- newPg, err := pgb.vcclient.SchedulingV1beta1().PodGroups(pg.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
- if err != nil {
-		klog.Errorf("Error while updating PodGroup annotation with error: %v", err)
- return nil, err
- }
- job.PodGroup.ResourceVersion = newPg.ResourceVersion
- klog.V(4).Infof("Bind PodGroup <%s> successfully", job.PodGroup.Name)
- return job, nil
-}
-
-func newSchedulerCache(config *rest.Config, schedulerName string, defaultQueue string) *SchedulerCache {
- kubeClient, err := kubernetes.NewForConfig(config)
- if err != nil {
- panic(fmt.Sprintf("failed init kubeClient, with err: %v", err))
- }
- vcClient, err := vcclient.NewForConfig(config)
- if err != nil {
- panic(fmt.Sprintf("failed init vcClient, with err: %v", err))
- }
- eventClient, err := kubernetes.NewForConfig(config)
- if err != nil {
- panic(fmt.Sprintf("failed init eventClient, with err: %v", err))
- }
-
- // create default queue
- reclaimable := true
- defaultQue := vcv1beta1.Queue{
- ObjectMeta: metav1.ObjectMeta{
- Name: defaultQueue,
- },
- Spec: vcv1beta1.QueueSpec{
- Reclaimable: &reclaimable,
- Weight: 1,
- },
- }
- if _, err := vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), &defaultQue, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
- panic(fmt.Sprintf("failed init default queue, with err: %v", err))
- }
-
- sc := &SchedulerCache{
- Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
- Nodes: make(map[string]*schedulingapi.NodeInfo),
- Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
- PriorityClasses: make(map[string]*v1beta1.PriorityClass),
- errTasks: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
- deletedJobs: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
- kubeClient: kubeClient,
- vcClient: vcClient,
- defaultQueue: defaultQueue,
- schedulerName: schedulerName,
-
- NamespaceCollection: make(map[string]*schedulingapi.NamespaceCollection),
-
- NodeList: []string{},
- }
-
- // Prepare event clients.
- broadcaster := record.NewBroadcaster()
- broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: eventClient.CoreV1().Events("")})
- sc.Recorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: schedulerName})
-
- sc.Binder = &defaultBinder{
- kubeclient: sc.kubeClient,
- }
-
- sc.Evictor = &defaultEvictor{
- kubeclient: sc.kubeClient,
- recorder: sc.Recorder,
- }
-
- sc.StatusUpdater = &defaultStatusUpdater{
- kubeclient: sc.kubeClient,
- vcclient: sc.vcClient,
- }
-
- sc.PodGroupBinder = &podgroupBinder{
- kubeclient: sc.kubeClient,
- vcclient: sc.vcClient,
- }
-
- informerFactory := informers.NewSharedInformerFactory(sc.kubeClient, 0)
- sc.informerFactory = informerFactory
-
- // create informer for node information
- sc.nodeInformer = informerFactory.Core().V1().Nodes()
- sc.nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
- cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddNode,
- UpdateFunc: sc.UpdateNode,
- DeleteFunc: sc.DeleteNode,
- },
- 0,
- )
-
- sc.podInformer = informerFactory.Core().V1().Pods()
- sc.pvcInformer = informerFactory.Core().V1().PersistentVolumeClaims()
- sc.pvInformer = informerFactory.Core().V1().PersistentVolumes()
- sc.scInformer = informerFactory.Storage().V1().StorageClasses()
- sc.csiNodeInformer = informerFactory.Storage().V1().CSINodes()
- sc.csiDriverInformer = informerFactory.Storage().V1().CSIDrivers()
- sc.csiStorageCapacityInformer = informerFactory.Storage().V1alpha1().CSIStorageCapacities()
- sc.VolumeBinder = &defaultVolumeBinder{
- volumeBinder: volumescheduling.NewVolumeBinder(
- sc.kubeClient,
- sc.podInformer,
- sc.nodeInformer,
- sc.csiNodeInformer,
- sc.pvcInformer,
- sc.pvInformer,
- sc.scInformer,
- &volumescheduling.CapacityCheck{
- CSIDriverInformer: sc.csiDriverInformer,
- CSIStorageCapacityInformer: sc.csiStorageCapacityInformer,
- },
- 30*time.Second,
- ),
- }
-
- // create informer for pod information
- sc.podInformer.Informer().AddEventHandler(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *v1.Pod:
- if !responsibleForPod(v, schedulerName) {
- if len(v.Spec.NodeName) == 0 {
- return false
- }
- }
- return true
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddPod,
- UpdateFunc: sc.UpdatePod,
- DeleteFunc: sc.DeletePod,
- },
- })
-
- sc.pcInformer = informerFactory.Scheduling().V1beta1().PriorityClasses()
- sc.pcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddPriorityClass,
- UpdateFunc: sc.UpdatePriorityClass,
- DeleteFunc: sc.DeletePriorityClass,
- })
-
- sc.quotaInformer = informerFactory.Core().V1().ResourceQuotas()
- sc.quotaInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddResourceQuota,
- UpdateFunc: sc.UpdateResourceQuota,
- DeleteFunc: sc.DeleteResourceQuota,
- })
-
- vcinformers := vcinformer.NewSharedInformerFactory(sc.vcClient, 0)
-
- // create informer for PodGroup(v1beta1) information
- sc.podGroupInformerV1beta1 = vcinformers.Scheduling().V1beta1().PodGroups()
- sc.podGroupInformerV1beta1.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddPodGroupV1beta1,
- UpdateFunc: sc.UpdatePodGroupV1beta1,
- DeleteFunc: sc.DeletePodGroupV1beta1,
- })
-
- // create informer(v1beta1) for Queue information
- sc.queueInformerV1beta1 = vcinformers.Scheduling().V1beta1().Queues()
- sc.queueInformerV1beta1.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddQueueV1beta1,
- UpdateFunc: sc.UpdateQueueV1beta1,
- DeleteFunc: sc.DeleteQueueV1beta1,
- })
-
- sc.cpuInformer = vcinformers.Nodeinfo().V1alpha1().Numatopologies()
- sc.cpuInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddNumaInfoV1alpha1,
- UpdateFunc: sc.UpdateNumaInfoV1alpha1,
- DeleteFunc: sc.DeleteNumaInfoV1alpha1,
- })
- return sc
-}
-
-// Run starts the schedulerCache
-func (sc *SchedulerCache) Run(stopCh <-chan struct{}) {
- go sc.podInformer.Informer().Run(stopCh)
- go sc.nodeInformer.Informer().Run(stopCh)
- go sc.podGroupInformerV1beta1.Informer().Run(stopCh)
- go sc.pvInformer.Informer().Run(stopCh)
- go sc.pvcInformer.Informer().Run(stopCh)
- go sc.scInformer.Informer().Run(stopCh)
- go sc.queueInformerV1beta1.Informer().Run(stopCh)
- go sc.quotaInformer.Informer().Run(stopCh)
- go sc.cpuInformer.Informer().Run(stopCh)
-
- if options.ServerOpts.EnablePriorityClass {
- go sc.pcInformer.Informer().Run(stopCh)
- }
-
- // Re-sync error tasks.
- go wait.Until(sc.processResyncTask, 0, stopCh)
-
- // Cleanup jobs.
- go wait.Until(sc.processCleanupJob, 0, stopCh)
-}
-
-// WaitForCacheSync syncs the cache with the API server
-func (sc *SchedulerCache) WaitForCacheSync(stopCh <-chan struct{}) bool {
- return cache.WaitForCacheSync(stopCh,
- func() []cache.InformerSynced {
- informerSynced := []cache.InformerSynced{
- sc.podInformer.Informer().HasSynced,
- sc.podGroupInformerV1beta1.Informer().HasSynced,
- sc.nodeInformer.Informer().HasSynced,
- sc.pvInformer.Informer().HasSynced,
- sc.pvcInformer.Informer().HasSynced,
- sc.scInformer.Informer().HasSynced,
- sc.queueInformerV1beta1.Informer().HasSynced,
- sc.quotaInformer.Informer().HasSynced,
- sc.cpuInformer.Informer().HasSynced,
- }
- if options.ServerOpts.EnablePriorityClass {
- informerSynced = append(informerSynced, sc.pcInformer.Informer().HasSynced)
- }
- return informerSynced
- }()...,
- )
-}
-
-// findJobAndTask returns job and the task info
-func (sc *SchedulerCache) findJobAndTask(taskInfo *schedulingapi.TaskInfo) (*schedulingapi.JobInfo, *schedulingapi.TaskInfo, error) {
- job, found := sc.Jobs[taskInfo.Job]
- if !found {
- return nil, nil, fmt.Errorf("failed to find Job %v for Task %v",
- taskInfo.Job, taskInfo.UID)
- }
-
- task, found := job.Tasks[taskInfo.UID]
- if !found {
- return nil, nil, fmt.Errorf("failed to find task in status %v by id %v",
- taskInfo.Status, taskInfo.UID)
- }
-
- return job, task, nil
-}
-
-// Evict will evict the pod.
-//
-// If an error occurs, both the task and the job are guaranteed to remain in their original state.
-func (sc *SchedulerCache) Evict(taskInfo *schedulingapi.TaskInfo, reason string) error {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- job, task, err := sc.findJobAndTask(taskInfo)
-
- if err != nil {
- return err
- }
-
- node, found := sc.Nodes[task.NodeName]
- if !found {
- return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
- task.UID, task.NodeName)
- }
-
- originalStatus := task.Status
- if err := job.UpdateTaskStatus(task, schedulingapi.Releasing); err != nil {
- return err
- }
-
-	// Update the task on the node.
- if err := node.UpdateTask(task); err != nil {
-		// After failing to update the task on the node we need to revert the task status from Releasing,
-		// otherwise the task might be stuck in the Releasing state indefinitely.
- if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
- klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
- "from %s to %s after failing to update Task on Node <%s>: %v",
- task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
- sc.resyncTask(task)
- }
- return err
- }
-
- p := task.Pod
-
- go func() {
- err := sc.Evictor.Evict(p, reason)
- if err != nil {
- sc.resyncTask(task)
- }
- }()
-
- podgroup := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&job.PodGroup.PodGroup, podgroup, nil); err != nil {
-		klog.Errorf("Error while converting PodGroup to v1beta1.PodGroup with error: %v", err)
- return err
- }
- sc.Recorder.Eventf(podgroup, v1.EventTypeNormal, "Evict", reason)
- return nil
-}
-
-// Bind binds task to the target host.
-func (sc *SchedulerCache) Bind(taskInfo *schedulingapi.TaskInfo, hostname string) error {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- job, task, err := sc.findJobAndTask(taskInfo)
-
- if err != nil {
- return err
- }
-
- node, found := sc.Nodes[hostname]
- if !found {
- return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
- task.UID, hostname)
- }
-
- originalStatus := task.Status
- if err := job.UpdateTaskStatus(task, schedulingapi.Binding); err != nil {
- return err
- }
-
- // Add task to the node.
- if err := node.AddTask(task); err != nil {
-		// After failing to add the task to the node we need to revert the task status from Binding,
-		// otherwise the task might be stuck in the Binding state indefinitely.
- if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
- klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
- "from %s to %s after failing to update Task on Node <%s>: %v",
- task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
- sc.resyncTask(task)
- }
- return err
- }
-
- p := task.Pod
- if !(task.TopologyPolicy == "" || task.TopologyPolicy == "none") {
- if err := sc.Binder.Bind(p, hostname); err != nil {
- sc.resyncTask(task)
- } else {
- sc.Recorder.Eventf(p, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", p.Namespace, p.Name, hostname)
- }
- } else {
- go func() {
- if err := sc.Binder.Bind(p, hostname); err != nil {
- sc.resyncTask(task)
- } else {
- sc.Recorder.Eventf(p, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", p.Namespace, p.Name, hostname)
- }
- }()
- }
-
- return nil
-}
-
-// BindPodGroup binds job to silo cluster
-func (sc *SchedulerCache) BindPodGroup(job *schedulingapi.JobInfo, cluster string) error {
- if _, err := sc.PodGroupBinder.Bind(job, cluster); err != nil {
- klog.Errorf("Bind job <%s> to cluster <%s> failed: %v", job.Name, cluster, err)
- return err
- }
- return nil
-}
-
-// GetPodVolumes gets the pod volumes on the host
-func (sc *SchedulerCache) GetPodVolumes(task *schedulingapi.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
- return sc.VolumeBinder.GetPodVolumes(task, node)
-}
-
-// AllocateVolumes allocates volume on the host to the task
-func (sc *SchedulerCache) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
- return sc.VolumeBinder.AllocateVolumes(task, hostname, podVolumes)
-}
-
-// BindVolumes binds volumes to the task
-func (sc *SchedulerCache) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
- return sc.VolumeBinder.BindVolumes(task, podVolumes)
-}
-
-// Client returns the kubernetes clientSet
-func (sc *SchedulerCache) Client() kubernetes.Interface {
- return sc.kubeClient
-}
-
-// SharedInformerFactory returns the scheduler SharedInformerFactory
-func (sc *SchedulerCache) SharedInformerFactory() informers.SharedInformerFactory {
- return sc.informerFactory
-}
-
-// UpdateSchedulerNumaInfo updates the NumaSchedulerInfo in the scheduler's node cache
-func (sc *SchedulerCache) UpdateSchedulerNumaInfo(AllocatedSets map[string]schedulingapi.ResNumaSets) error {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- for nodeName, sets := range AllocatedSets {
- if _, found := sc.Nodes[nodeName]; !found {
- continue
- }
-
- numaInfo := sc.Nodes[nodeName].NumaSchedulerInfo
- if numaInfo == nil {
- continue
- }
-
- numaInfo.Allocate(sets)
- }
- return nil
-}
-
-// taskUnschedulable updates pod status of pending task
-func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason, message string) error {
- pod := task.Pod
-
- condition := &v1.PodCondition{
- Type: v1.PodScheduled,
- Status: v1.ConditionFalse,
- Reason: reason, // Add more reasons in order to distinguish more specific scenario of pending tasks
- Message: message,
- }
-
- if podConditionHaveUpdate(&pod.Status, condition) {
- pod = pod.DeepCopy()
-
-		// The reason field in 'Events' should be "FailedScheduling"; there is no constant defined for this in
-		// k8s core, so the same string is used here.
- // The reason field in PodCondition can be "Unschedulable"
- sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
- if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil {
- return err
- }
- } else {
-		klog.V(4).Infof("task unschedulable %s/%s, message: %s, skipped because there is no condition update", pod.Namespace, pod.Name, message)
- }
-
- return nil
-}
-
-func (sc *SchedulerCache) deleteJob(job *schedulingapi.JobInfo) {
- klog.V(3).Infof("Try to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
-
- sc.deletedJobs.AddRateLimited(job)
-}
-
-func (sc *SchedulerCache) processCleanupJob() {
- obj, shutdown := sc.deletedJobs.Get()
- if shutdown {
- return
- }
-
- defer sc.deletedJobs.Done(obj)
-
- job, found := obj.(*schedulingapi.JobInfo)
- if !found {
- klog.Errorf("Failed to convert <%v> to *JobInfo", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if schedulingapi.JobTerminated(job) {
- delete(sc.Jobs, job.UID)
- klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name)
- } else {
- // Retry
- sc.deleteJob(job)
- }
-}
-
-func (sc *SchedulerCache) resyncTask(task *schedulingapi.TaskInfo) {
- sc.errTasks.AddRateLimited(task)
-}
-
-func (sc *SchedulerCache) processResyncTask() {
- obj, shutdown := sc.errTasks.Get()
- if shutdown {
- return
- }
-
- defer sc.errTasks.Done(obj)
-
- task, ok := obj.(*schedulingapi.TaskInfo)
- if !ok {
- klog.Errorf("failed to convert %v to *schedulingapi.TaskInfo", obj)
- return
- }
-
- if err := sc.syncTask(task); err != nil {
- klog.Errorf("Failed to sync pod <%v/%v>, retry it.", task.Namespace, task.Name)
- sc.resyncTask(task)
- }
-}
-
-// Snapshot returns the complete snapshot of the cluster from cache
-func (sc *SchedulerCache) Snapshot() *schedulingapi.ClusterInfo {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- snapshot := &schedulingapi.ClusterInfo{
- Nodes: make(map[string]*schedulingapi.NodeInfo),
- Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
- Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
- NamespaceInfo: make(map[schedulingapi.NamespaceName]*schedulingapi.NamespaceInfo),
- RevocableNodes: make(map[string]*schedulingapi.NodeInfo),
- NodeList: make([]string, len(sc.NodeList)),
- }
-
- copy(snapshot.NodeList, sc.NodeList)
- for _, value := range sc.Nodes {
- value.RefreshNumaSchedulerInfoByCrd()
- }
-
- for _, value := range sc.Nodes {
- if !value.Ready() {
- continue
- }
-
- snapshot.Nodes[value.Name] = value.Clone()
-
- if value.RevocableZone != "" {
- snapshot.RevocableNodes[value.Name] = snapshot.Nodes[value.Name]
- }
- }
-
- for _, value := range sc.Queues {
- snapshot.Queues[value.UID] = value.Clone()
- }
-
- var cloneJobLock sync.Mutex
- var wg sync.WaitGroup
-
- cloneJob := func(value *schedulingapi.JobInfo) {
- defer wg.Done()
- if value.PodGroup != nil {
- value.Priority = sc.defaultPriority
-
- priName := value.PodGroup.Spec.PriorityClassName
- if priorityClass, found := sc.PriorityClasses[priName]; found {
- value.Priority = priorityClass.Value
- }
-
- klog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>",
- value.Namespace, value.Name, priName, value.Priority)
- }
-
- clonedJob := value.Clone()
-
- cloneJobLock.Lock()
- snapshot.Jobs[value.UID] = clonedJob
- cloneJobLock.Unlock()
- }
-
- for _, value := range sc.NamespaceCollection {
- info := value.Snapshot()
- snapshot.NamespaceInfo[info.Name] = info
- klog.V(4).Infof("Namespace %s has weight %v",
- value.Name, info.GetWeight())
- }
-
- for _, value := range sc.Jobs {
- // If no scheduling spec, does not handle it.
- if value.PodGroup == nil {
- klog.V(4).Infof("The scheduling spec of Job <%v:%s/%s> is nil, ignore it.",
- value.UID, value.Namespace, value.Name)
-
- continue
- }
-
- if _, found := snapshot.Queues[value.Queue]; !found {
- klog.V(3).Infof("The Queue <%v> of Job <%v/%v> does not exist, ignore it.",
- value.Queue, value.Namespace, value.Name)
- continue
- }
-
- wg.Add(1)
- go cloneJob(value)
- }
- wg.Wait()
-
- klog.V(3).Infof("There are <%d> Jobs, <%d> Queues and <%d> Nodes in total for scheduling.",
- len(snapshot.Jobs), len(snapshot.Queues), len(snapshot.Nodes))
-
- return snapshot
-}
-
-// String returns information about the cache in a string format
-func (sc *SchedulerCache) String() string {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- str := "Cache:\n"
-
- if len(sc.Nodes) != 0 {
- str += "Nodes:\n"
- for _, n := range sc.Nodes {
- str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
- n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
-
- i := 0
- for _, p := range n.Tasks {
- str += fmt.Sprintf("\t\t %d: %v\n", i, p)
- i++
- }
- }
- }
-
- if len(sc.Jobs) != 0 {
- str += "Jobs:\n"
- for _, job := range sc.Jobs {
- str += fmt.Sprintf("\t %s\n", job)
- }
- }
-
- if len(sc.NamespaceCollection) != 0 {
- str += "Namespaces:\n"
- for _, ns := range sc.NamespaceCollection {
- info := ns.Snapshot()
- str += fmt.Sprintf("\t Namespace(%s) Weight(%v)\n",
- info.Name, info.Weight)
- }
- }
-
- if len(sc.NodeList) != 0 {
- str += fmt.Sprintf("NodeList: %v\n", sc.NodeList)
- }
-
- return str
-}
-
-// RecordJobStatusEvent records related events according to job status.
-func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo) {
- pgUnschedulable := job.PodGroup != nil &&
- (job.PodGroup.Status.Phase == scheduling.PodGroupUnknown ||
- job.PodGroup.Status.Phase == scheduling.PodGroupPending ||
- job.PodGroup.Status.Phase == scheduling.PodGroupInqueue)
-
- // If pending or unschedulable, record unschedulable event.
- if pgUnschedulable {
- msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
- len(job.TaskStatusIndex[schedulingapi.Pending]),
- len(job.Tasks),
- job.FitError())
- sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeWarning, string(scheduling.PodGroupUnschedulableType), msg)
- } else {
- sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupScheduled), string(scheduling.PodGroupReady))
- }
-
- baseErrorMessage := job.JobFitErrors
- if baseErrorMessage == "" {
- baseErrorMessage = schedulingapi.AllNodeUnavailableMsg
- }
-	// Update podCondition for Allocated, Pending and Pipelined tasks before the job is discarded
- for _, status := range []schedulingapi.TaskStatus{schedulingapi.Allocated, schedulingapi.Pending, schedulingapi.Pipelined} {
- for _, taskInfo := range job.TaskStatusIndex[status] {
- reason, msg := job.TaskSchedulingReason(taskInfo.UID)
- if len(msg) == 0 {
- msg = baseErrorMessage
- }
- if err := sc.taskUnschedulable(taskInfo, reason, msg); err != nil {
- klog.Errorf("Failed to update unschedulable task status <%s/%s>: %v",
- taskInfo.Namespace, taskInfo.Name, err)
- }
- }
- }
-}
-
-// UpdateJobStatus updates the status of the job and its tasks.
-func (sc *SchedulerCache) UpdateJobStatus(job *schedulingapi.JobInfo, updatePG bool) (*schedulingapi.JobInfo, error) {
- if updatePG {
- pg, err := sc.StatusUpdater.UpdatePodGroup(job.PodGroup)
- if err != nil {
- return nil, err
- }
- job.PodGroup = pg
- }
-
- sc.RecordJobStatusEvent(job)
-
- return job, nil
-}
-
-func (sc *SchedulerCache) recordPodGroupEvent(podGroup *schedulingapi.PodGroup, eventType, reason, msg string) {
- if podGroup == nil {
- return
- }
-
- pg := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&podGroup.PodGroup, pg, nil); err != nil {
-		klog.Errorf("Error while converting PodGroup to v1beta1.PodGroup with error: %v", err)
- return
- }
- sc.Recorder.Eventf(pg, eventType, reason, msg)
-}
-
-
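For orientation, the following is a minimal, hypothetical wiring sketch of the cache above. It assumes a kubeconfig path and that the `Cache` interface returned by `New` exposes the `Run`, `WaitForCacheSync` and `Snapshot` methods implemented by `SchedulerCache`:

```go
package main

import (
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/klog"

	"volcano.sh/volcano/pkg/scheduler/cache"
)

func main() {
	// Assumption: a kubeconfig lives at this path; any rest.Config works.
	restConfig, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config")
	if err != nil {
		klog.Fatalf("failed to build rest config: %v", err)
	}

	stopCh := make(chan struct{})
	defer close(stopCh)

	// New creates the SchedulerCache shown above and registers all informers.
	c := cache.New(restConfig, "volcano", "default") // scheduler name, default queue
	c.Run(stopCh)                                    // start informers plus the resync/cleanup loops
	if !c.WaitForCacheSync(stopCh) {
		klog.Fatal("failed to sync scheduler cache")
	}

	// Snapshot is a point-in-time view of jobs, queues and nodes for one scheduling cycle.
	snapshot := c.Snapshot()
	klog.Infof("snapshot has %d nodes", len(snapshot.Nodes))
}
```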
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cache
-
-import (
- "context"
- "fmt"
- "strconv"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/api/scheduling/v1beta1"
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
-
- nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/apis/pkg/apis/scheduling/scheme"
- schedulingv1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/apis/utils"
- schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-func isTerminated(status schedulingapi.TaskStatus) bool {
- return status == schedulingapi.Succeeded || status == schedulingapi.Failed
-}
-
-// getOrCreateJob returns the corresponding Job for pi if it exists, or creates and returns one if
-// pi.Pod.Spec.SchedulerName is the same as the volcano scheduler's name; otherwise it returns nil.
-func (sc *SchedulerCache) getOrCreateJob(pi *schedulingapi.TaskInfo) *schedulingapi.JobInfo {
- if len(pi.Job) == 0 {
- if pi.Pod.Spec.SchedulerName != sc.schedulerName {
-			klog.V(4).Infof("Pod %s/%s will not be scheduled by %s, skip creating PodGroup and Job for it",
- pi.Pod.Namespace, pi.Pod.Name, sc.schedulerName)
- }
- return nil
- }
-
- if _, found := sc.Jobs[pi.Job]; !found {
- sc.Jobs[pi.Job] = schedulingapi.NewJobInfo(pi.Job)
- }
-
- return sc.Jobs[pi.Job]
-}
-
-func (sc *SchedulerCache) addTask(pi *schedulingapi.TaskInfo) error {
- if len(pi.NodeName) != 0 {
- if _, found := sc.Nodes[pi.NodeName]; !found {
- sc.Nodes[pi.NodeName] = schedulingapi.NewNodeInfo(nil)
- sc.Nodes[pi.NodeName].Name = pi.NodeName
- }
-
- node := sc.Nodes[pi.NodeName]
- if !isTerminated(pi.Status) {
- if err := node.AddTask(pi); err != nil {
- return err
- }
- } else {
- klog.V(4).Infof("Pod <%v/%v> is in status %s.", pi.Namespace, pi.Name, pi.Status.String())
- }
- }
-
- job := sc.getOrCreateJob(pi)
- if job != nil {
- job.AddTaskInfo(pi)
- }
-
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) addPod(pod *v1.Pod) error {
- pi := schedulingapi.NewTaskInfo(pod)
-
- return sc.addTask(pi)
-}
-
-func (sc *SchedulerCache) syncTask(oldTask *schedulingapi.TaskInfo) error {
- newPod, err := sc.kubeClient.CoreV1().Pods(oldTask.Namespace).Get(context.TODO(), oldTask.Name, metav1.GetOptions{})
- if err != nil {
- if errors.IsNotFound(err) {
- err := sc.deleteTask(oldTask)
- if err != nil {
- klog.Errorf("Failed to delete Pod <%v/%v> and remove from cache: %s", oldTask.Namespace, oldTask.Name, err.Error())
- return err
- }
- klog.V(3).Infof("Pod <%v/%v> was deleted, removed from cache.", oldTask.Namespace, oldTask.Name)
-
- return nil
- }
- return fmt.Errorf("failed to get Pod <%v/%v>: err %v", oldTask.Namespace, oldTask.Name, err)
- }
-
- newTask := schedulingapi.NewTaskInfo(newPod)
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- return sc.updateTask(oldTask, newTask)
-}
-
-func (sc *SchedulerCache) updateTask(oldTask, newTask *schedulingapi.TaskInfo) error {
- if err := sc.deleteTask(oldTask); err != nil {
- klog.Warningf("Failed to delete task: %v", err)
- }
-
- return sc.addTask(newTask)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) updatePod(oldPod, newPod *v1.Pod) error {
- if err := sc.deletePod(oldPod); err != nil {
- return err
- }
-	// When a pod is deleted, its ownerReferences will be set to nil, just like an orphan pod
- if len(utils.GetController(newPod)) == 0 {
- newPod.OwnerReferences = oldPod.OwnerReferences
- }
- return sc.addPod(newPod)
-}
-
-func (sc *SchedulerCache) deleteTask(pi *schedulingapi.TaskInfo) error {
- var jobErr, nodeErr, numaErr error
-
- if len(pi.Job) != 0 {
- if job, found := sc.Jobs[pi.Job]; found {
- jobErr = job.DeleteTaskInfo(pi)
- } else {
- jobErr = fmt.Errorf("failed to find Job <%v> for Task %v/%v",
- pi.Job, pi.Namespace, pi.Name)
- }
- }
-
- if len(pi.NodeName) != 0 {
- node := sc.Nodes[pi.NodeName]
- if node != nil {
- nodeErr = node.RemoveTask(pi)
- }
- }
-
- if jobErr != nil || nodeErr != nil {
- return schedulingapi.MergeErrors(jobErr, nodeErr, numaErr)
- }
-
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deletePod(pod *v1.Pod) error {
- pi := schedulingapi.NewTaskInfo(pod)
-
- // Delete the Task in cache to handle Binding status.
- task := pi
- if job, found := sc.Jobs[pi.Job]; found {
- if t, found := job.Tasks[pi.UID]; found {
- task = t
- }
- }
- if err := sc.deleteTask(task); err != nil {
- klog.Warningf("Failed to delete task: %v", err)
- }
-
- // If job was terminated, delete it.
- if job, found := sc.Jobs[pi.Job]; found && schedulingapi.JobTerminated(job) {
- sc.deleteJob(job)
- }
-
- return nil
-}
-
-// AddPod adds a pod to the scheduler cache
-func (sc *SchedulerCache) AddPod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Pod: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.addPod(pod)
- if err != nil {
- klog.Errorf("Failed to add pod <%s/%s> into cache: %v",
- pod.Namespace, pod.Name, err)
- return
- }
- klog.V(3).Infof("Added pod <%s/%v> into cache.", pod.Namespace, pod.Name)
-}
-
-// UpdatePod updates a pod in the scheduler cache
-func (sc *SchedulerCache) UpdatePod(oldObj, newObj interface{}) {
- oldPod, ok := oldObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *v1.Pod: %v", oldObj)
- return
- }
- newPod, ok := newObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1.Pod: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.updatePod(oldPod, newPod)
- if err != nil {
- klog.Errorf("Failed to update pod %v in cache: %v", oldPod.Name, err)
- return
- }
-
- klog.V(4).Infof("Updated pod <%s/%v> in cache.", oldPod.Namespace, oldPod.Name)
-}
-
-// DeletePod deletes a pod from the scheduler cache
-func (sc *SchedulerCache) DeletePod(obj interface{}) {
- var pod *v1.Pod
- switch t := obj.(type) {
- case *v1.Pod:
- pod = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- pod, ok = t.Obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Pod: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1.Pod: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.deletePod(pod)
- if err != nil {
- klog.Errorf("Failed to delete pod %v from cache: %v", pod.Name, err)
- return
- }
-
- klog.V(3).Infof("Deleted pod <%s/%v> from cache.", pod.Namespace, pod.Name)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) addNode(node *v1.Node) error {
- if sc.Nodes[node.Name] != nil {
- sc.Nodes[node.Name].SetNode(node)
- } else {
- sc.Nodes[node.Name] = schedulingapi.NewNodeInfo(node)
- }
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) updateNode(oldNode, newNode *v1.Node) error {
- if sc.Nodes[newNode.Name] != nil {
- sc.Nodes[newNode.Name].SetNode(newNode)
- return nil
- }
-
- return fmt.Errorf("node <%s> does not exist", newNode.Name)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deleteNode(node *v1.Node) error {
- if _, ok := sc.Nodes[node.Name]; !ok {
- return fmt.Errorf("node <%s> does not exist", node.Name)
- }
-
- numaInfo := sc.Nodes[node.Name].NumaInfo
- if numaInfo != nil {
- klog.V(3).Infof("delete numatopo <%s/%s>", numaInfo.Namespace, numaInfo.Name)
- err := sc.vcClient.NodeinfoV1alpha1().Numatopologies().Delete(context.TODO(), numaInfo.Name, metav1.DeleteOptions{})
- if err != nil {
- klog.Errorf("delete numatopo <%s/%s> failed.", numaInfo.Namespace, numaInfo.Name)
- }
- }
-
- delete(sc.Nodes, node.Name)
-
- return nil
-}
-
-// AddNode adds a node to the scheduler cache
-func (sc *SchedulerCache) AddNode(obj interface{}) {
- node, ok := obj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Node: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.addNode(node)
- if err != nil {
- klog.Errorf("Failed to add node %s into cache: %v", node.Name, err)
- return
- }
- sc.NodeList = append(sc.NodeList, node.Name)
-}
-
-// UpdateNode updates a node in the scheduler cache
-func (sc *SchedulerCache) UpdateNode(oldObj, newObj interface{}) {
- oldNode, ok := oldObj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *v1.Node: %v", oldObj)
- return
- }
- newNode, ok := newObj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1.Node: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.updateNode(oldNode, newNode)
- if err != nil {
- klog.Errorf("Failed to update node %v in cache: %v", oldNode.Name, err)
- return
- }
-}
-
-// DeleteNode deletes a node from the scheduler cache
-func (sc *SchedulerCache) DeleteNode(obj interface{}) {
- var node *v1.Node
- switch t := obj.(type) {
- case *v1.Node:
- node = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- node, ok = t.Obj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Node: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1.Node: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.deleteNode(node)
- if err != nil {
- klog.Errorf("Failed to delete node %s from cache: %v", node.Name, err)
- return
- }
-
- for i, name := range sc.NodeList {
- if name == node.Name {
- sc.NodeList = append(sc.NodeList[:i], sc.NodeList[i+1:]...)
- break
- }
- }
-}
-
-func getJobID(pg *schedulingapi.PodGroup) schedulingapi.JobID {
- return schedulingapi.JobID(fmt.Sprintf("%s/%s", pg.Namespace, pg.Name))
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) setPodGroup(ss *schedulingapi.PodGroup) error {
- job := getJobID(ss)
- if _, found := sc.Jobs[job]; !found {
- sc.Jobs[job] = schedulingapi.NewJobInfo(job)
- }
-
- sc.Jobs[job].SetPodGroup(ss)
-
- // TODO(k82cn): set default queue in admission.
- if len(ss.Spec.Queue) == 0 {
- sc.Jobs[job].Queue = schedulingapi.QueueID(sc.defaultQueue)
- }
-
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) updatePodGroup(newPodGroup *schedulingapi.PodGroup) error {
- return sc.setPodGroup(newPodGroup)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deletePodGroup(id schedulingapi.JobID) error {
- job, found := sc.Jobs[id]
- if !found {
-		return fmt.Errorf("cannot find job %v", id)
- }
-
- // Unset SchedulingSpec
- job.UnsetPodGroup()
-
- sc.deleteJob(job)
-
- return nil
-}
-
-// AddPodGroupV1beta1 adds a podgroup to the scheduler cache
-func (sc *SchedulerCache) AddPodGroupV1beta1(obj interface{}) {
- ss, ok := obj.(*schedulingv1.PodGroup)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1.PodGroup: %v", obj)
- return
- }
-
- podgroup := scheduling.PodGroup{}
- if err := scheme.Scheme.Convert(ss, &podgroup, nil); err != nil {
- klog.Errorf("Failed to convert podgroup from %T to %T", ss, podgroup)
- return
- }
-
- pg := &schedulingapi.PodGroup{PodGroup: podgroup, Version: schedulingapi.PodGroupVersionV1Beta1}
- klog.V(4).Infof("Add PodGroup(%s) into cache, spec(%#v)", ss.Name, ss.Spec)
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if err := sc.setPodGroup(pg); err != nil {
- klog.Errorf("Failed to add PodGroup %s into cache: %v", ss.Name, err)
- return
- }
-}
-
-// UpdatePodGroupV1beta1 updates a podgroup in the scheduler cache
-func (sc *SchedulerCache) UpdatePodGroupV1beta1(oldObj, newObj interface{}) {
- oldSS, ok := oldObj.(*schedulingv1.PodGroup)
- if !ok {
-		klog.Errorf("Cannot convert oldObj to *schedulingv1.PodGroup: %v", oldObj)
- return
- }
- newSS, ok := newObj.(*schedulingv1.PodGroup)
- if !ok {
-		klog.Errorf("Cannot convert newObj to *schedulingv1.PodGroup: %v", newObj)
- return
- }
-
- if oldSS.ResourceVersion == newSS.ResourceVersion {
- return
- }
-
- podgroup := scheduling.PodGroup{}
- if err := scheme.Scheme.Convert(newSS, &podgroup, nil); err != nil {
- klog.Errorf("Failed to convert podgroup from %T to %T", newSS, podgroup)
- return
- }
-
- pg := &schedulingapi.PodGroup{PodGroup: podgroup, Version: schedulingapi.PodGroupVersionV1Beta1}
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if err := sc.updatePodGroup(pg); err != nil {
-		klog.Errorf("Failed to update PodGroup %s in cache: %v", pg.Name, err)
- return
- }
-}
-
-// DeletePodGroupV1beta1 deletes a podgroup from the scheduler cache
-func (sc *SchedulerCache) DeletePodGroupV1beta1(obj interface{}) {
- var ss *schedulingv1.PodGroup
- switch t := obj.(type) {
- case *schedulingv1.PodGroup:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*schedulingv1.PodGroup)
- if !ok {
- klog.Errorf("Cannot convert to podgroup: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to podgroup: %v", t)
- return
- }
-
- jobID := schedulingapi.JobID(fmt.Sprintf("%s/%s", ss.Namespace, ss.Name))
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if err := sc.deletePodGroup(jobID); err != nil {
- klog.Errorf("Failed to delete podgroup %s from cache: %v", ss.Name, err)
- return
- }
-}
-
-// AddQueueV1beta1 adds a queue to the scheduler cache
-func (sc *SchedulerCache) AddQueueV1beta1(obj interface{}) {
- ss, ok := obj.(*schedulingv1.Queue)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1.Queue: %v", obj)
- return
- }
-
- queue := &scheduling.Queue{}
- if err := scheme.Scheme.Convert(ss, queue, nil); err != nil {
- klog.Errorf("Failed to convert queue from %T to %T", ss, queue)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(4).Infof("Add Queue(%s) into cache, spec(%#v)", ss.Name, ss.Spec)
- sc.addQueue(queue)
-}
-
-// UpdateQueueV1beta1 updates a queue in the scheduler cache
-func (sc *SchedulerCache) UpdateQueueV1beta1(oldObj, newObj interface{}) {
- oldSS, ok := oldObj.(*schedulingv1.Queue)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *schedulingv1.Queue: %v", oldObj)
- return
- }
- newSS, ok := newObj.(*schedulingv1.Queue)
- if !ok {
- klog.Errorf("Cannot convert newObj to *schedulingv1.Queue: %v", newObj)
- return
- }
-
- if oldSS.ResourceVersion == newSS.ResourceVersion {
- return
- }
-
- newQueue := &scheduling.Queue{}
- if err := scheme.Scheme.Convert(newSS, newQueue, nil); err != nil {
- klog.Errorf("Failed to convert queue from %T to %T", newSS, newQueue)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.updateQueue(newQueue)
-}
-
-// DeleteQueueV1beta1 deletes a queue from the scheduler cache
-func (sc *SchedulerCache) DeleteQueueV1beta1(obj interface{}) {
- var ss *schedulingv1.Queue
- switch t := obj.(type) {
- case *schedulingv1.Queue:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*schedulingv1.Queue)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1.Queue: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *schedulingv1.Queue: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.deleteQueue(schedulingapi.QueueID(ss.Name))
-}
-
-func (sc *SchedulerCache) addQueue(queue *scheduling.Queue) {
- qi := schedulingapi.NewQueueInfo(queue)
- sc.Queues[qi.UID] = qi
-}
-
-func (sc *SchedulerCache) updateQueue(queue *scheduling.Queue) {
- sc.addQueue(queue)
-}
-
-func (sc *SchedulerCache) deleteQueue(id schedulingapi.QueueID) {
- delete(sc.Queues, id)
-}
-
-// DeletePriorityClass deletes a priorityclass from the scheduler cache
-func (sc *SchedulerCache) DeletePriorityClass(obj interface{}) {
- var ss *v1beta1.PriorityClass
- switch t := obj.(type) {
- case *v1beta1.PriorityClass:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*v1beta1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert to *v1beta1.PriorityClass: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1beta1.PriorityClass: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.deletePriorityClass(ss)
-}
-
-// UpdatePriorityClass updates a priorityclass in the scheduler cache
-func (sc *SchedulerCache) UpdatePriorityClass(oldObj, newObj interface{}) {
- oldSS, ok := oldObj.(*v1beta1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *v1beta1.PriorityClass: %v", oldObj)
-
- return
- }
-
- newSS, ok := newObj.(*v1beta1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1beta1.PriorityClass: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.deletePriorityClass(oldSS)
- sc.addPriorityClass(newSS)
-}
-
-// AddPriorityClass adds a priorityclass to the scheduler cache
-func (sc *SchedulerCache) AddPriorityClass(obj interface{}) {
- var ss *v1beta1.PriorityClass
- switch t := obj.(type) {
- case *v1beta1.PriorityClass:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*v1beta1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert to *v1beta1.PriorityClass: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1beta1.PriorityClass: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.addPriorityClass(ss)
-}
-
-func (sc *SchedulerCache) deletePriorityClass(pc *v1beta1.PriorityClass) {
- if pc.GlobalDefault {
- sc.defaultPriorityClass = nil
- sc.defaultPriority = 0
- }
-
- delete(sc.PriorityClasses, pc.Name)
-}
-
-func (sc *SchedulerCache) addPriorityClass(pc *v1beta1.PriorityClass) {
- if pc.GlobalDefault {
- if sc.defaultPriorityClass != nil {
- klog.Errorf("Updated default priority class from <%s> to <%s> forcefully.",
- sc.defaultPriorityClass.Name, pc.Name)
- }
- sc.defaultPriorityClass = pc
- sc.defaultPriority = pc.Value
- }
-
- sc.PriorityClasses[pc.Name] = pc
-}
-
-func (sc *SchedulerCache) updateResourceQuota(quota *v1.ResourceQuota) {
- collection, ok := sc.NamespaceCollection[quota.Namespace]
- if !ok {
- collection = schedulingapi.NewNamespaceCollection(quota.Namespace)
- sc.NamespaceCollection[quota.Namespace] = collection
- }
-
- collection.Update(quota)
-}
-
-func (sc *SchedulerCache) deleteResourceQuota(quota *v1.ResourceQuota) {
- collection, ok := sc.NamespaceCollection[quota.Namespace]
- if !ok {
- return
- }
-
- collection.Delete(quota)
-}
-
-// DeleteResourceQuota deletes a ResourceQuota from the scheduler cache
-func (sc *SchedulerCache) DeleteResourceQuota(obj interface{}) {
- var r *v1.ResourceQuota
- switch t := obj.(type) {
- case *v1.ResourceQuota:
- r = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- r, ok = t.Obj.(*v1.ResourceQuota)
- if !ok {
- klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(3).Infof("Delete ResourceQuota <%s/%v> in cache", r.Namespace, r.Name)
- sc.deleteResourceQuota(r)
-}
-
-// UpdateResourceQuota updates a ResourceQuota in the scheduler cache
-func (sc *SchedulerCache) UpdateResourceQuota(oldObj, newObj interface{}) {
- newR, ok := newObj.(*v1.ResourceQuota)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1.ResourceQuota: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(3).Infof("Update ResourceQuota <%s/%v> in cache, with spec: %v.", newR.Namespace, newR.Name, newR.Spec.Hard)
- sc.updateResourceQuota(newR)
-}
-
-// AddResourceQuota adds a ResourceQuota to the scheduler cache
-func (sc *SchedulerCache) AddResourceQuota(obj interface{}) {
- var r *v1.ResourceQuota
- switch t := obj.(type) {
- case *v1.ResourceQuota:
- r = t
- default:
- klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(3).Infof("Add ResourceQuota <%s/%v> in cache, with spec: %v.", r.Namespace, r.Name, r.Spec.Hard)
- sc.updateResourceQuota(r)
-}
-
-func getNumaInfo(srcInfo *nodeinfov1alpha1.Numatopology) *schedulingapi.NumatopoInfo {
- numaInfo := &schedulingapi.NumatopoInfo{
- Namespace: srcInfo.Namespace,
- Name: srcInfo.Name,
- Policies: make(map[nodeinfov1alpha1.PolicyName]string),
- NumaResMap: make(map[string]*schedulingapi.ResourceInfo),
- CPUDetail: topology.CPUDetails{},
- ResReserved: make(v1.ResourceList),
- }
-
- policies := srcInfo.Spec.Policies
- for name, policy := range policies {
- numaInfo.Policies[name] = policy
- }
-
- numaResMap := srcInfo.Spec.NumaResMap
- for name, resInfo := range numaResMap {
- tmp := schedulingapi.ResourceInfo{}
- tmp.Capacity = resInfo.Capacity
- tmp.Allocatable = cpuset.MustParse(resInfo.Allocatable)
- numaInfo.NumaResMap[name] = &tmp
- }
-
- cpuDetail := srcInfo.Spec.CPUDetail
- for key, detail := range cpuDetail {
- cpuID, _ := strconv.Atoi(key)
- numaInfo.CPUDetail[cpuID] = topology.CPUInfo{
- NUMANodeID: detail.NUMANodeID,
- SocketID: detail.SocketID,
- CoreID: detail.CoreID,
- }
- }
-
- resReserved, err := schedulingapi.ParseResourceList(srcInfo.Spec.ResReserved)
- if err != nil {
- klog.Errorf("ParseResourceList failed, err=%v", err)
- } else {
- numaInfo.ResReserved = resReserved
- }
-
- return numaInfo
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) addNumaInfo(info *nodeinfov1alpha1.Numatopology) error {
- if sc.Nodes[info.Name] == nil {
- sc.Nodes[info.Name] = schedulingapi.NewNodeInfo(nil)
- sc.Nodes[info.Name].Name = info.Name
- }
-
- if sc.Nodes[info.Name].NumaInfo == nil {
- sc.Nodes[info.Name].NumaInfo = getNumaInfo(info)
- }
-
- newLocalInfo := getNumaInfo(info)
- if sc.Nodes[info.Name].NumaInfo.Compare(newLocalInfo) {
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoMoreFlag
- } else {
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoLessFlag
- }
-
- sc.Nodes[info.Name].NumaInfo = newLocalInfo
-
- for resName, NumaResInfo := range sc.Nodes[info.Name].NumaInfo.NumaResMap {
- klog.V(3).Infof("resource %s Allocatable %v on node[%s] into cache", resName, NumaResInfo, info.Name)
- }
-
- klog.V(3).Infof("Policies %v on node[%s] into cache, change= %v",
- sc.Nodes[info.Name].NumaInfo.Policies, info.Name, sc.Nodes[info.Name].NumaChgFlag)
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deleteNumaInfo(info *nodeinfov1alpha1.Numatopology) {
- if sc.Nodes[info.Name] != nil {
- sc.Nodes[info.Name].NumaInfo = nil
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoResetFlag
-		klog.V(3).Infof("delete numainfo in cache for node<%s>", info.Name)
- }
-}
-
-// AddNumaInfoV1alpha1 adds numa information to the scheduler cache
-func (sc *SchedulerCache) AddNumaInfoV1alpha1(obj interface{}) {
- ss, ok := obj.(*nodeinfov1alpha1.Numatopology)
- if !ok {
-		klog.Errorf("Cannot convert obj to *nodeinfov1alpha1.Numatopology: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.addNumaInfo(ss)
-}
-
-// UpdateNumaInfoV1alpha1 updates numa information in the scheduler cache
-func (sc *SchedulerCache) UpdateNumaInfoV1alpha1(oldObj, newObj interface{}) {
- ss, ok := newObj.(*nodeinfov1alpha1.Numatopology)
- if !ok {
-		klog.Errorf("Cannot convert newObj to *nodeinfov1alpha1.Numatopology: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.addNumaInfo(ss)
-	klog.V(3).Infof("update numaInfo<%s> in cache, with spec: Policy: %v, resMap: %v", ss.Name, ss.Spec.Policies, ss.Spec.NumaResMap)
-}
-
-// DeleteNumaInfoV1alpha1 delete numa information from scheduler cache
-func (sc *SchedulerCache) DeleteNumaInfoV1alpha1(obj interface{}) {
- var ss *nodeinfov1alpha1.Numatopology
- switch t := obj.(type) {
- case *nodeinfov1alpha1.Numatopology:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*nodeinfov1alpha1.Numatopology)
- if !ok {
- klog.Errorf("Cannot convert to Numatopo: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to Numatopo: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.deleteNumaInfo(ss)
-	klog.V(3).Infof("Delete numaInfo<%s> from cache, with spec: Policy: %v, resMap: %v", ss.Name, ss.Spec.Policies, ss.Spec.NumaResMap)
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cache
-
-import v1 "k8s.io/api/core/v1"
-
-// responsibleForPod returns true if the pod has asked to be scheduled by the given scheduler.
-func responsibleForPod(pod *v1.Pod, schedulerName string) bool {
- return schedulerName == pod.Spec.SchedulerName
-}
-
-
-
-/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "strconv"
-
- "volcano.sh/volcano/pkg/scheduler/conf"
-
- "k8s.io/klog"
-)
-
-// Arguments map
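-// A usage sketch (illustrative, not part of this file): given Arguments{"factor": "1.5"},
-// a.GetFloat64(&factor, "factor") sets factor to 1.5; a missing or malformed value
-// leaves the target unchanged.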
-type Arguments map[string]string
-
-// GetInt get the integer value from string
-func (a Arguments) GetInt(ptr *int, key string) {
- if ptr == nil {
- return
- }
-
- argv, ok := a[key]
- if !ok || argv == "" {
- return
- }
-
- value, err := strconv.Atoi(argv)
- if err != nil {
- klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
- return
- }
-
- *ptr = value
-}
-
-// GetFloat64 get the float64 value from string
-func (a Arguments) GetFloat64(ptr *float64, key string) {
- if ptr == nil {
- return
- }
-
- argv, ok := a[key]
- if !ok || len(argv) == 0 {
- return
- }
-
- value, err := strconv.ParseFloat(argv, 64)
- if err != nil {
- klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
- return
- }
-
- *ptr = value
-}
-
-// GetBool get the bool value from string
-func (a Arguments) GetBool(ptr *bool, key string) {
- if ptr == nil {
- return
- }
-
- argv, ok := a[key]
- if !ok || argv == "" {
- return
- }
-
- value, err := strconv.ParseBool(argv)
- if err != nil {
- klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
- return
- }
-
- *ptr = value
-}
-
-// GetArgOfActionFromConf returns the arguments of the given action from the scheduler configuration
-func GetArgOfActionFromConf(configurations []conf.Configuration, actionName string) Arguments {
- for _, c := range configurations {
- if c.Name == actionName {
- return c.Arguments
- }
- }
-
- return nil
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "time"
-
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/cache"
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-// OpenSession start the session
-func OpenSession(cache cache.Cache, tiers []conf.Tier, configurations []conf.Configuration) *Session {
- ssn := openSession(cache)
- ssn.Tiers = tiers
- ssn.Configurations = configurations
-
- for _, tier := range tiers {
- for _, plugin := range tier.Plugins {
- if pb, found := GetPluginBuilder(plugin.Name); !found {
- klog.Errorf("Failed to get plugin %s.", plugin.Name)
- } else {
- plugin := pb(plugin.Arguments)
- ssn.plugins[plugin.Name()] = plugin
- onSessionOpenStart := time.Now()
- plugin.OnSessionOpen(ssn)
- metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionOpen, metrics.Duration(onSessionOpenStart))
- }
- }
- }
- return ssn
-}
-
-// CloseSession close the session
-func CloseSession(ssn *Session) {
- for _, plugin := range ssn.plugins {
- onSessionCloseStart := time.Now()
- plugin.OnSessionClose(ssn)
- metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionClose, metrics.Duration(onSessionCloseStart))
- }
-
- closeSession(ssn)
-}
-
-
-
-package framework
-
-import (
- "context"
- "math/rand"
- "reflect"
- "time"
-
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-const (
- jobUpdaterWorker = 16
-
- jobConditionUpdateTime = time.Minute
- jobConditionUpdateTimeJitter = 30 * time.Second
-)
-
-// TimeJitterAfter reports whether new is after old + duration + jitter
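-// For example, with duration=1m and maxJitter=30s, new must fall at least 1m
-// (plus a random jitter of up to 30s) after old for this to return true.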
-func TimeJitterAfter(new, old time.Time, duration, maxJitter time.Duration) bool {
- var jitter int64
- if maxJitter > 0 {
- jitter = rand.Int63n(int64(maxJitter))
- }
- return new.After(old.Add(duration + time.Duration(jitter)))
-}
-
-type jobUpdater struct {
- ssn *Session
- jobQueue []*api.JobInfo
-}
-
-func newJobUpdater(ssn *Session) *jobUpdater {
- queue := make([]*api.JobInfo, 0, len(ssn.Jobs))
- for _, job := range ssn.Jobs {
- queue = append(queue, job)
- }
-
- ju := &jobUpdater{
- ssn: ssn,
- jobQueue: queue,
- }
- return ju
-}
-
-func (ju *jobUpdater) UpdateAll() {
- workqueue.ParallelizeUntil(context.TODO(), jobUpdaterWorker, len(ju.jobQueue), ju.updateJob)
-}
-
-func isPodGroupConditionsUpdated(newCondition, oldCondition []scheduling.PodGroupCondition) bool {
- if len(newCondition) != len(oldCondition) {
- return true
- }
-
- for index, newCond := range newCondition {
- oldCond := oldCondition[index]
-
- newTime := newCond.LastTransitionTime
- oldTime := oldCond.LastTransitionTime
- if TimeJitterAfter(newTime.Time, oldTime.Time, jobConditionUpdateTime, jobConditionUpdateTimeJitter) {
- return true
- }
-
- // if newCond is not new enough, we treat it the same as the old one
- newCond.LastTransitionTime = oldTime
-
- // comparing should ignore the TransitionID
- newTransitionID := newCond.TransitionID
- newCond.TransitionID = oldCond.TransitionID
-
- shouldUpdate := !equality.Semantic.DeepEqual(&newCond, &oldCond)
-
- newCond.LastTransitionTime = newTime
- newCond.TransitionID = newTransitionID
- if shouldUpdate {
- return true
- }
- }
-
- return false
-}
-
-func isPodGroupStatusUpdated(newStatus, oldStatus scheduling.PodGroupStatus) bool {
- newCondition := newStatus.Conditions
- newStatus.Conditions = nil
- oldCondition := oldStatus.Conditions
- oldStatus.Conditions = nil
-
- return !equality.Semantic.DeepEqual(newStatus, oldStatus) || isPodGroupConditionsUpdated(newCondition, oldCondition)
-}
-
-// updateJob update specified job
-func (ju *jobUpdater) updateJob(index int) {
- job := ju.jobQueue[index]
- ssn := ju.ssn
-
- job.PodGroup.Status = jobStatus(ssn, job)
- oldStatus, found := ssn.podGroupStatus[job.UID]
- updatePG := !found || isPodGroupStatusUpdated(job.PodGroup.Status, oldStatus)
- if _, err := ssn.cache.UpdateJobStatus(job, updatePG); err != nil {
- klog.Errorf("Failed to update job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- }
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "fmt"
- "path/filepath"
- "plugin"
- "strings"
- "sync"
-
- "k8s.io/klog"
-)
-
-var pluginMutex sync.Mutex
-
-// PluginBuilder plugin management
-type PluginBuilder = func(Arguments) Plugin
-
-// Plugin management
-var pluginBuilders = map[string]PluginBuilder{}
-
-// RegisterPluginBuilder register the plugin
-func RegisterPluginBuilder(name string, pc PluginBuilder) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- pluginBuilders[name] = pc
-}
-
-// CleanupPluginBuilders cleans up all the plugin
-func CleanupPluginBuilders() {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- pluginBuilders = map[string]PluginBuilder{}
-}
-
-// GetPluginBuilder get the pluginbuilder by name
-func GetPluginBuilder(name string) (PluginBuilder, bool) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- pb, found := pluginBuilders[name]
- return pb, found
-}
-
-// LoadCustomPlugins loads custom implement plugins
-func LoadCustomPlugins(pluginsDir string) error {
- pluginPaths, _ := filepath.Glob(fmt.Sprintf("%s/*.so", pluginsDir))
- for _, pluginPath := range pluginPaths {
- pluginBuilder, err := loadPluginBuilder(pluginPath)
- if err != nil {
- return err
- }
- pluginName := getPluginName(pluginPath)
- RegisterPluginBuilder(pluginName, pluginBuilder)
- klog.V(4).Infof("Custom plugin %s loaded", pluginName)
- }
-
- return nil
-}
-
-func getPluginName(pluginPath string) string {
- return strings.TrimSuffix(filepath.Base(pluginPath), filepath.Ext(pluginPath))
-}
-
-func loadPluginBuilder(pluginPath string) (PluginBuilder, error) {
- plug, err := plugin.Open(pluginPath)
- if err != nil {
- return nil, err
- }
-
- symBuilder, err := plug.Lookup("New")
- if err != nil {
- return nil, err
- }
-
- builder, ok := symBuilder.(PluginBuilder)
- if !ok {
- return nil, fmt.Errorf("unexpected plugin: %s, failed to convert PluginBuilder `New`", pluginPath)
- }
-
- return builder, nil
-}
-
-// Action management
-var actionMap = map[string]Action{}
-
-// RegisterAction register action
-func RegisterAction(act Action) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- actionMap[act.Name()] = act
-}
-
-// GetAction get the action by name
-func GetAction(name string) (Action, bool) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- act, found := actionMap[name]
- return act, found
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/apimachinery/pkg/util/uuid"
- "k8s.io/client-go/informers"
- "k8s.io/client-go/kubernetes"
- "k8s.io/klog"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/cache"
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-// Session information for the current session
-type Session struct {
- UID types.UID
-
- kubeClient kubernetes.Interface
- cache cache.Cache
- informerFactory informers.SharedInformerFactory
-
- TotalResource *api.Resource
-	// podGroupStatus caches the podgroup status during scheduling.
-	// It should not be mutated after initialization.
- podGroupStatus map[api.JobID]scheduling.PodGroupStatus
-
- Jobs map[api.JobID]*api.JobInfo
- Nodes map[string]*api.NodeInfo
- RevocableNodes map[string]*api.NodeInfo
- Queues map[api.QueueID]*api.QueueInfo
- NamespaceInfo map[api.NamespaceName]*api.NamespaceInfo
-
- Tiers []conf.Tier
- Configurations []conf.Configuration
- NodeList []*api.NodeInfo
-
- plugins map[string]Plugin
- eventHandlers []*EventHandler
- jobOrderFns map[string]api.CompareFn
- queueOrderFns map[string]api.CompareFn
- taskOrderFns map[string]api.CompareFn
- namespaceOrderFns map[string]api.CompareFn
- clusterOrderFns map[string]api.CompareFn
- predicateFns map[string]api.PredicateFn
- bestNodeFns map[string]api.BestNodeFn
- nodeOrderFns map[string]api.NodeOrderFn
- batchNodeOrderFns map[string]api.BatchNodeOrderFn
- nodeMapFns map[string]api.NodeMapFn
- nodeReduceFns map[string]api.NodeReduceFn
- preemptableFns map[string]api.EvictableFn
- reclaimableFns map[string]api.EvictableFn
- overusedFns map[string]api.ValidateFn
- underUsedFns map[string]api.UnderUsedResourceFn
- jobReadyFns map[string]api.ValidateFn
- jobPipelinedFns map[string]api.VoteFn
- jobValidFns map[string]api.ValidateExFn
- jobEnqueueableFns map[string]api.VoteFn
- jobEnqueuedFns map[string]api.JobEnqueuedFn
- targetJobFns map[string]api.TargetJobFn
- reservedNodesFns map[string]api.ReservedNodesFn
- victimTasksFns map[string]api.VictimTasksFn
- jobStarvingFns map[string]api.ValidateFn
-}
-
-func openSession(cache cache.Cache) *Session {
- ssn := &Session{
- UID: uuid.NewUUID(),
- kubeClient: cache.Client(),
- cache: cache,
- informerFactory: cache.SharedInformerFactory(),
-
- TotalResource: api.EmptyResource(),
- podGroupStatus: map[api.JobID]scheduling.PodGroupStatus{},
-
- Jobs: map[api.JobID]*api.JobInfo{},
- Nodes: map[string]*api.NodeInfo{},
- RevocableNodes: map[string]*api.NodeInfo{},
- Queues: map[api.QueueID]*api.QueueInfo{},
-
- plugins: map[string]Plugin{},
- jobOrderFns: map[string]api.CompareFn{},
- queueOrderFns: map[string]api.CompareFn{},
- taskOrderFns: map[string]api.CompareFn{},
- namespaceOrderFns: map[string]api.CompareFn{},
- clusterOrderFns: map[string]api.CompareFn{},
- predicateFns: map[string]api.PredicateFn{},
- bestNodeFns: map[string]api.BestNodeFn{},
- nodeOrderFns: map[string]api.NodeOrderFn{},
- batchNodeOrderFns: map[string]api.BatchNodeOrderFn{},
- nodeMapFns: map[string]api.NodeMapFn{},
- nodeReduceFns: map[string]api.NodeReduceFn{},
- preemptableFns: map[string]api.EvictableFn{},
- reclaimableFns: map[string]api.EvictableFn{},
- overusedFns: map[string]api.ValidateFn{},
- underUsedFns: map[string]api.UnderUsedResourceFn{},
- jobReadyFns: map[string]api.ValidateFn{},
- jobPipelinedFns: map[string]api.VoteFn{},
- jobValidFns: map[string]api.ValidateExFn{},
- jobEnqueueableFns: map[string]api.VoteFn{},
- jobEnqueuedFns: map[string]api.JobEnqueuedFn{},
- targetJobFns: map[string]api.TargetJobFn{},
- reservedNodesFns: map[string]api.ReservedNodesFn{},
- victimTasksFns: map[string]api.VictimTasksFn{},
- jobStarvingFns: map[string]api.ValidateFn{},
- }
-
- snapshot := cache.Snapshot()
-
- ssn.Jobs = snapshot.Jobs
- for _, job := range ssn.Jobs {
- // only conditions will be updated periodically
- if job.PodGroup != nil && job.PodGroup.Status.Conditions != nil {
- ssn.podGroupStatus[job.UID] = job.PodGroup.Status
- }
-
- if vjr := ssn.JobValid(job); vjr != nil {
- if !vjr.Pass {
- jc := &scheduling.PodGroupCondition{
- Type: scheduling.PodGroupUnschedulableType,
- Status: v1.ConditionTrue,
- LastTransitionTime: metav1.Now(),
- TransitionID: string(ssn.UID),
- Reason: vjr.Reason,
- Message: vjr.Message,
- }
-
- if err := ssn.UpdatePodGroupCondition(job, jc); err != nil {
- klog.Errorf("Failed to update job condition: %v", err)
- }
- }
-
- delete(ssn.Jobs, job.UID)
- }
- }
- ssn.NodeList = util.GetNodeList(snapshot.Nodes, snapshot.NodeList)
- ssn.Nodes = snapshot.Nodes
- ssn.RevocableNodes = snapshot.RevocableNodes
- ssn.Queues = snapshot.Queues
- ssn.NamespaceInfo = snapshot.NamespaceInfo
-	// Calculate all nodes' resources only once in each schedule cycle; plugins can clone it when needed.
- for _, n := range ssn.Nodes {
- ssn.TotalResource.Add(n.Allocatable)
- }
-
- klog.V(3).Infof("Open Session %v with <%d> Job and <%d> Queues",
- ssn.UID, len(ssn.Jobs), len(ssn.Queues))
-
- return ssn
-}
-
-func closeSession(ssn *Session) {
- ju := newJobUpdater(ssn)
- ju.UpdateAll()
-
- ssn.Jobs = nil
- ssn.Nodes = nil
- ssn.RevocableNodes = nil
- ssn.plugins = nil
- ssn.eventHandlers = nil
- ssn.jobOrderFns = nil
- ssn.namespaceOrderFns = nil
- ssn.queueOrderFns = nil
- ssn.clusterOrderFns = nil
- ssn.NodeList = nil
- ssn.TotalResource = nil
-
- klog.V(3).Infof("Close Session %v", ssn.UID)
-}
-
-func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
- status := jobInfo.PodGroup.Status
-
- unschedulable := false
- for _, c := range status.Conditions {
- if c.Type == scheduling.PodGroupUnschedulableType &&
- c.Status == v1.ConditionTrue &&
- c.TransitionID == string(ssn.UID) {
- unschedulable = true
- break
- }
- }
-
- // If running tasks && unschedulable, unknown phase
- if len(jobInfo.TaskStatusIndex[api.Running]) != 0 && unschedulable {
- status.Phase = scheduling.PodGroupUnknown
- } else {
- allocated := 0
- for status, tasks := range jobInfo.TaskStatusIndex {
- if api.AllocatedStatus(status) || status == api.Succeeded {
- allocated += len(tasks)
- }
- }
-
-		// If enough resources have been allocated, the job is running.
- if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember {
- status.Phase = scheduling.PodGroupRunning
- } else if jobInfo.PodGroup.Status.Phase != scheduling.PodGroupInqueue {
- status.Phase = scheduling.PodGroupPending
- }
- }
-
- status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]))
- status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed]))
- status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded]))
-
- return status
-}
-
-// Statement returns new statement object
-func (ssn *Session) Statement() *Statement {
- return &Statement{
- ssn: ssn,
- }
-}
-
-// Pipeline the task to the node in the session
-func (ssn *Session) Pipeline(task *api.TaskInfo, hostname string) error {
- // Only update status in session
- job, found := ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pipelined, ssn.UID, err)
- return err
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s when binding", task.Job)
- }
-
- task.NodeName = hostname
-
- if node, found := ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, ssn.UID, err)
- return err
- }
- klog.V(3).Infof("After added Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
-		klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, ssn.UID)
- return fmt.Errorf("failed to find node %s", hostname)
- }
-
- for _, eh := range ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- return nil
-}
-
-// Allocate the task to the node in the session
-func (ssn *Session) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) error {
- podVolumes, err := ssn.cache.GetPodVolumes(task, nodeInfo.Node)
- if err != nil {
- return err
- }
-
- hostname := nodeInfo.Name
- if err := ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
- return err
- }
-
- task.Pod.Spec.NodeName = hostname
- task.PodVolumes = podVolumes
-
- // Only update status in session
- job, found := ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Allocated, ssn.UID, err)
- return err
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- task.NodeName = hostname
-
- if node, found := ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, ssn.UID, err)
- return err
- }
- klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
-		klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, ssn.UID)
- return fmt.Errorf("failed to find node %s", hostname)
- }
-
- // Callbacks
- for _, eh := range ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- if ssn.JobReady(job) {
- for _, task := range job.TaskStatusIndex[api.Allocated] {
- if err := ssn.dispatch(task, podVolumes); err != nil {
- klog.Errorf("Failed to dispatch task <%v/%v>: %v",
- task.Namespace, task.Name, err)
- return err
- }
- }
- }
-
- return nil
-}
-
-func (ssn *Session) dispatch(task *api.TaskInfo, volumes *volumescheduling.PodVolumes) error {
- if err := ssn.cache.BindVolumes(task, volumes); err != nil {
- return err
- }
-
- if err := ssn.cache.Bind(task, task.NodeName); err != nil {
- return err
- }
-
- // Update status in session
- if job, found := ssn.Jobs[task.Job]; found {
- if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Binding, ssn.UID, err)
- return err
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
- return nil
-}
-
-// Evict the task in the session
-func (ssn *Session) Evict(reclaimee *api.TaskInfo, reason string) error {
- if err := ssn.cache.Evict(reclaimee, reason); err != nil {
- return err
- }
-
- // Update status in session
- job, found := ssn.Jobs[reclaimee.Job]
- if found {
- if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, api.Releasing, ssn.UID, err)
- return err
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- reclaimee.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s", reclaimee.Job)
- }
-
- // Update task in node.
- if node, found := ssn.Nodes[reclaimee.NodeName]; found {
- if err := node.UpdateTask(reclaimee); err != nil {
- klog.Errorf("Failed to update task <%v/%v> in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, ssn.UID, err)
- return err
- }
- }
-
- for _, eh := range ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: reclaimee,
- })
- }
- }
-
- return nil
-}
-
-// BindPodGroup bind PodGroup to specified cluster
-func (ssn *Session) BindPodGroup(job *api.JobInfo, cluster string) error {
- return ssn.cache.BindPodGroup(job, cluster)
-}
-
-// UpdatePodGroupCondition update job condition accordingly.
-func (ssn *Session) UpdatePodGroupCondition(jobInfo *api.JobInfo, cond *scheduling.PodGroupCondition) error {
- job, ok := ssn.Jobs[jobInfo.UID]
- if !ok {
- return fmt.Errorf("failed to find job <%s/%s>", jobInfo.Namespace, jobInfo.Name)
- }
-
- index := -1
- for i, c := range job.PodGroup.Status.Conditions {
- if c.Type == cond.Type {
- index = i
- break
- }
- }
-
- // Update condition to the new condition.
- if index < 0 {
- job.PodGroup.Status.Conditions = append(job.PodGroup.Status.Conditions, *cond)
- } else {
- job.PodGroup.Status.Conditions[index] = *cond
- }
-
- return nil
-}
-
-// AddEventHandler add event handlers
-func (ssn *Session) AddEventHandler(eh *EventHandler) {
- ssn.eventHandlers = append(ssn.eventHandlers, eh)
-}
-
-// UpdateSchedulerNumaInfo update SchedulerNumaInfo
-func (ssn *Session) UpdateSchedulerNumaInfo(AllocatedSets map[string]api.ResNumaSets) {
- ssn.cache.UpdateSchedulerNumaInfo(AllocatedSets)
-}
-
-// KubeClient returns the kubernetes client
-func (ssn Session) KubeClient() kubernetes.Interface {
- return ssn.kubeClient
-}
-
-// InformerFactory returns the scheduler ShareInformerFactory
-func (ssn Session) InformerFactory() informers.SharedInformerFactory {
- return ssn.informerFactory
-}
-
-// String returns nodes and jobs information in the session
-func (ssn Session) String() string {
- msg := fmt.Sprintf("Session %v: \n", ssn.UID)
-
- for _, job := range ssn.Jobs {
- msg = fmt.Sprintf("%s%v\n", msg, job)
- }
-
- for _, node := range ssn.Nodes {
- msg = fmt.Sprintf("%s%v\n", msg, node)
- }
-
- return msg
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// AddJobOrderFn add job order function
-func (ssn *Session) AddJobOrderFn(name string, cf api.CompareFn) {
- ssn.jobOrderFns[name] = cf
-}
-
-// AddQueueOrderFn add queue order function
-func (ssn *Session) AddQueueOrderFn(name string, qf api.CompareFn) {
- ssn.queueOrderFns[name] = qf
-}
-
-// AddClusterOrderFn add cluster order function
-func (ssn *Session) AddClusterOrderFn(name string, qf api.CompareFn) {
- ssn.clusterOrderFns[name] = qf
-}
-
-// AddTaskOrderFn add task order function
-func (ssn *Session) AddTaskOrderFn(name string, cf api.CompareFn) {
- ssn.taskOrderFns[name] = cf
-}
-
-// AddNamespaceOrderFn add namespace order function
-func (ssn *Session) AddNamespaceOrderFn(name string, cf api.CompareFn) {
- ssn.namespaceOrderFns[name] = cf
-}
-
-// AddPreemptableFn add preemptable function
-func (ssn *Session) AddPreemptableFn(name string, cf api.EvictableFn) {
- ssn.preemptableFns[name] = cf
-}
-
-// AddReclaimableFn add Reclaimable function
-func (ssn *Session) AddReclaimableFn(name string, rf api.EvictableFn) {
- ssn.reclaimableFns[name] = rf
-}
-
-// AddJobReadyFn add JobReady function
-func (ssn *Session) AddJobReadyFn(name string, vf api.ValidateFn) {
- ssn.jobReadyFns[name] = vf
-}
-
-// AddJobPipelinedFn add pipelined function
-func (ssn *Session) AddJobPipelinedFn(name string, vf api.VoteFn) {
- ssn.jobPipelinedFns[name] = vf
-}
-
-// AddPredicateFn add Predicate function
-func (ssn *Session) AddPredicateFn(name string, pf api.PredicateFn) {
- ssn.predicateFns[name] = pf
-}
-
-// AddBestNodeFn add BestNode function
-func (ssn *Session) AddBestNodeFn(name string, pf api.BestNodeFn) {
- ssn.bestNodeFns[name] = pf
-}
-
-// AddNodeOrderFn add Node order function
-func (ssn *Session) AddNodeOrderFn(name string, pf api.NodeOrderFn) {
- ssn.nodeOrderFns[name] = pf
-}
-
-// AddBatchNodeOrderFn add Batch Node order function
-func (ssn *Session) AddBatchNodeOrderFn(name string, pf api.BatchNodeOrderFn) {
- ssn.batchNodeOrderFns[name] = pf
-}
-
-// AddNodeMapFn add Node map function
-func (ssn *Session) AddNodeMapFn(name string, pf api.NodeMapFn) {
- ssn.nodeMapFns[name] = pf
-}
-
-// AddNodeReduceFn add Node reduce function
-func (ssn *Session) AddNodeReduceFn(name string, pf api.NodeReduceFn) {
- ssn.nodeReduceFns[name] = pf
-}
-
-// AddOverusedFn add overused function
-func (ssn *Session) AddOverusedFn(name string, fn api.ValidateFn) {
- ssn.overusedFns[name] = fn
-}
-
-// AddUnderusedResourceFn add underused function
-func (ssn *Session) AddUnderusedResourceFn(name string, fn api.UnderUsedResourceFn) {
- ssn.underUsedFns[name] = fn
-}
-
-// AddJobValidFn add jobvalid function
-func (ssn *Session) AddJobValidFn(name string, fn api.ValidateExFn) {
- ssn.jobValidFns[name] = fn
-}
-
-// AddJobEnqueueableFn add jobenqueueable function
-func (ssn *Session) AddJobEnqueueableFn(name string, fn api.VoteFn) {
- ssn.jobEnqueueableFns[name] = fn
-}
-
-// AddJobEnqueuedFn add jobEnqueued function
-func (ssn *Session) AddJobEnqueuedFn(name string, fn api.JobEnqueuedFn) {
- ssn.jobEnqueuedFns[name] = fn
-}
-
-// AddTargetJobFn add targetjob function
-func (ssn *Session) AddTargetJobFn(name string, fn api.TargetJobFn) {
- ssn.targetJobFns[name] = fn
-}
-
-// AddReservedNodesFn add reservedNodesFn function
-func (ssn *Session) AddReservedNodesFn(name string, fn api.ReservedNodesFn) {
- ssn.reservedNodesFns[name] = fn
-}
-
-// AddVictimTasksFns add victimTasksFns function
-func (ssn *Session) AddVictimTasksFns(name string, fn api.VictimTasksFn) {
- ssn.victimTasksFns[name] = fn
-}
-
-// AddJobStarvingFns add jobStarvingFns function
-func (ssn *Session) AddJobStarvingFns(name string, fn api.ValidateFn) {
- ssn.jobStarvingFns[name] = fn
-}
-
-// Reclaimable invoke reclaimable function of the plugins
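-// Within a tier, the victim set is the intersection of the candidates returned by
-// all enabled plugins; the first tier that yields a non-nil victim set decides.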
-func (ssn *Session) Reclaimable(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) []*api.TaskInfo {
- var victims []*api.TaskInfo
- var init bool
-
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledReclaimable) {
- continue
- }
- rf, found := ssn.reclaimableFns[plugin.Name]
- if !found {
- continue
- }
-
- candidates, abstain := rf(reclaimer, reclaimees)
- if abstain == 0 {
- continue
- }
- if len(candidates) == 0 {
- victims = nil
- break
- }
- if !init {
- victims = candidates
- init = true
- } else {
- var intersection []*api.TaskInfo
- // Get intersection of victims and candidates.
- for _, v := range victims {
- for _, c := range candidates {
- if v.UID == c.UID {
- intersection = append(intersection, v)
- }
- }
- }
-
- // Update victims to intersection
- victims = intersection
- }
- }
-		// Plugins in this tier have made a decision if victims is not nil
- if victims != nil {
- return victims
- }
- }
-
- return victims
-}
-
-// Preemptable invoke preemptable function of the plugins
-func (ssn *Session) Preemptable(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) []*api.TaskInfo {
- var victims []*api.TaskInfo
- var init bool
-
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledPreemptable) {
- continue
- }
-
- pf, found := ssn.preemptableFns[plugin.Name]
- if !found {
- continue
- }
- candidates, abstain := pf(preemptor, preemptees)
- if abstain == 0 {
- continue
- }
-			// intersection will be nil if its length is 0; no further checks are needed
- if len(candidates) == 0 {
- victims = nil
- break
- }
-
- if !init {
- victims = candidates
- init = true
- } else {
- var intersection []*api.TaskInfo
- // Get intersection of victims and candidates.
- for _, v := range victims {
- for _, c := range candidates {
- if v.UID == c.UID {
- intersection = append(intersection, v)
- }
- }
- }
-
- // Update victims to intersection
- victims = intersection
- }
- }
-		// Plugins in this tier have made a decision if victims is not nil
- if victims != nil {
- return victims
- }
- }
-
- return victims
-}
-
-// Overused invoke overused function of the plugins
-func (ssn *Session) Overused(queue *api.QueueInfo) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- of, found := ssn.overusedFns[plugin.Name]
- if !found {
- continue
- }
- if of(queue) {
- return true
- }
- }
- }
-
- return false
-}
-
-// UnderusedResources invoke underused function of the plugins
-// Returns:
-// * nil if no `UnderUsedResourceFn` is registered
-// * [] if no under-used resources
-func (ssn *Session) UnderusedResources(queue *api.QueueInfo) api.ResourceNameList {
- if len(ssn.underUsedFns) == 0 {
- return nil
- }
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- of, found := ssn.underUsedFns[plugin.Name]
- if !found {
- continue
- }
- underUsedResourceList := of(queue)
- return underUsedResourceList
- }
- }
-
- return api.ResourceNameList{}
-}
-
-// JobReady invoke jobready function of the plugins
-func (ssn *Session) JobReady(obj interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobReady) {
- continue
- }
- jrf, found := ssn.jobReadyFns[plugin.Name]
- if !found {
- continue
- }
-
- if !jrf(obj) {
- return false
- }
- }
- }
-
- return true
-}
-
-// JobPipelined invoke pipelined function of the plugins
-// Check whether the job has obtained enough resources to run
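-// Voting: any negative vote rejects immediately; a positive vote within a tier
-// permits without consulting later tiers; if every plugin abstains, the job is permitted.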
-func (ssn *Session) JobPipelined(obj interface{}) bool {
- var hasFound bool
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobPipelined) {
- continue
- }
- jrf, found := ssn.jobPipelinedFns[plugin.Name]
- if !found {
- continue
- }
-
- res := jrf(obj)
- if res < 0 {
- return false
- }
- if res > 0 {
- hasFound = true
- }
- }
-		// If any plugin votes permit while the others abstain,
-		// permit the job to be pipelined and do not check the next tier.
- if hasFound {
- return true
- }
- }
-
- return true
-}
-
-// JobStarving invoke jobStarving function of the plugins
-// Check if the job still needs more resources
-func (ssn *Session) JobStarving(obj interface{}) bool {
- var hasFound bool
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobStarving) {
- continue
- }
- jrf, found := ssn.jobStarvingFns[plugin.Name]
- if !found {
- continue
- }
- hasFound = true
-
- if !jrf(obj) {
- return false
- }
- }
-		// a plugin in this tier registered a function
- if hasFound {
- return true
- }
- }
-
- return false
-}
-
-// JobValid invoke jobvalid function of the plugins
-func (ssn *Session) JobValid(obj interface{}) *api.ValidateResult {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- jrf, found := ssn.jobValidFns[plugin.Name]
- if !found {
- continue
- }
-
- if vr := jrf(obj); vr != nil && !vr.Pass {
- return vr
- }
- }
- }
-
- return nil
-}
-
-// JobEnqueueable invoke jobEnqueueableFns function of the plugins
-func (ssn *Session) JobEnqueueable(obj interface{}) bool {
- var hasFound bool
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobEnqueued) {
- continue
- }
- fn, found := ssn.jobEnqueueableFns[plugin.Name]
- if !found {
- continue
- }
-
- res := fn(obj)
- if res < 0 {
- return false
- }
- if res > 0 {
- hasFound = true
- }
- }
-		// If any plugin votes permit while the others abstain,
-		// permit the job to be enqueueable and do not check the next tier.
- if hasFound {
- return true
- }
- }
-
- return true
-}
-
-// JobEnqueued invoke jobEnqueuedFns function of the plugins
-func (ssn *Session) JobEnqueued(obj interface{}) {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobEnqueued) {
- continue
- }
- fn, found := ssn.jobEnqueuedFns[plugin.Name]
- if !found {
- continue
- }
-
- fn(obj)
- }
- }
-}
-
-// TargetJob invoke targetJobFns function of the plugins
-func (ssn *Session) TargetJob(jobs []*api.JobInfo) *api.JobInfo {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledTargetJob) {
- continue
- }
- fn, found := ssn.targetJobFns[plugin.Name]
- if !found {
- continue
- }
- return fn(jobs)
- }
- }
- return nil
-}
-
-// VictimTasks invoke victimTasks function of the plugins
-func (ssn *Session) VictimTasks() []*api.TaskInfo {
- var victims []*api.TaskInfo
- var init bool
-
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledVictim) {
- continue
- }
-
- pf, found := ssn.victimTasksFns[plugin.Name]
- if !found {
- continue
- }
- candidates := pf()
- if !init {
- victims = candidates
- init = true
- } else {
- var intersection []*api.TaskInfo
- // Get intersection of victims and candidates.
- for _, v := range victims {
- for _, c := range candidates {
- if v.UID == c.UID {
- intersection = append(intersection, v)
- }
- }
- }
-
- // Update victims to intersection
- victims = intersection
- }
- }
-		// Plugins in this tier have made a decision if victims is not nil
- if victims != nil {
- return victims
- }
- }
-
- return victims
-}
-
-// ReservedNodes invoke ReservedNodes function of the plugins
-func (ssn *Session) ReservedNodes() {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledReservedNodes) {
- continue
- }
- fn, found := ssn.reservedNodesFns[plugin.Name]
- if !found {
- continue
- }
- fn()
- }
- }
-}
-
-// JobOrderFn invoke joborder function of the plugins
-func (ssn *Session) JobOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobOrder) {
- continue
- }
- jof, found := ssn.jobOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := jof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
- // If no job order funcs, order job by CreationTimestamp first, then by UID.
- lv := l.(*api.JobInfo)
- rv := r.(*api.JobInfo)
- if lv.CreationTimestamp.Equal(&rv.CreationTimestamp) {
- return lv.UID < rv.UID
- }
- return lv.CreationTimestamp.Before(&rv.CreationTimestamp)
-}
-
-// NamespaceOrderFn invoke namespaceorder function of the plugins
-func (ssn *Session) NamespaceOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNamespaceOrder) {
- continue
- }
- nof, found := ssn.namespaceOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := nof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
-	// TODO(lminzhw): if all NamespaceOrderFns treat these two namespaces as the same,
-	// we should make the job order take effect across namespaces,
-	// or just schedule namespaces one by one.
- lv := l.(api.NamespaceName)
- rv := r.(api.NamespaceName)
- return lv < rv
-}
-
-// ClusterOrderFn invoke ClusterOrderFn function of the plugins
-func (ssn *Session) ClusterOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledClusterOrder) {
- continue
- }
- cof, found := ssn.clusterOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := cof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
-	// If no cluster order funcs, order clusters by Name
- lv := l.(*scheduling.Cluster)
- rv := r.(*scheduling.Cluster)
- return lv.Name < rv.Name
-}
-
-// QueueOrderFn invoke queueorder function of the plugins
-func (ssn *Session) QueueOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledQueueOrder) {
- continue
- }
- qof, found := ssn.queueOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := qof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
- // If no queue order funcs, order queue by CreationTimestamp first, then by UID.
- lv := l.(*api.QueueInfo)
- rv := r.(*api.QueueInfo)
- if lv.Queue.CreationTimestamp.Equal(&rv.Queue.CreationTimestamp) {
- return lv.UID < rv.UID
- }
- return lv.Queue.CreationTimestamp.Before(&rv.Queue.CreationTimestamp)
-}
-
-// TaskCompareFns invoke taskorder function of the plugins
-func (ssn *Session) TaskCompareFns(l, r interface{}) int {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledTaskOrder) {
- continue
- }
- tof, found := ssn.taskOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := tof(l, r); j != 0 {
- return j
- }
- }
- }
-
- return 0
-}
-
-// TaskOrderFn invoke taskorder function of the plugins
-func (ssn *Session) TaskOrderFn(l, r interface{}) bool {
- if res := ssn.TaskCompareFns(l, r); res != 0 {
- return res < 0
- }
-
- // If no task order funcs, order task by CreationTimestamp first, then by UID.
- lv := l.(*api.TaskInfo)
- rv := r.(*api.TaskInfo)
- if lv.Pod.CreationTimestamp.Equal(&rv.Pod.CreationTimestamp) {
- return lv.UID < rv.UID
- }
- return lv.Pod.CreationTimestamp.Before(&rv.Pod.CreationTimestamp)
-}
-
-// PredicateFn invoke predicate function of the plugins
-func (ssn *Session) PredicateFn(task *api.TaskInfo, node *api.NodeInfo) error {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledPredicate) {
- continue
- }
- pfn, found := ssn.predicateFns[plugin.Name]
- if !found {
- continue
- }
- err := pfn(task, node)
- if err != nil {
- return err
- }
- }
- }
- return nil
-}
-
-// BestNodeFn invoke bestNode function of the plugins
-func (ssn *Session) BestNodeFn(task *api.TaskInfo, nodeScores map[float64][]*api.NodeInfo) *api.NodeInfo {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledBestNode) {
- continue
- }
- pfn, found := ssn.bestNodeFns[plugin.Name]
- if !found {
- continue
- }
-			// Only the first plugin that enables and implements bestNodeFn is allowed to choose the best node for the task
- if bestNode := pfn(task, nodeScores); bestNode != nil {
- return bestNode
- }
- }
- }
- return nil
-}
-
-// NodeOrderFn invoke node order function of the plugins
-func (ssn *Session) NodeOrderFn(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- priorityScore := 0.0
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- pfn, found := ssn.nodeOrderFns[plugin.Name]
- if !found {
- continue
- }
- score, err := pfn(task, node)
- if err != nil {
- return 0, err
- }
- priorityScore += score
- }
- }
- return priorityScore, nil
-}
-
-// BatchNodeOrderFn invoke batch node order function of the plugins
-func (ssn *Session) BatchNodeOrderFn(task *api.TaskInfo, nodes []*api.NodeInfo) (map[string]float64, error) {
- priorityScore := make(map[string]float64, len(nodes))
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- pfn, found := ssn.batchNodeOrderFns[plugin.Name]
- if !found {
- continue
- }
- score, err := pfn(task, nodes)
- if err != nil {
- return nil, err
- }
- for nodeName, score := range score {
- priorityScore[nodeName] += score
- }
- }
- }
- return priorityScore, nil
-}
-
-func isEnabled(enabled *bool) bool {
- return enabled != nil && *enabled
-}
-
-// NodeOrderMapFn invoke node order function of the plugins
-func (ssn *Session) NodeOrderMapFn(task *api.TaskInfo, node *api.NodeInfo) (map[string]float64, float64, error) {
- nodeScoreMap := map[string]float64{}
- var priorityScore float64
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- if pfn, found := ssn.nodeOrderFns[plugin.Name]; found {
- score, err := pfn(task, node)
- if err != nil {
- return nodeScoreMap, priorityScore, err
- }
- priorityScore += score
- }
- if pfn, found := ssn.nodeMapFns[plugin.Name]; found {
- score, err := pfn(task, node)
- if err != nil {
- return nodeScoreMap, priorityScore, err
- }
- nodeScoreMap[plugin.Name] = score
- }
- }
- }
- return nodeScoreMap, priorityScore, nil
-}
-
-// NodeOrderReduceFn invoke node order reduce function of the plugins
-func (ssn *Session) NodeOrderReduceFn(task *api.TaskInfo, pluginNodeScoreMap map[string]k8sframework.NodeScoreList) (map[string]float64, error) {
- nodeScoreMap := map[string]float64{}
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- pfn, found := ssn.nodeReduceFns[plugin.Name]
- if !found {
- continue
- }
- if err := pfn(task, pluginNodeScoreMap[plugin.Name]); err != nil {
- return nodeScoreMap, err
- }
- for _, hp := range pluginNodeScoreMap[plugin.Name] {
- nodeScoreMap[hp.Name] += float64(hp.Score)
- }
- }
- }
- return nodeScoreMap, nil
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "fmt"
-
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-// Operation type
-type Operation int8
-
-const (
- // Evict op
- Evict = iota
- // Pipeline op
- Pipeline
- // Allocate op
- Allocate
-)
-
-type operation struct {
- name Operation
- task *api.TaskInfo
- reason string
-}
-
-// Statement structure
-type Statement struct {
- operations []operation
- ssn *Session
-}
-
-// NewStatement returns new statement object
-func NewStatement(ssn *Session) *Statement {
- return &Statement{
- ssn: ssn,
- }
-}
-
-// Evict the pod
-func (s *Statement) Evict(reclaimee *api.TaskInfo, reason string) error {
- // Update status in session
- if job, found := s.ssn.Jobs[reclaimee.Job]; found {
- if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, api.Releasing, s.ssn.UID, err)
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- reclaimee.Job, s.ssn.UID)
- }
-
- // Update task in node.
- if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
- err := node.UpdateTask(reclaimee)
- if err != nil {
- klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
- reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
- return err
- }
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: reclaimee,
- })
- }
- }
-
- s.operations = append(s.operations, operation{
- name: Evict,
- task: reclaimee,
- reason: reason,
- })
-
- return nil
-}
-
-func (s *Statement) evict(reclaimee *api.TaskInfo, reason string) error {
- if err := s.ssn.cache.Evict(reclaimee, reason); err != nil {
- if e := s.unevict(reclaimee); e != nil {
-			klog.Errorf("Failed to unevict task <%v/%v>: %v.",
- reclaimee.Namespace, reclaimee.Name, e)
- }
- return err
- }
-
- return nil
-}
-
-func (s *Statement) unevict(reclaimee *api.TaskInfo) error {
- // Update status in session
- job, found := s.ssn.Jobs[reclaimee.Job]
- if found {
- if err := job.UpdateTaskStatus(reclaimee, api.Running); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, api.Releasing, s.ssn.UID, err)
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- reclaimee.Job, s.ssn.UID)
- }
-
- // Update task in node.
- if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
- err := node.UpdateTask(reclaimee)
- if err != nil {
- klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
- reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
- return err
- }
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: reclaimee,
- })
- }
- }
-
- return nil
-}
-
-// Pipeline the task for the node
-func (s *Statement) Pipeline(task *api.TaskInfo, hostname string) error {
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pipelined, s.ssn.UID, err)
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- }
-
- task.NodeName = hostname
-
- if node, found := s.ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to pipeline task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, s.ssn.UID, err)
- }
- klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
-		klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, s.ssn.UID)
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- s.operations = append(s.operations, operation{
- name: Pipeline,
- task: task,
- })
-
- return nil
-}
-
-func (s *Statement) pipeline(task *api.TaskInfo) {
-}
-
-func (s *Statement) unpipeline(task *api.TaskInfo) error {
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pipelined, s.ssn.UID, err)
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- }
-
- if node, found := s.ssn.Nodes[task.NodeName]; found {
- if err := node.RemoveTask(task); err != nil {
- klog.Errorf("Failed to pipeline task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, task.NodeName, s.ssn.UID, err)
- }
- klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
-		klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- task.NodeName, s.ssn.UID)
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: task,
- })
- }
- }
- task.NodeName = ""
-
- return nil
-}
-
-// Allocate the task to node
-func (s *Statement) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) error {
- podVolumes, err := s.ssn.cache.GetPodVolumes(task, nodeInfo.Node)
- if err != nil {
- return err
- }
-
- hostname := nodeInfo.Name
- if err := s.ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
- return err
- }
-
- task.Pod.Spec.NodeName = hostname
- task.PodVolumes = podVolumes
-
- // Only update status in session
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Allocated, s.ssn.UID, err)
- return err
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- task.NodeName = hostname
- if node, found := s.ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, s.ssn.UID, err)
- return err
- }
- klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
-		klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, s.ssn.UID)
- return fmt.Errorf("failed to find node %s", hostname)
- }
-
- // Callbacks
- for _, eh := range s.ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- // Update status in session
- klog.V(3).Info("Allocating operations ...")
- s.operations = append(s.operations, operation{
- name: Allocate,
- task: task,
- })
-
- return nil
-}
-
-func (s *Statement) allocate(task *api.TaskInfo) error {
- if err := s.ssn.cache.BindVolumes(task, task.PodVolumes); err != nil {
- return err
- }
-
- if err := s.ssn.cache.Bind(task, task.NodeName); err != nil {
- return err
- }
-
- // Update status in session
- if job, found := s.ssn.Jobs[task.Job]; found {
- if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Binding, s.ssn.UID, err)
- return err
- }
- } else {
-		klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
- return nil
-}
-
-// unallocate the pod for task
-func (s *Statement) unallocate(task *api.TaskInfo) error {
- // Update status in session
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when unallocating.",
- task.Job, s.ssn.UID)
- }
-
- if node, found := s.ssn.Nodes[task.NodeName]; found {
- klog.V(3).Infof("Remove Task <%v> on node <%v>", task.Name, task.NodeName)
- err := node.RemoveTask(task)
- if err != nil {
- klog.Errorf("Failed to remove Task <%v> on node <%v>: %s", task.Name, task.NodeName, err.Error())
- }
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: task,
- })
- }
- }
- task.NodeName = ""
-
- return nil
-}
-
-// Discard operation for evict, pipeline and allocate
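-// Recorded operations are undone in reverse order of registration (LIFO).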
-func (s *Statement) Discard() {
- klog.V(3).Info("Discarding operations ...")
- for i := len(s.operations) - 1; i >= 0; i-- {
- op := s.operations[i]
- op.task.GenerateLastTxContext()
- switch op.name {
- case Evict:
- err := s.unevict(op.task)
- if err != nil {
- klog.Errorf("Failed to unevict task: %s", err.Error())
- }
- case Pipeline:
- err := s.unpipeline(op.task)
- if err != nil {
- klog.Errorf("Failed to unpipeline task: %s", err.Error())
- }
- case Allocate:
- err := s.unallocate(op.task)
- if err != nil {
- klog.Errorf("Failed to unallocate task: %s", err.Error())
- }
- }
- }
-}
-
-// Commit operation for evict, pipeline and allocate
-func (s *Statement) Commit() {
- klog.V(3).Info("Committing operations ...")
- for _, op := range s.operations {
- op.task.ClearLastTxContext()
- switch op.name {
- case Evict:
- err := s.evict(op.task, op.reason)
- if err != nil {
- klog.Errorf("Failed to evict task: %s", err.Error())
- }
- case Pipeline:
- s.pipeline(op.task)
- case Allocate:
- err := s.allocate(op.task)
- if err != nil {
-				klog.Errorf("Failed to allocate task: %s", err.Error())
- }
- }
- }
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package binpack
-
-import (
- "fmt"
- "strings"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin.
- PluginName = "binpack"
-)
-
-const (
- // BinpackWeight is the key for providing Binpack Priority Weight in YAML
- BinpackWeight = "binpack.weight"
- // BinpackCPU is the key for weight of cpu
- BinpackCPU = "binpack.cpu"
- // BinpackMemory is the key for weight of memory
- BinpackMemory = "binpack.memory"
-
- // BinpackResources is the key for additional resource key name
- BinpackResources = "binpack.resources"
- // BinpackResourcesPrefix is the key prefix for additional resource key name
- BinpackResourcesPrefix = BinpackResources + "."
-
- resourceFmt = "%s[%d]"
-)
-
-type priorityWeight struct {
- BinPackingWeight int
- BinPackingCPU int
- BinPackingMemory int
- BinPackingResources map[v1.ResourceName]int
-}
-
-func (w *priorityWeight) String() string {
- length := 3
- if extendLength := len(w.BinPackingResources); extendLength == 0 {
- length++
- } else {
- length += extendLength
- }
- msg := make([]string, 0, length)
- msg = append(msg,
- fmt.Sprintf(resourceFmt, BinpackWeight, w.BinPackingWeight),
- fmt.Sprintf(resourceFmt, BinpackCPU, w.BinPackingCPU),
- fmt.Sprintf(resourceFmt, BinpackMemory, w.BinPackingMemory),
- )
-
- if len(w.BinPackingResources) == 0 {
- msg = append(msg, "no extend resources.")
- } else {
- for name, weight := range w.BinPackingResources {
- msg = append(msg, fmt.Sprintf(resourceFmt, name, weight))
- }
- }
- return strings.Join(msg, ", ")
-}
-
-type binpackPlugin struct {
- // Arguments given for the plugin
- weight priorityWeight
-}
-
-// New function returns a binpackPlugin object
-func New(arguments framework.Arguments) framework.Plugin {
-	weight := calculateWeight(arguments)
- return &binpackPlugin{weight: weight}
-}
-
-func calculateWeight(args framework.Arguments) priorityWeight {
- /*
-	   Users should give priorityWeight in this format (binpack.weight, binpack.cpu, binpack.memory).
-	   The weights of cpu, memory and additional resources can be changed via arguments.
-
- actions: "enqueue, reclaim, allocate, backfill, preempt"
- tiers:
- - plugins:
- - name: binpack
- arguments:
- binpack.weight: 10
- binpack.cpu: 5
- binpack.memory: 1
- binpack.resources: nvidia.com/gpu, example.com/foo
- binpack.resources.nvidia.com/gpu: 2
- binpack.resources.example.com/foo: 3
- */
- // Values are initialized to 1.
- weight := priorityWeight{
- BinPackingWeight: 1,
- BinPackingCPU: 1,
- BinPackingMemory: 1,
- BinPackingResources: make(map[v1.ResourceName]int),
- }
-
- // Checks whether binpack.weight is provided or not, if given, modifies the value in weight struct.
- args.GetInt(&weight.BinPackingWeight, BinpackWeight)
- // Checks whether binpack.cpu is provided or not, if given, modifies the value in weight struct.
- args.GetInt(&weight.BinPackingCPU, BinpackCPU)
- if weight.BinPackingCPU < 0 {
- weight.BinPackingCPU = 1
- }
- // Checks whether binpack.memory is provided or not, if given, modifies the value in weight struct.
- args.GetInt(&weight.BinPackingMemory, BinpackMemory)
- if weight.BinPackingMemory < 0 {
- weight.BinPackingMemory = 1
- }
-
- resourcesStr := args[BinpackResources]
- resources := strings.Split(resourcesStr, ",")
- for _, resource := range resources {
- resource = strings.TrimSpace(resource)
- if resource == "" {
- continue
- }
-
- // binpack.resources.[ResourceName]
- resourceKey := BinpackResourcesPrefix + resource
- resourceWeight := 1
- args.GetInt(&resourceWeight, resourceKey)
- if resourceWeight < 0 {
- resourceWeight = 1
- }
- weight.BinPackingResources[v1.ResourceName(resource)] = resourceWeight
- }
-
- return weight
-}
-
-func (bp *binpackPlugin) Name() string {
- return PluginName
-}
-
-func (bp *binpackPlugin) OnSessionOpen(ssn *framework.Session) {
- klog.V(4).Infof("Enter binpack plugin ...")
- if klog.V(4) {
- defer func() {
- klog.V(4).Infof("Leaving binpack plugin. %s ...", bp.weight.String())
- }()
-
- notFoundResource := []string{}
- for resource := range bp.weight.BinPackingResources {
- found := false
- for _, nodeInfo := range ssn.Nodes {
- if nodeInfo.Allocatable.Get(resource) > 0 {
- found = true
- break
- }
- }
- if !found {
- notFoundResource = append(notFoundResource, string(resource))
- }
- }
- klog.V(4).Infof("resources [%s] record in weight but not found on any node", strings.Join(notFoundResource, ", "))
- }
-
- nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- binPackingScore := BinPackingScore(task, node, bp.weight)
-
- klog.V(4).Infof("Binpack score for Task %s/%s on node %s is: %v", task.Namespace, task.Name, node.Name, binPackingScore)
- return binPackingScore, nil
- }
- if bp.weight.BinPackingWeight != 0 {
- ssn.AddNodeOrderFn(bp.Name(), nodeOrderFn)
- } else {
- klog.Infof("binpack weight is zero, skip node order function")
- }
-}
-
-func (bp *binpackPlugin) OnSessionClose(ssn *framework.Session) {
-}
-
-// BinPackingScore uses the best fit policies during scheduling.
-// Goals:
-// - Schedule Jobs using BestFit Policy using Resource Bin Packing Priority Function
-// - Reduce Fragmentation of scarce resources on the Cluster
-func BinPackingScore(task *api.TaskInfo, node *api.NodeInfo, weight priorityWeight) float64 {
- score := 0.0
- weightSum := 0
- requested := task.Resreq
- allocatable := node.Allocatable
- used := node.Used
-
- for _, resource := range requested.ResourceNames() {
- request := requested.Get(resource)
- if request == 0 {
- continue
- }
- allocate := allocatable.Get(resource)
- nodeUsed := used.Get(resource)
-
- resourceWeight := 0
- found := false
- switch resource {
- case v1.ResourceCPU:
- resourceWeight = weight.BinPackingCPU
- found = true
- case v1.ResourceMemory:
- resourceWeight = weight.BinPackingMemory
- found = true
- default:
- resourceWeight, found = weight.BinPackingResources[resource]
- }
- if !found {
- continue
- }
-
- resourceScore := ResourceBinPackingScore(request, allocate, nodeUsed, resourceWeight)
- klog.V(5).Infof("task %s/%s on node %s resource %s, need %f, used %f, allocatable %f, weight %d, score %f", task.Namespace, task.Name, node.Name, resource, request, nodeUsed, allocate, resourceWeight, resourceScore)
-
- score += resourceScore
- weightSum += resourceWeight
- }
-
-	// map the result from [0, weightSum] to [0, MaxNodeScore * BinPackingWeight]
- if weightSum > 0 {
- score /= float64(weightSum)
- }
- score *= float64(v1alpha1.MaxNodeScore * int64(weight.BinPackingWeight))
-
- return score
-}
-
-// ResourceBinPackingScore calculates the binpack score for a resource with the provided info
-func ResourceBinPackingScore(requested, capacity, used float64, weight int) float64 {
- if capacity == 0 || weight == 0 {
- return 0
- }
-
- usedFinally := requested + used
- if usedFinally > capacity {
- return 0
- }
-
- score := usedFinally * float64(weight) / capacity
- return score
-}
-
-
-
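ResourceBinPackingScore above scores a resource as (request + used) * weight / capacity, so a node that ends up fuller scores higher, and BinPackingScore then normalizes by the weight sum and scales by MaxNodeScore and the plugin weight. A small self-contained sketch of the per-resource formula with illustrative numbers:

```go
package main

import "fmt"

// resourceBinPackingScore mirrors the formula above: the fuller a node would
// become for a resource, the higher its score, and requests that overflow
// the node's capacity score zero.
func resourceBinPackingScore(requested, capacity, used float64, weight int) float64 {
	if capacity == 0 || weight == 0 {
		return 0
	}
	usedFinally := requested + used
	if usedFinally > capacity {
		return 0
	}
	return usedFinally * float64(weight) / capacity
}

func main() {
	// Illustrative numbers: a task asking for 2 CPUs on an 8-CPU node that
	// already uses 4 CPUs, with binpack.cpu weight 5.
	cpuScore := resourceBinPackingScore(2, 8, 4, 5)
	fmt.Printf("cpu score: %.2f (out of weight 5)\n", cpuScore) // 3.75

	// A nearly full node scores higher, which is the bin-packing intent.
	fullerScore := resourceBinPackingScore(2, 8, 6, 5)
	fmt.Printf("fuller node: %.2f\n", fullerScore) // 5.00
}
```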
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package drf
-
-import (
- "fmt"
- "math"
- "strconv"
- "strings"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/api/helpers"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/plugins/util"
-)
-
-// PluginName indicates name of volcano scheduler plugin.
-const PluginName = "drf"
-
-var shareDelta = 0.000001
-
-// hierarchicalNode represents the node hierarchy
-// and the corresponding weight and drf attribute
-type hierarchicalNode struct {
- parent *hierarchicalNode
- attr *drfAttr
- // If the node is a leaf node,
- // request represents the request of the job.
- request *api.Resource
- weight float64
- saturated bool
- hierarchy string
- children map[string]*hierarchicalNode
-}
-
-func (node *hierarchicalNode) Clone(parent *hierarchicalNode) *hierarchicalNode {
- newNode := &hierarchicalNode{
- parent: parent,
- attr: &drfAttr{
- share: node.attr.share,
- dominantResource: node.attr.dominantResource,
- allocated: node.attr.allocated.Clone(),
- },
- request: node.request.Clone(),
- weight: node.weight,
- saturated: node.saturated,
- hierarchy: node.hierarchy,
- children: nil,
- }
- if node.children != nil {
- newNode.children = map[string]*hierarchicalNode{}
- for _, child := range node.children {
- newNode.children[child.hierarchy] = child.Clone(newNode)
- }
- }
- return newNode
-}
-
-// resourceSaturated returns true if any resource of the job is saturated or the job demands a resource that is already fully allocated
-func resourceSaturated(allocated *api.Resource,
- jobRequest *api.Resource, demandingResources map[v1.ResourceName]bool) bool {
- for _, rn := range allocated.ResourceNames() {
- if allocated.Get(rn) != 0 && jobRequest.Get(rn) != 0 &&
- allocated.Get(rn) >= jobRequest.Get(rn) {
- return true
- }
- if !demandingResources[rn] && jobRequest.Get(rn) != 0 {
- return true
- }
- }
- return false
-}
-
-type drfAttr struct {
- share float64
- dominantResource string
- allocated *api.Resource
-}
-
-func (attr *drfAttr) String() string {
- return fmt.Sprintf("dominant resource <%s>, dominant share %f, allocated %s",
- attr.dominantResource, attr.share, attr.allocated)
-}
-
-type drfPlugin struct {
- totalResource *api.Resource
- totalAllocated *api.Resource
-
- // Key is Job ID
- jobAttrs map[api.JobID]*drfAttr
-
- // map[namespaceName]->attr
- namespaceOpts map[string]*drfAttr
-
- // hierarchical tree root
- hierarchicalRoot *hierarchicalNode
-
- // Arguments given for the plugin
- pluginArguments framework.Arguments
-}
-
-// New returns a drf plugin
-func New(arguments framework.Arguments) framework.Plugin {
- return &drfPlugin{
- totalResource: api.EmptyResource(),
- totalAllocated: api.EmptyResource(),
- jobAttrs: map[api.JobID]*drfAttr{},
- namespaceOpts: map[string]*drfAttr{},
- hierarchicalRoot: &hierarchicalNode{
- attr: &drfAttr{allocated: api.EmptyResource()},
- request: api.EmptyResource(),
- hierarchy: "root",
- weight: 1,
- children: map[string]*hierarchicalNode{},
- },
- pluginArguments: arguments,
- }
-}
-
-func (drf *drfPlugin) Name() string {
- return PluginName
-}
-
-// HierarchyEnabled returns whether hierarchy is enabled
-func (drf *drfPlugin) HierarchyEnabled(ssn *framework.Session) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if plugin.Name != PluginName {
- continue
- }
- return plugin.EnabledHierarchy != nil && *plugin.EnabledHierarchy
- }
- }
- return false
-}
-
-// NamespaceOrderEnabled returns whether NamespaceOrder is enabled for this plugin in this session
-func (drf *drfPlugin) NamespaceOrderEnabled(ssn *framework.Session) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if plugin.Name != PluginName {
- continue
- }
- return plugin.EnabledNamespaceOrder != nil && *plugin.EnabledNamespaceOrder
- }
- }
- return false
-}
-
-func (drf *drfPlugin) compareQueues(root *hierarchicalNode, lqueue *api.QueueInfo, rqueue *api.QueueInfo) float64 {
- lnode := root
- lpaths := strings.Split(lqueue.Hierarchy, "/")
- rnode := root
- rpaths := strings.Split(rqueue.Hierarchy, "/")
- depth := 0
- if len(lpaths) < len(rpaths) {
- depth = len(lpaths)
- } else {
- depth = len(rpaths)
- }
- for i := 0; i < depth; i++ {
-		// Saturated nodes have minimum priority,
-		// so that demanding nodes will be popped first.
- if !lnode.saturated && rnode.saturated {
- return -1
- }
- if lnode.saturated && !rnode.saturated {
- return 1
- }
- if lnode.attr.share/lnode.weight == rnode.attr.share/rnode.weight {
- if i < depth-1 {
- lnode = lnode.children[lpaths[i+1]]
- rnode = rnode.children[rpaths[i+1]]
- }
- } else {
- return lnode.attr.share/lnode.weight - rnode.attr.share/rnode.weight
- }
- }
- return 0
-}
-
-func (drf *drfPlugin) OnSessionOpen(ssn *framework.Session) {
- // Prepare scheduling data for this session.
- drf.totalResource.Add(ssn.TotalResource)
-
- klog.V(4).Infof("Total Allocatable %s", drf.totalResource)
-
- namespaceOrderEnabled := drf.NamespaceOrderEnabled(ssn)
- hierarchyEnabled := drf.HierarchyEnabled(ssn)
-
- for _, job := range ssn.Jobs {
- attr := &drfAttr{
- allocated: api.EmptyResource(),
- }
-
- for status, tasks := range job.TaskStatusIndex {
- if api.AllocatedStatus(status) {
- for _, t := range tasks {
- attr.allocated.Add(t.Resreq)
- }
- }
- }
-
- // Calculate the init share of Job
- drf.updateJobShare(job.Namespace, job.Name, attr)
-
- drf.jobAttrs[job.UID] = attr
-
- if namespaceOrderEnabled {
- nsOpts, found := drf.namespaceOpts[job.Namespace]
- if !found {
- nsOpts = &drfAttr{
- allocated: api.EmptyResource(),
- }
- drf.namespaceOpts[job.Namespace] = nsOpts
- }
-			// all tasks in a job should have the same namespace as the job
- nsOpts.allocated.Add(attr.allocated)
- drf.updateNamespaceShare(job.Namespace, nsOpts)
- }
- if hierarchyEnabled {
- queue := ssn.Queues[job.Queue]
- drf.totalAllocated.Add(attr.allocated)
- drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
- }
- }
-
- preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
- var victims []*api.TaskInfo
-
- addVictim := func(candidate *api.TaskInfo) {
- victims = append(victims, candidate)
- }
-
- if namespaceOrderEnabled {
-			// apply the namespace share policy to preemptees first
-
- lWeight := ssn.NamespaceInfo[api.NamespaceName(preemptor.Namespace)].GetWeight()
- lNsAtt := drf.namespaceOpts[preemptor.Namespace]
- lNsAlloc := lNsAtt.allocated.Clone().Add(preemptor.Resreq)
- _, lNsShare := drf.calculateShare(lNsAlloc, drf.totalResource)
- lNsShareWeighted := lNsShare / float64(lWeight)
-
- namespaceAllocation := map[string]*api.Resource{}
-
-			// undecidedPreemptees holds the preemptees this policy could not judge
-			// as preemptable or not; they are left to the next policy
- undecidedPreemptees := []*api.TaskInfo{}
-
- for _, preemptee := range preemptees {
- if preemptor.Namespace == preemptee.Namespace {
- // policy is disabled when they are in the same namespace
- undecidedPreemptees = append(undecidedPreemptees, preemptee)
- continue
- }
-
- // compute the preemptee namespace weighted share after preemption
- nsAllocation, found := namespaceAllocation[preemptee.Namespace]
- if !found {
- rNsAtt := drf.namespaceOpts[preemptee.Namespace]
- nsAllocation = rNsAtt.allocated.Clone()
- namespaceAllocation[preemptee.Namespace] = nsAllocation
- }
- rWeight := ssn.NamespaceInfo[api.NamespaceName(preemptee.Namespace)].GetWeight()
- rNsAlloc := nsAllocation.Sub(preemptee.Resreq)
- _, rNsShare := drf.calculateShare(rNsAlloc, drf.totalResource)
- rNsShareWeighted := rNsShare / float64(rWeight)
-
- // to avoid ping pong actions, the preemptee namespace should
- // have the higher weighted share after preemption.
- if lNsShareWeighted < rNsShareWeighted {
- addVictim(preemptee)
- continue
- }
- if lNsShareWeighted-rNsShareWeighted > shareDelta {
- continue
- }
-
- // equal namespace order leads to judgement of jobOrder
- undecidedPreemptees = append(undecidedPreemptees, preemptee)
- }
-
- preemptees = undecidedPreemptees
- }
-
- latt := drf.jobAttrs[preemptor.Job]
- lalloc := latt.allocated.Clone().Add(preemptor.Resreq)
- _, ls := drf.calculateShare(lalloc, drf.totalResource)
-
- allocations := map[api.JobID]*api.Resource{}
-
- for _, preemptee := range preemptees {
- if _, found := allocations[preemptee.Job]; !found {
- ratt := drf.jobAttrs[preemptee.Job]
- allocations[preemptee.Job] = ratt.allocated.Clone()
- }
- ralloc := allocations[preemptee.Job].Sub(preemptee.Resreq)
- _, rs := drf.calculateShare(ralloc, drf.totalResource)
-
- if ls < rs || math.Abs(ls-rs) <= shareDelta {
- addVictim(preemptee)
- }
- }
-
- klog.V(4).Infof("Victims from DRF plugins are %+v", victims)
-
- return victims, util.Permit
- }
-
- ssn.AddPreemptableFn(drf.Name(), preemptableFn)
-
- if hierarchyEnabled {
- queueOrderFn := func(l interface{}, r interface{}) int {
- lv := l.(*api.QueueInfo)
- rv := r.(*api.QueueInfo)
- ret := drf.compareQueues(drf.hierarchicalRoot, lv, rv)
- if ret < 0 {
- return -1
- }
- if ret > 0 {
- return 1
- }
- return 0
- }
- ssn.AddQueueOrderFn(drf.Name(), queueOrderFn)
-
- reclaimFn := func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) {
- var victims []*api.TaskInfo
- // clone hdrf tree
- totalAllocated := drf.totalAllocated.Clone()
- root := drf.hierarchicalRoot.Clone(nil)
-
- // update reclaimer hdrf
- ljob := ssn.Jobs[reclaimer.Job]
- lqueue := ssn.Queues[ljob.Queue]
- ljob = ljob.Clone()
- attr := drf.jobAttrs[ljob.UID]
- lattr := &drfAttr{
- allocated: attr.allocated.Clone(),
- }
- lattr.allocated.Add(reclaimer.Resreq)
- totalAllocated.Add(reclaimer.Resreq)
- drf.updateShare(lattr)
- drf.UpdateHierarchicalShare(root, totalAllocated, ljob, lattr, lqueue.Hierarchy, lqueue.Weights)
-
- for _, preemptee := range reclaimees {
- rjob := ssn.Jobs[preemptee.Job]
- rqueue := ssn.Queues[rjob.Queue]
-
- // update hdrf of reclaimee job
- totalAllocated.Sub(preemptee.Resreq)
- rjob = rjob.Clone()
- attr := drf.jobAttrs[rjob.UID]
- rattr := &drfAttr{
- allocated: attr.allocated.Clone(),
- }
- rattr.allocated.Sub(preemptee.Resreq)
- drf.updateShare(rattr)
- drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
-
- // compare hdrf of queues
- ret := drf.compareQueues(root, lqueue, rqueue)
-
- // resume hdrf of reclaimee job
- totalAllocated.Add(preemptee.Resreq)
- rattr.allocated.Add(preemptee.Resreq)
- drf.updateShare(rattr)
- drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
-
- if ret < 0 {
- victims = append(victims, preemptee)
- }
-
- if ret > shareDelta {
- continue
- }
- }
-
- klog.V(4).Infof("Victims from HDRF plugins are %+v", victims)
-
- return victims, util.Permit
- }
- ssn.AddReclaimableFn(drf.Name(), reclaimFn)
- }
-
- jobOrderFn := func(l interface{}, r interface{}) int {
- lv := l.(*api.JobInfo)
- rv := r.(*api.JobInfo)
-
- klog.V(4).Infof("DRF JobOrderFn: <%v/%v> share state: %v, <%v/%v> share state: %v",
- lv.Namespace, lv.Name, drf.jobAttrs[lv.UID].share, rv.Namespace, rv.Name, drf.jobAttrs[rv.UID].share)
-
- if drf.jobAttrs[lv.UID].share == drf.jobAttrs[rv.UID].share {
- return 0
- }
-
- if drf.jobAttrs[lv.UID].share < drf.jobAttrs[rv.UID].share {
- return -1
- }
-
- return 1
- }
-
- ssn.AddJobOrderFn(drf.Name(), jobOrderFn)
-
- namespaceOrderFn := func(l interface{}, r interface{}) int {
- lv := l.(api.NamespaceName)
- rv := r.(api.NamespaceName)
-
- lOpt := drf.namespaceOpts[string(lv)]
- rOpt := drf.namespaceOpts[string(rv)]
-
- lWeight := ssn.NamespaceInfo[lv].GetWeight()
- rWeight := ssn.NamespaceInfo[rv].GetWeight()
-
- klog.V(4).Infof("DRF NamespaceOrderFn: <%v> share state: %f, weight %v, <%v> share state: %f, weight %v",
- lv, lOpt.share, lWeight, rv, rOpt.share, rWeight)
-
- lWeightedShare := lOpt.share / float64(lWeight)
- rWeightedShare := rOpt.share / float64(rWeight)
-
- metrics.UpdateNamespaceWeight(string(lv), lWeight)
- metrics.UpdateNamespaceWeight(string(rv), rWeight)
- metrics.UpdateNamespaceWeightedShare(string(lv), lWeightedShare)
- metrics.UpdateNamespaceWeightedShare(string(rv), rWeightedShare)
-
- if lWeightedShare == rWeightedShare {
- return 0
- }
-
- if lWeightedShare < rWeightedShare {
- return -1
- }
-
- return 1
- }
-
- if namespaceOrderEnabled {
- ssn.AddNamespaceOrderFn(drf.Name(), namespaceOrderFn)
- }
-
- // Register event handlers.
- ssn.AddEventHandler(&framework.EventHandler{
- AllocateFunc: func(event *framework.Event) {
- attr := drf.jobAttrs[event.Task.Job]
- attr.allocated.Add(event.Task.Resreq)
-
- job := ssn.Jobs[event.Task.Job]
- drf.updateJobShare(job.Namespace, job.Name, attr)
-
- nsShare := -1.0
- if namespaceOrderEnabled {
- nsOpt := drf.namespaceOpts[event.Task.Namespace]
- nsOpt.allocated.Add(event.Task.Resreq)
-
- drf.updateNamespaceShare(event.Task.Namespace, nsOpt)
- nsShare = nsOpt.share
- }
- if hierarchyEnabled {
- queue := ssn.Queues[job.Queue]
-
- drf.totalAllocated.Add(event.Task.Resreq)
- drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
- }
-
- klog.V(4).Infof("DRF AllocateFunc: task <%v/%v>, resreq <%v>, share <%v>, namespace share <%v>",
- event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
- },
- DeallocateFunc: func(event *framework.Event) {
- attr := drf.jobAttrs[event.Task.Job]
- attr.allocated.Sub(event.Task.Resreq)
-
- job := ssn.Jobs[event.Task.Job]
- drf.updateJobShare(job.Namespace, job.Name, attr)
-
- nsShare := -1.0
- if namespaceOrderEnabled {
- nsOpt := drf.namespaceOpts[event.Task.Namespace]
- nsOpt.allocated.Sub(event.Task.Resreq)
-
- drf.updateNamespaceShare(event.Task.Namespace, nsOpt)
- nsShare = nsOpt.share
- }
-
- if hierarchyEnabled {
- queue := ssn.Queues[job.Queue]
- drf.totalAllocated.Sub(event.Task.Resreq)
- drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
- }
-
- klog.V(4).Infof("DRF EvictFunc: task <%v/%v>, resreq <%v>, share <%v>, namespace share <%v>",
- event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
- },
- })
-}
-
-func (drf *drfPlugin) updateNamespaceShare(namespaceName string, attr *drfAttr) {
- drf.updateShare(attr)
- metrics.UpdateNamespaceShare(namespaceName, attr.share)
-}
-
-// build hierarchy if the node does not exist
-func (drf *drfPlugin) buildHierarchy(root *hierarchicalNode, job *api.JobInfo, attr *drfAttr,
- hierarchy, hierarchicalWeights string) {
- inode := root
- paths := strings.Split(hierarchy, "/")
- weights := strings.Split(hierarchicalWeights, "/")
-
- for i := 1; i < len(paths); i++ {
- if child, ok := inode.children[paths[i]]; ok {
- inode = child
- } else {
- fweight, _ := strconv.ParseFloat(weights[i], 64)
- if fweight < 1 {
- fweight = 1
- }
- child = &hierarchicalNode{
- weight: fweight,
- hierarchy: paths[i],
- request: api.EmptyResource(),
- attr: &drfAttr{
- allocated: api.EmptyResource(),
- },
- children: make(map[string]*hierarchicalNode),
- }
- klog.V(4).Infof("Node %s added to %s, weight %f",
- child.hierarchy, inode.hierarchy, fweight)
- inode.children[paths[i]] = child
- child.parent = inode
- inode = child
- }
- }
-
- child := &hierarchicalNode{
- weight: 1,
- attr: attr,
- hierarchy: string(job.UID),
- request: job.TotalRequest.Clone(),
- children: nil,
- }
- inode.children[string(job.UID)] = child
- // update drf attribute bottom up
- klog.V(4).Infof("Job <%s/%s> added to %s, weights %s, attr %v, total request: %s",
- job.Namespace, job.Name, inode.hierarchy, hierarchicalWeights, child.attr, job.TotalRequest)
-}
-
-// updateHierarchicalShare updates the node attribute recursively
-func (drf *drfPlugin) updateHierarchicalShare(node *hierarchicalNode,
- demandingResources map[v1.ResourceName]bool) {
- if node.children == nil {
- node.saturated = resourceSaturated(node.attr.allocated,
- node.request, demandingResources)
- klog.V(4).Infof("Update hierarchical node %s, share %f, dominant %s, resource %v, saturated: %t",
- node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
- } else {
- var mdr float64 = 1
-		// get minimum dominant resource share
- for _, child := range node.children {
- drf.updateHierarchicalShare(child, demandingResources)
- // skip empty child and saturated child
- if child.attr.share != 0 && !child.saturated {
- _, resShare := drf.calculateShare(child.attr.allocated, drf.totalResource)
- if resShare < mdr {
- mdr = resShare
- }
- }
- }
-
- node.attr.allocated = api.EmptyResource()
- saturated := true
- for _, child := range node.children {
- if !child.saturated {
- saturated = false
- }
- // only consider non-empty children
- if child.attr.share != 0 {
- // saturated child is not scaled
- if child.saturated {
- t := child.attr.allocated
- node.attr.allocated.Add(t)
- } else {
- t := child.attr.allocated.Clone().Multi(mdr / child.attr.share)
- node.attr.allocated.Add(t)
- }
- }
- }
- node.attr.dominantResource, node.attr.share = drf.calculateShare(
- node.attr.allocated, drf.totalResource)
- node.saturated = saturated
- klog.V(4).Infof("Update hierarchical node %s, share %f, dominant resource %s, resource %v, saturated: %t",
- node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
- }
-}
-
-func (drf *drfPlugin) UpdateHierarchicalShare(root *hierarchicalNode, totalAllocated *api.Resource, job *api.JobInfo, attr *drfAttr, hierarchy, hierarchicalWeights string) {
- // filter out demanding resources
- demandingResources := map[v1.ResourceName]bool{}
- for _, rn := range drf.totalResource.ResourceNames() {
- if totalAllocated.Get(rn) < drf.totalResource.Get(rn) {
- demandingResources[rn] = true
- }
- }
- drf.buildHierarchy(root, job, attr, hierarchy, hierarchicalWeights)
- drf.updateHierarchicalShare(root, demandingResources)
-}
-
-func (drf *drfPlugin) updateJobShare(jobNs, jobName string, attr *drfAttr) {
- drf.updateShare(attr)
- metrics.UpdateJobShare(jobNs, jobName, attr.share)
-}
-
-func (drf *drfPlugin) updateShare(attr *drfAttr) {
- attr.dominantResource, attr.share = drf.calculateShare(attr.allocated, drf.totalResource)
-}
-
-func (drf *drfPlugin) calculateShare(allocated, totalResource *api.Resource) (string, float64) {
- res := float64(0)
- dominantResource := ""
- for _, rn := range totalResource.ResourceNames() {
- share := helpers.Share(allocated.Get(rn), totalResource.Get(rn))
- if share > res {
- res = share
- dominantResource = string(rn)
- }
- }
-
- return dominantResource, res
-}
-
-func (drf *drfPlugin) OnSessionClose(session *framework.Session) {
- // Clean schedule data.
- drf.totalResource = api.EmptyResource()
- drf.totalAllocated = api.EmptyResource()
- drf.jobAttrs = map[api.JobID]*drfAttr{}
-}
-
-
-
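calculateShare above derives a job's dominant share as the largest allocated/total ratio across resources, and the job order function schedules the job with the smaller dominant share first. A self-contained sketch of that calculation using plain maps in place of api.Resource; the resource totals and job allocations are illustrative:

```go
package main

import "fmt"

// dominantShare mirrors calculateShare: a job's share is the largest ratio of
// its allocation to the cluster total across all resources.
func dominantShare(allocated, total map[string]float64) (string, float64) {
	dominant, share := "", 0.0
	for name, capacity := range total {
		if capacity == 0 {
			continue
		}
		if s := allocated[name] / capacity; s > share {
			dominant, share = name, s
		}
	}
	return dominant, share
}

func main() {
	total := map[string]float64{"cpu": 64000, "memory": 256e9} // millicores, bytes
	jobA := map[string]float64{"cpu": 16000, "memory": 32e9}   // cpu-heavy
	jobB := map[string]float64{"cpu": 4000, "memory": 96e9}    // memory-heavy

	dA, sA := dominantShare(jobA, total)
	dB, sB := dominantShare(jobB, total)
	fmt.Printf("job A: dominant %s, share %.3f\n", dA, sA) // cpu, 0.250
	fmt.Printf("job B: dominant %s, share %.3f\n", dB, sB) // memory, 0.375
	// DRF's job order picks job A first because its dominant share is smaller.
}
```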
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import (
- v1 "k8s.io/api/core/v1"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
- "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// TopologyHint is a struct containing the NUMANodeAffinity for a Container
-type TopologyHint struct {
- NUMANodeAffinity bitmask.BitMask
- // Preferred is set to true when the NUMANodeAffinity encodes a preferred
- // allocation for the Container. It is set to false otherwise.
- Preferred bool
-}
-
-// Policy is an interface for topology manager policy
-type Policy interface {
-	// Predicate returns the best hint.
- Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
-}
-
-// HintProvider is an interface for components that want to collaborate to
-// achieve globally optimal concrete resource alignment with respect to
-// NUMA locality.
-type HintProvider interface {
- // Name returns provider name used for register and logging.
- Name() string
-	// GetTopologyHints returns hints if this hint provider has a preference.
- GetTopologyHints(container *v1.Container, topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string][]TopologyHint
- Allocate(container *v1.Container, bestHit *TopologyHint, topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string]cpuset.CPUSet
-}
-
-// GetPolicy returns the policy that matches the node's topology manager policy config
-func GetPolicy(node *api.NodeInfo, numaNodes []int) Policy {
- switch batch.NumaPolicy(node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy]) {
- case batch.None:
- return NewPolicyNone(numaNodes)
- case batch.BestEffort:
- return NewPolicyBestEffort(numaNodes)
- case batch.Restricted:
- return NewPolicyRestricted(numaNodes)
- case batch.SingleNumaNode:
- return NewPolicySingleNumaNode(numaNodes)
- }
-
- return &policyNone{}
-}
-
-// AccumulateProvidersHints returns the TopologyHint collections gathered from all providers
-func AccumulateProvidersHints(container *v1.Container,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets,
- hintProviders []HintProvider) (providersHints []map[string][]TopologyHint) {
- for _, provider := range hintProviders {
- hints := provider.GetTopologyHints(container, topoInfo, resNumaSets)
- providersHints = append(providersHints, hints)
- }
-
- return providersHints
-}
-
-// Allocate returns the resource assignments collected from all providers
-func Allocate(container *v1.Container, bestHit *TopologyHint,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets, hintProviders []HintProvider) map[string]cpuset.CPUSet {
- allResAlloc := make(map[string]cpuset.CPUSet)
- for _, provider := range hintProviders {
- resAlloc := provider.Allocate(container, bestHit, topoInfo, resNumaSets)
- for resName, assign := range resAlloc {
- allResAlloc[resName] = assign
- }
- }
-
- return allResAlloc
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import (
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
-)
-
-func filterProvidersHints(providersHints []map[string][]TopologyHint) [][]TopologyHint {
- var allProviderHints [][]TopologyHint
- for _, hints := range providersHints {
- // If hints is nil, insert a single, preferred any-numa hint into allProviderHints.
- if len(hints) == 0 {
- klog.Infof("[numatopo] Hint Provider has no preference for NUMA affinity with any resource")
- allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
- continue
- }
-
- // Otherwise, accumulate the hints for each resource type into allProviderHints.
- for resource := range hints {
- if hints[resource] == nil {
- klog.Infof("[numatopo] Hint Provider has no preference for NUMA affinity with resource '%s'", resource)
- allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
- continue
- }
-
- if len(hints[resource]) == 0 {
- klog.Infof("[numatopo] Hint Provider has no possible NUMA affinities for resource '%s'", resource)
- allProviderHints = append(allProviderHints, []TopologyHint{{nil, false}})
- continue
- }
-
- allProviderHints = append(allProviderHints, hints[resource])
- }
- }
- return allProviderHints
-}
-
-func mergeFilteredHints(numaNodes []int, filteredHints [][]TopologyHint) TopologyHint {
- // Set the default affinity as an any-numa affinity containing the list
- // of NUMA Nodes available on this machine.
- defaultAffinity, _ := bitmask.NewBitMask(numaNodes...)
-
- // Set the bestHint to return from this function as {nil false}.
- // This will only be returned if no better hint can be found when
- // merging hints from each hint provider.
- bestHint := TopologyHint{defaultAffinity, false}
- iterateAllProviderTopologyHints(filteredHints, func(permutation []TopologyHint) {
- // Get the NUMANodeAffinity from each hint in the permutation and see if any
- // of them encode unpreferred allocations.
- mergedHint := mergePermutation(numaNodes, permutation)
- // Only consider mergedHints that result in a NUMANodeAffinity > 0 to
- // replace the current bestHint.
- if mergedHint.NUMANodeAffinity.Count() == 0 {
- return
- }
-
- // If the current bestHint is non-preferred and the new mergedHint is
- // preferred, always choose the preferred hint over the non-preferred one.
- if mergedHint.Preferred && !bestHint.Preferred {
- bestHint = mergedHint
- return
- }
-
- // If the current bestHint is preferred and the new mergedHint is
- // non-preferred, never update bestHint, regardless of mergedHint's
-		// narrowness.
- if !mergedHint.Preferred && bestHint.Preferred {
- return
- }
-
-		// If mergedHint and bestHint have the same preference, only consider
- // mergedHints that have a narrower NUMANodeAffinity than the
- // NUMANodeAffinity in the current bestHint.
- if !mergedHint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) {
- return
- }
-
- // In all other cases, update bestHint to the current mergedHint
- bestHint = mergedHint
- })
-
- return bestHint
-}
-
-// Iterate over all permutations of hints in 'allProviderHints [][]TopologyHint'.
-//
-// This procedure is implemented as a recursive function over the set of hints
-// in 'allProviderHints[i]'. It applies the function 'callback' to each
-// permutation as it is found. It is the equivalent of:
-//
-// for i := 0; i < len(providerHints[0]); i++
-// for j := 0; j < len(providerHints[1]); j++
-// for k := 0; k < len(providerHints[2]); k++
-// ...
-// for z := 0; z < len(providerHints[-1]); z++
-// permutation := []TopologyHint{
-// providerHints[0][i],
-// providerHints[1][j],
-// providerHints[2][k],
-// ...
-// providerHints[-1][z]
-// }
-// callback(permutation)
-func iterateAllProviderTopologyHints(allProviderHints [][]TopologyHint, callback func([]TopologyHint)) {
- // Internal helper function to accumulate the permutation before calling the callback.
- var iterate func(i int, accum []TopologyHint)
- iterate = func(i int, accum []TopologyHint) {
- // Base case: we have looped through all providers and have a full permutation.
- if i == len(allProviderHints) {
- callback(accum)
- return
- }
-
-		// Loop through all hints for provider 'i', and recurse to build the
-		// permutation of this hint with all hints from providers 'i+1' onward.
- for j := range allProviderHints[i] {
- iterate(i+1, append(accum, allProviderHints[i][j]))
- }
- }
- iterate(0, []TopologyHint{})
-}
-
-// Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
-// of their affinity masks. The hint shall be preferred if all hints in the permutation
-// are preferred.
-func mergePermutation(numaNodes []int, permutation []TopologyHint) TopologyHint {
- // Get the NUMANodeAffinity from each hint in the permutation and see if any
- // of them encode unpreferred allocations.
- preferred := true
- defaultAffinity, _ := bitmask.NewBitMask(numaNodes...)
- var numaAffinities []bitmask.BitMask
- for _, hint := range permutation {
- // Only consider hints that have an actual NUMANodeAffinity set.
- if hint.NUMANodeAffinity == nil {
- numaAffinities = append(numaAffinities, defaultAffinity)
- } else {
- numaAffinities = append(numaAffinities, hint.NUMANodeAffinity)
- }
-
- if !hint.Preferred {
- preferred = false
- }
- }
-
- // Merge the affinities using a bitwise-and operation.
- mergedAffinity := bitmask.And(defaultAffinity, numaAffinities...)
-	// Build a mergedHint from the merged affinity mask, indicating whether a
-	// preferred allocation was used to generate the affinity mask or not.
- return TopologyHint{mergedAffinity, preferred}
-}
-
-
-
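mergeFilteredHints above walks every permutation that takes one hint per provider, ANDs the affinity masks, and marks the merged hint preferred only if all of its parts are preferred. A self-contained sketch of that recursion and merge, using uint64 masks in place of bitmask.BitMask and made-up provider hints:

```go
package main

import "fmt"

// hint is a simplified TopologyHint: a NUMA affinity as a bitmask over node
// IDs plus a preferred flag.
type hint struct {
	affinity  uint64
	preferred bool
}

// iteratePermutations walks every combination that takes one hint from each
// provider, the same recursion used by iterateAllProviderTopologyHints above.
func iteratePermutations(providers [][]hint, cb func([]hint)) {
	var rec func(i int, acc []hint)
	rec = func(i int, acc []hint) {
		if i == len(providers) {
			cb(acc)
			return
		}
		for _, h := range providers[i] {
			rec(i+1, append(acc, h))
		}
	}
	rec(0, nil)
}

// merge ANDs the affinities and stays preferred only if every hint in the
// permutation is preferred, mirroring mergePermutation.
func merge(perm []hint) hint {
	out := hint{affinity: ^uint64(0), preferred: true}
	for _, h := range perm {
		out.affinity &= h.affinity
		out.preferred = out.preferred && h.preferred
	}
	return out
}

func main() {
	cpuHints := []hint{{0b01, true}, {0b11, false}}    // NUMA node 0, or both nodes
	deviceHints := []hint{{0b10, true}, {0b11, false}} // NUMA node 1, or both nodes

	iteratePermutations([][]hint{cpuHints, deviceHints}, func(perm []hint) {
		m := merge(perm)
		fmt.Printf("merged affinity %02b preferred=%v\n", m.affinity, m.preferred)
	})
	// Only permutations whose merged affinity is non-zero can become the best hint.
}
```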
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import "k8s.io/klog"
-
-type policyBestEffort struct {
- numaNodes []int
-}
-
-// NewPolicyBestEffort return a new policy interface
-func NewPolicyBestEffort(numaNodes []int) Policy {
- return &policyBestEffort{numaNodes: numaNodes}
-}
-
-func (p *policyBestEffort) canAdmitPodResult(hint *TopologyHint) bool {
- return true
-}
-
-func (p *policyBestEffort) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- filteredProvidersHints := filterProvidersHints(providersHints)
- bestHint := mergeFilteredHints(p.numaNodes, filteredProvidersHints)
- admit := p.canAdmitPodResult(&bestHint)
-
- klog.V(4).Infof("bestHint: %v admit %v\n", bestHint, admit)
- return bestHint, admit
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-type policyNone struct {
- numaNodes []int
-}
-
-// NewPolicyNone return a new policy interface
-func NewPolicyNone(numaNodes []int) Policy {
- return &policyNone{numaNodes: numaNodes}
-}
-
-func (policy *policyNone) canAdmitPodResult(hint *TopologyHint) bool {
- return true
-}
-
-func (policy *policyNone) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- return TopologyHint{}, policy.canAdmitPodResult(nil)
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import "k8s.io/klog"
-
-type policyRestricted struct {
- numaNodes []int
-}
-
-// NewPolicyRestricted return a new policy interface
-func NewPolicyRestricted(numaNodes []int) Policy {
- return &policyRestricted{numaNodes: numaNodes}
-}
-
-func (p *policyRestricted) canAdmitPodResult(hint *TopologyHint) bool {
- return hint.Preferred
-}
-
-func (p *policyRestricted) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- filteredHints := filterProvidersHints(providersHints)
- bestHint := mergeFilteredHints(p.numaNodes, filteredHints)
- admit := p.canAdmitPodResult(&bestHint)
-
- klog.V(4).Infof("bestHint: %v admit %v\n", bestHint, admit)
- return bestHint, admit
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import "k8s.io/klog"
-
-type policySingleNumaNode struct {
- numaNodes []int
-}
-
-// NewPolicySingleNumaNode return a new policy interface
-func NewPolicySingleNumaNode(numaNodes []int) Policy {
- return &policySingleNumaNode{numaNodes: numaNodes}
-}
-
-func (policy *policySingleNumaNode) canAdmitPodResult(hint *TopologyHint) bool {
- return hint.Preferred
-}
-
-// Return hints that have valid bitmasks with exactly one bit set.
-func filterSingleNumaHints(allResourcesHints [][]TopologyHint) [][]TopologyHint {
- var filteredResourcesHints [][]TopologyHint
- for _, oneResourceHints := range allResourcesHints {
- var filtered []TopologyHint
- for _, hint := range oneResourceHints {
- if hint.NUMANodeAffinity == nil && hint.Preferred {
- filtered = append(filtered, hint)
- }
- if hint.NUMANodeAffinity != nil && hint.NUMANodeAffinity.Count() == 1 && hint.Preferred {
- filtered = append(filtered, hint)
- }
- }
- filteredResourcesHints = append(filteredResourcesHints, filtered)
- }
- return filteredResourcesHints
-}
-
-func (policy *policySingleNumaNode) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- filteredHints := filterProvidersHints(providersHints)
- singleNumaHints := filterSingleNumaHints(filteredHints)
- bestHint := mergeFilteredHints(policy.numaNodes, singleNumaHints)
- klog.V(4).Infof("bestHint: %v\n", bestHint)
- admit := policy.canAdmitPodResult(&bestHint)
- return bestHint, admit
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cpumanager
-
-import (
- "fmt"
- "sort"
-
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
-)
-
-type cpuAccumulator struct {
- topo *topology.CPUTopology
- details topology.CPUDetails
- numCPUsNeeded int
- result cpuset.CPUSet
-}
-
-func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
- return &cpuAccumulator{
- topo: topo,
- details: topo.CPUDetails.KeepOnly(availableCPUs),
- numCPUsNeeded: numCPUs,
- result: cpuset.NewCPUSet(),
- }
-}
-
-func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
- a.result = a.result.Union(cpus)
- a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
- a.numCPUsNeeded -= cpus.Size()
-}
-
-// isSocketFree Returns true if the supplied socket is fully available in `topoDetails`.
-func (a *cpuAccumulator) isSocketFree(socketID int) bool {
- return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
-}
-
-// isCoreFree Returns true if the supplied core is fully available in `topoDetails`.
-func (a *cpuAccumulator) isCoreFree(coreID int) bool {
- return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
-}
-
-// freeSockets Returns free socket IDs as a slice sorted by:
-// - socket ID, ascending.
-func (a *cpuAccumulator) freeSockets() []int {
- return a.details.Sockets().Filter(a.isSocketFree).ToSlice()
-}
-
-// freeCores Returns core IDs as a slice sorted by:
-// - the number of whole available cores on the socket, ascending
-// - socket ID, ascending
-// - core ID, ascending
-func (a *cpuAccumulator) freeCores() []int {
- socketIDs := a.details.Sockets().ToSliceNoSort()
- sort.Slice(socketIDs,
- func(i, j int) bool {
- iCores := a.details.CoresInSockets(socketIDs[i]).Filter(a.isCoreFree)
- jCores := a.details.CoresInSockets(socketIDs[j]).Filter(a.isCoreFree)
- return iCores.Size() < jCores.Size() || socketIDs[i] < socketIDs[j]
- })
-
- coreIDs := []int{}
- for _, s := range socketIDs {
- coreIDs = append(coreIDs, a.details.CoresInSockets(s).Filter(a.isCoreFree).ToSlice()...)
- }
- return coreIDs
-}
-
-// freeCPUs Returns CPU IDs as a slice sorted by:
-// - socket affinity with result
-// - number of CPUs available on the same socket
-// - number of CPUs available on the same core
-// - socket ID.
-// - core ID.
-func (a *cpuAccumulator) freeCPUs() []int {
- result := []int{}
- cores := a.details.Cores().ToSlice()
-
- sort.Slice(
- cores,
- func(i, j int) bool {
- iCore := cores[i]
- jCore := cores[j]
-
- iCPUs := a.topo.CPUDetails.CPUsInCores(iCore).ToSlice()
- jCPUs := a.topo.CPUDetails.CPUsInCores(jCore).ToSlice()
-
- iSocket := a.topo.CPUDetails[iCPUs[0]].SocketID
- jSocket := a.topo.CPUDetails[jCPUs[0]].SocketID
-
-			// Compute the number of CPUs in the result that reside on the same
-			// socket as each core.
- iSocketColoScore := a.topo.CPUDetails.CPUsInSockets(iSocket).Intersection(a.result).Size()
- jSocketColoScore := a.topo.CPUDetails.CPUsInSockets(jSocket).Intersection(a.result).Size()
-
-			// Compute the number of CPUs available on the same socket
-			// as each core.
- iSocketFreeScore := a.details.CPUsInSockets(iSocket).Size()
- jSocketFreeScore := a.details.CPUsInSockets(jSocket).Size()
-
- // Compute the number of available CPUs on each core.
- iCoreFreeScore := a.details.CPUsInCores(iCore).Size()
- jCoreFreeScore := a.details.CPUsInCores(jCore).Size()
-
- return iSocketColoScore > jSocketColoScore ||
- iSocketFreeScore < jSocketFreeScore ||
- iCoreFreeScore < jCoreFreeScore ||
- iSocket < jSocket ||
- iCore < jCore
- })
-
- // For each core, append sorted CPU IDs to result.
- for _, core := range cores {
- result = append(result, a.details.CPUsInCores(core).ToSlice()...)
- }
- return result
-}
-
-func (a *cpuAccumulator) needs(n int) bool {
- return a.numCPUsNeeded >= n
-}
-
-func (a *cpuAccumulator) isSatisfied() bool {
- return a.numCPUsNeeded < 1
-}
-
-func (a *cpuAccumulator) isFailed() bool {
- return a.numCPUsNeeded > a.details.CPUs().Size()
-}
-
-// takeByTopology returns the assigned cpuset
-func takeByTopology(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
- acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
- if acc.isSatisfied() {
- return acc.result, nil
- }
- if acc.isFailed() {
- return cpuset.NewCPUSet(), fmt.Errorf("not enough cpus available to satisfy request")
- }
-
- // Algorithm: topology-aware best-fit
- // 1. Acquire whole sockets, if available and the container requires at
- // least a socket's-worth of CPUs.
- if acc.needs(acc.topo.CPUsPerSocket()) {
- for _, s := range acc.freeSockets() {
- klog.V(4).Infof("[cpumanager] takeByTopology: claiming socket [%d]", s)
- acc.take(acc.details.CPUsInSockets(s))
- if acc.isSatisfied() {
- return acc.result, nil
- }
- if !acc.needs(acc.topo.CPUsPerSocket()) {
- break
- }
- }
- }
-
- // 2. Acquire whole cores, if available and the container requires at least
- // a core's-worth of CPUs.
- if acc.needs(acc.topo.CPUsPerCore()) {
- for _, c := range acc.freeCores() {
- klog.V(4).Infof("[cpumanager] takeByTopology: claiming core [%d]", c)
- acc.take(acc.details.CPUsInCores(c))
- if acc.isSatisfied() {
- return acc.result, nil
- }
- if !acc.needs(acc.topo.CPUsPerCore()) {
- break
- }
- }
- }
-
- // 3. Acquire single threads, preferring to fill partially-allocated cores
- // on the same sockets as the whole cores we have already taken in this
- // allocation.
- for _, c := range acc.freeCPUs() {
- klog.V(4).Infof("[cpumanager] takeByTopology: claiming CPU [%d]", c)
- if acc.needs(1) {
- acc.take(cpuset.NewCPUSet(c))
- }
- if acc.isSatisfied() {
- return acc.result, nil
- }
- }
-
- return cpuset.NewCPUSet(), fmt.Errorf("failed to allocate cpus")
-}
-
-
-
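takeByTopology above is a greedy, topology-aware best fit: claim whole free sockets while the request still needs a socket's worth of CPUs, then whole free cores, then single CPUs. A toy sketch of that ordering under an assumed 2-socket, 2-cores-per-socket, 2-threads-per-core layout; it is not the kubelet topology/cpuset machinery:

```go
package main

import (
	"fmt"
	"sort"
)

// cpu describes one logical CPU in the toy layout.
type cpu struct{ id, core, socket int }

func take(free []cpu, need int) []int {
	taken := []int{}

	count := func(match func(c cpu) bool) int {
		n := 0
		for _, c := range free {
			if match(c) {
				n++
			}
		}
		return n
	}
	grab := func(match func(c cpu) bool) {
		rest := free[:0]
		for _, c := range free {
			if need > 0 && match(c) {
				taken = append(taken, c.id)
				need--
			} else {
				rest = append(rest, c)
			}
		}
		free = rest
	}

	// 1. Whole free sockets while at least a socket's worth (4 CPUs) is needed.
	for s := 0; s < 2; s++ {
		if need >= 4 && count(func(c cpu) bool { return c.socket == s }) == 4 {
			grab(func(c cpu) bool { return c.socket == s })
		}
	}
	// 2. Whole free cores while at least a core's worth (2 CPUs) is needed.
	for co := 0; co < 4; co++ {
		if need >= 2 && count(func(c cpu) bool { return c.core == co }) == 2 {
			grab(func(c cpu) bool { return c.core == co })
		}
	}
	// 3. Single CPUs for whatever remains.
	grab(func(c cpu) bool { return true })

	sort.Ints(taken)
	return taken
}

func main() {
	// CPU 3 is already allocated elsewhere, so socket 0 is not wholly free.
	free := []cpu{{0, 0, 0}, {1, 0, 0}, {2, 1, 0}, {4, 2, 1}, {5, 2, 1}, {6, 3, 1}, {7, 3, 1}}
	fmt.Println(take(free, 5)) // [0 4 5 6 7]: whole socket 1 first, then one single CPU
}
```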
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cpumanager
-
-import (
- "math"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
- "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/plugins/numaaware/policy"
-)
-
-type cpuMng struct {
-}
-
-// NewProvider return a new provider
-func NewProvider() policy.HintProvider {
- return &cpuMng{}
-}
-
-// Name return the cpu manager name
-func (mng *cpuMng) Name() string {
- return "cpuMng"
-}
-
-// guaranteedCPUs returns the integer number of requested CPUs
-func guaranteedCPUs(container *v1.Container) int {
- cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
- if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
- return 0
- }
-
- return int(cpuQuantity.Value())
-}
-
-// generateCPUTopologyHints returns the NUMA topology hints based on
-// - availableCPUs
-func generateCPUTopologyHints(availableCPUs cpuset.CPUSet, CPUDetails topology.CPUDetails, request int) []policy.TopologyHint {
- minAffinitySize := CPUDetails.NUMANodes().Size()
- hints := []policy.TopologyHint{}
- bitmask.IterateBitMasks(CPUDetails.NUMANodes().ToSlice(), func(mask bitmask.BitMask) {
- // First, update minAffinitySize for the current request size.
- cpusInMask := CPUDetails.CPUsInNUMANodes(mask.GetBits()...).Size()
- if cpusInMask >= request && mask.Count() < minAffinitySize {
- minAffinitySize = mask.Count()
- }
-
-		// Then check whether enough available CPUs remain on the current
-		// NUMA node combination to satisfy the CPU request.
-		numMatching := 0
- for _, c := range availableCPUs.ToSlice() {
- if mask.IsSet(CPUDetails[c].NUMANodeID) {
- numMatching++
- }
- }
-
- // If they don't, then move onto the next combination.
- if numMatching < request {
- return
- }
-
- // Otherwise, create a new hint from the numa node bitmask and add it to the
- // list of hints. We set all hint preferences to 'false' on the first
- // pass through.
- hints = append(hints, policy.TopologyHint{
- NUMANodeAffinity: mask,
- Preferred: false,
- })
- })
-
- // Loop back through all hints and update the 'Preferred' field based on
- // counting the number of bits sets in the affinity mask and comparing it
- // to the minAffinitySize. Only those with an equal number of bits set (and
- // with a minimal set of numa nodes) will be considered preferred.
- for i := range hints {
- if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
- hints[i].Preferred = true
- }
- }
-
- return hints
-}
-
-func (mng *cpuMng) GetTopologyHints(container *v1.Container,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string][]policy.TopologyHint {
- if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
- klog.Warningf("container %s has no cpu request", container.Name)
- return nil
- }
-
- requestNum := guaranteedCPUs(container)
- if requestNum == 0 {
-		klog.Warningf("the cpu request isn't an integer in container %s", container.Name)
- return nil
- }
-
- cputopo := &topology.CPUTopology{
- NumCPUs: topoInfo.CPUDetail.CPUs().Size(),
- NumCores: topoInfo.CPUDetail.Cores().Size() * topoInfo.CPUDetail.Sockets().Size(),
- NumSockets: topoInfo.CPUDetail.Sockets().Size(),
- CPUDetails: topoInfo.CPUDetail,
- }
-
- reserved := cpuset.NewCPUSet()
- reservedCPUs, ok := topoInfo.ResReserved[v1.ResourceCPU]
- if ok {
- // Take the ceiling of the reservation, since fractional CPUs cannot be
- // exclusively allocated.
- reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
- numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- reserved, _ = takeByTopology(cputopo, cputopo.CPUDetails.CPUs(), numReservedCPUs)
- klog.V(4).Infof("[cpumanager] reserve cpuset :%v", reserved)
- }
-
- availableCPUSet, ok := resNumaSets[string(v1.ResourceCPU)]
- if !ok {
- klog.Warningf("no cpu resource")
- return nil
- }
-
- availableCPUSet = availableCPUSet.Difference(reserved)
- klog.V(4).Infof("requested: %d, availableCPUSet: %v", requestNum, availableCPUSet)
- return map[string][]policy.TopologyHint{
- string(v1.ResourceCPU): generateCPUTopologyHints(availableCPUSet, topoInfo.CPUDetail, requestNum),
- }
-}
-
-func (mng *cpuMng) Allocate(container *v1.Container, bestHit *policy.TopologyHint,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string]cpuset.CPUSet {
- cputopo := &topology.CPUTopology{
- NumCPUs: topoInfo.CPUDetail.CPUs().Size(),
- NumCores: topoInfo.CPUDetail.Cores().Size() * topoInfo.CPUDetail.Sockets().Size(),
- NumSockets: topoInfo.CPUDetail.Sockets().Size(),
- CPUDetails: topoInfo.CPUDetail,
- }
-
- reserved := cpuset.NewCPUSet()
- reservedCPUs, ok := topoInfo.ResReserved[v1.ResourceCPU]
- if ok {
- // Take the ceiling of the reservation, since fractional CPUs cannot be
- // exclusively allocated.
- reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
- numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- reserved, _ = takeByTopology(cputopo, cputopo.CPUDetails.CPUs(), numReservedCPUs)
- klog.V(3).Infof("[cpumanager] reserve cpuset :%v", reserved)
- }
-
- requestNum := guaranteedCPUs(container)
- availableCPUSet := resNumaSets[string(v1.ResourceCPU)]
- availableCPUSet = availableCPUSet.Difference(reserved)
-
- klog.V(4).Infof("alignedCPUs: %v requestNum: %v bestHit %v", availableCPUSet, requestNum, bestHit)
-
- result := cpuset.NewCPUSet()
- if bestHit.NUMANodeAffinity != nil {
- alignedCPUs := cpuset.NewCPUSet()
- for _, numaNodeID := range bestHit.NUMANodeAffinity.GetBits() {
- alignedCPUs = alignedCPUs.Union(availableCPUSet.Intersection(cputopo.CPUDetails.CPUsInNUMANodes(numaNodeID)))
- }
-
- numAlignedToAlloc := alignedCPUs.Size()
- if requestNum < numAlignedToAlloc {
- numAlignedToAlloc = requestNum
- }
-
- alignedCPUs, err := takeByTopology(cputopo, alignedCPUs, numAlignedToAlloc)
- if err != nil {
- return map[string]cpuset.CPUSet{
- string(v1.ResourceCPU): cpuset.NewCPUSet(),
- }
- }
-
- result = result.Union(alignedCPUs)
- }
-
- // Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
- remainingCPUs, err := takeByTopology(cputopo, availableCPUSet.Difference(result), requestNum-result.Size())
- if err != nil {
- return map[string]cpuset.CPUSet{
- string(v1.ResourceCPU): cpuset.NewCPUSet(),
- }
- }
-
- result = result.Union(remainingCPUs)
-
- return map[string]cpuset.CPUSet{
- string(v1.ResourceCPU): result,
- }
-}
-
-
-
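generateCPUTopologyHints above enumerates NUMA-node bitmasks, keeps the combinations with enough free CPUs for the request, and prefers the ones that span the fewest nodes. A self-contained sketch of that selection using plain integer masks; the per-node free-CPU counts are illustrative stand-ins for what topoInfo.CPUDetail and resNumaSets provide:

```go
package main

import (
	"fmt"
	"math/bits"
)

// numaHint is a simplified TopologyHint: a bitmask over NUMA node IDs plus a
// preferred flag.
type numaHint struct {
	mask      uint // bit i set => NUMA node i is in the affinity
	preferred bool
}

func generateHints(freeCPUs map[int]int, request int) []numaHint {
	numNodes := len(freeCPUs)
	minAffinitySize := numNodes
	var out []numaHint

	// Enumerate every non-empty subset of NUMA nodes.
	for mask := uint(1); mask < uint(1)<<numNodes; mask++ {
		cpus := 0
		for node := 0; node < numNodes; node++ {
			if mask&(uint(1)<<node) != 0 {
				cpus += freeCPUs[node]
			}
		}
		if cpus < request {
			continue // this combination cannot satisfy the request
		}
		if bits.OnesCount(mask) < minAffinitySize {
			minAffinitySize = bits.OnesCount(mask)
		}
		out = append(out, numaHint{mask: mask})
	}
	// Hints that use the minimal number of NUMA nodes are preferred.
	for i := range out {
		out[i].preferred = bits.OnesCount(out[i].mask) == minAffinitySize
	}
	return out
}

func main() {
	free := map[int]int{0: 2, 1: 6} // free CPUs per NUMA node
	for _, h := range generateHints(free, 4) {
		fmt.Printf("affinity %02b preferred=%v\n", h.mask, h.preferred)
	}
	// A 4-CPU request fits on node 1 alone, so {node 1} is preferred over {0, 1}.
}
```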
/*
-Copyright 2020 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "fmt"
- "sync"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
-)
-
-type predicateCache struct {
- sync.RWMutex
- cache map[string]map[string]bool //key_1: nodename key_2:pod uid
-}
-
-// predicateCacheNew returns a new predicate cache
-func predicateCacheNew() *predicateCache {
- return &predicateCache{
- cache: make(map[string]map[string]bool),
- }
-}
-
-// getPodTemplateUID returns the pod template UID
-func getPodTemplateUID(pod *v1.Pod) string {
- uid, found := pod.Annotations[batch.PodTemplateKey]
- if !found {
- return ""
- }
-
- return uid
-}
-
-// PredicateWithCache checks whether the predicate result exists in the cache
-func (pc *predicateCache) PredicateWithCache(nodeName string, pod *v1.Pod) (bool, error) {
- podTemplateUID := getPodTemplateUID(pod)
- if podTemplateUID == "" {
-		return false, fmt.Errorf("no annotation of volcano.sh/template-uid in pod %s", pod.Name)
- }
-
- pc.RLock()
- defer pc.RUnlock()
- if nodeCache, exist := pc.cache[nodeName]; exist {
- if result, exist := nodeCache[podTemplateUID]; exist {
- klog.V(4).Infof("Predicate node %s and pod %s result %v", nodeName, pod.Name, result)
- return result, nil
- }
- }
-
- return false, fmt.Errorf("no information of node %s and pod %s in predicate cache", nodeName, pod.Name)
-}
-
-// UpdateCache updates the cache data
-func (pc *predicateCache) UpdateCache(nodeName string, pod *v1.Pod, fit bool) {
- podTemplateUID := getPodTemplateUID(pod)
- if podTemplateUID == "" {
-		klog.V(3).Infof("Cannot find template uid for pod %s", pod.Name)
- return
- }
-
- pc.Lock()
- defer pc.Unlock()
-
- if _, exist := pc.cache[nodeName]; !exist {
- podCache := make(map[string]bool)
- podCache[podTemplateUID] = fit
- pc.cache[nodeName] = podCache
- } else {
- pc.cache[nodeName][podTemplateUID] = fit
- }
-}
-
-
-
-/*
-Copyright 2020 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// checkNodeGPUSharingPredicate checks if a gpu sharing pod can be scheduled on a node.
-func checkNodeGPUSharingPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
- // no gpu sharing request
- if api.GetGPUResourceOfPod(pod) <= 0 {
- return true, nil
- }
-
- id := predicateGPU(pod, nodeInfo)
- if id < 0 {
-		return false, fmt.Errorf("not enough gpu memory on any single device of node %s", nodeInfo.Name)
- }
- return true, nil
-}
-
-// predicateGPU returns the available GPU ID
-func predicateGPU(pod *v1.Pod, node *api.NodeInfo) int {
- gpuRequest := api.GetGPUResourceOfPod(pod)
- allocatableGPUs := node.GetDevicesIdleGPUMemory()
-
- for devID := 0; devID < len(allocatableGPUs); devID++ {
- availableGPU, ok := allocatableGPUs[devID]
- if ok {
- if availableGPU >= gpuRequest {
- return devID
- }
- }
- }
-
- return -1
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "context"
- "fmt"
- "strings"
-
- v1 "k8s.io/api/core/v1"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/apis/config"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/plugins/util"
- "volcano.sh/volcano/pkg/scheduler/plugins/util/k8s"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin.
- PluginName = "predicates"
-
- // GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML
- GPUSharingPredicate = "predicate.GPUSharingEnable"
-
- // CachePredicate control cache predicate feature
- CachePredicate = "predicate.CacheEnable"
-
- // ProportionalPredicate is the key for enabling Proportional Predicate in YAML
- ProportionalPredicate = "predicate.ProportionalEnable"
- // ProportionalResource is the key for additional resource key name
- ProportionalResource = "predicate.resources"
- // ProportionalResourcesPrefix is the key prefix for additional resource key name
- ProportionalResourcesPrefix = ProportionalResource + "."
-)
-
-type predicatesPlugin struct {
- // Arguments given for the plugin
- pluginArguments framework.Arguments
-}
-
-// New returns a predicates plugin
-func New(arguments framework.Arguments) framework.Plugin {
- return &predicatesPlugin{pluginArguments: arguments}
-}
-
-func (pp *predicatesPlugin) Name() string {
- return PluginName
-}
-
-type baseResource struct {
- CPU float64
- Memory float64
-}
-
-type predicateEnable struct {
- gpuSharingEnable bool
- cacheEnable bool
- proportionalEnable bool
- proportional map[v1.ResourceName]baseResource
-}
-
-func enablePredicate(args framework.Arguments) predicateEnable {
- /*
-	   Users should give the predicate enable switches in this format (e.g. predicate.GPUSharingEnable).
-	   GPU sharing, predicate caching and proportional predicate checks are currently supported.
-
- actions: "reclaim, allocate, backfill, preempt"
- tiers:
- - plugins:
- - name: priority
- - name: gang
- - name: conformance
- - plugins:
- - name: drf
- - name: predicates
- arguments:
- predicate.GPUSharingEnable: true
- predicate.CacheEnable: true
- predicate.ProportionalEnable: true
- predicate.resources: nvidia.com/gpu
- predicate.resources.nvidia.com/gpu.cpu: 4
- predicate.resources.nvidia.com/gpu.memory: 8
- - name: proportion
- - name: nodeorder
- */
-
- predicate := predicateEnable{
- gpuSharingEnable: false,
- cacheEnable: false,
- proportionalEnable: false,
- }
-
-	// Check whether predicate.GPUSharingEnable is provided; if so, override the default value in the predicateEnable struct.
- args.GetBool(&predicate.gpuSharingEnable, GPUSharingPredicate)
- args.GetBool(&predicate.cacheEnable, CachePredicate)
-	// Check whether predicate.ProportionalEnable is provided; if so, override the default value in the predicateEnable struct.
- args.GetBool(&predicate.proportionalEnable, ProportionalPredicate)
- resourcesProportional := make(map[v1.ResourceName]baseResource)
- resourcesStr := args[ProportionalResource]
- resources := strings.Split(resourcesStr, ",")
- for _, resource := range resources {
- resource = strings.TrimSpace(resource)
- if resource == "" {
- continue
- }
- // proportional.resources.[ResourceName]
- cpuResourceKey := ProportionalResourcesPrefix + resource + ".cpu"
- cpuResourceRate := 1.0
- args.GetFloat64(&cpuResourceRate, cpuResourceKey)
- if cpuResourceRate < 0 {
- cpuResourceRate = 1.0
- }
- memoryResourceKey := ProportionalResourcesPrefix + resource + ".memory"
- memoryResourceRate := 1.0
- args.GetFloat64(&memoryResourceRate, memoryResourceKey)
- if memoryResourceRate < 0 {
- memoryResourceRate = 1.0
- }
- r := baseResource{
- CPU: cpuResourceRate,
- Memory: memoryResourceRate,
- }
- resourcesProportional[v1.ResourceName(resource)] = r
- }
- predicate.proportional = resourcesProportional
-
- return predicate
-}
-
-func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
- pl := util.NewPodListerFromNode(ssn)
- nodeMap := util.GenerateNodeMapAndSlice(ssn.Nodes)
-
- pCache := predicateCacheNew()
- predicate := enablePredicate(pp.pluginArguments)
-
- kubeClient := ssn.KubeClient()
- // Register event handlers to update task info in PodLister & nodeMap
- ssn.AddEventHandler(&framework.EventHandler{
- AllocateFunc: func(event *framework.Event) {
- pod := pl.UpdateTask(event.Task, event.Task.NodeName)
-
- nodeName := event.Task.NodeName
- node, found := nodeMap[nodeName]
- if !found {
-				klog.Errorf("predicates, update pod %s/%s allocate to non-existent node [%s]", pod.Namespace, pod.Name, nodeName)
- return
- }
-
- if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
- nodeInfo, ok := ssn.Nodes[nodeName]
- if !ok {
- klog.Errorf("Failed to get node %s info from cache", nodeName)
- return
- }
-
- id := predicateGPU(pod, nodeInfo)
- if id < 0 {
- klog.Errorf("The node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace)
- return
- }
- patch := api.AddGPUIndexPatch(id)
- pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
- if err != nil {
- klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
- return
- }
- dev, ok := nodeInfo.GPUDevices[id]
- if !ok {
- klog.Errorf("Failed to get GPU %d from node %s", id, nodeName)
- return
- }
- dev.PodMap[string(pod.UID)] = pod
- klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
- }
-
- node.AddPod(pod)
- klog.V(4).Infof("predicates, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
- },
- DeallocateFunc: func(event *framework.Event) {
- pod := pl.UpdateTask(event.Task, "")
- nodeName := event.Task.NodeName
- node, found := nodeMap[nodeName]
- if !found {
-				klog.Errorf("predicates, update pod %s/%s deallocate from non-existent node [%s]", pod.Namespace, pod.Name, nodeName)
- return
- }
-
- if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
- // deallocate pod gpu id
- id := api.GetGPUIndex(pod)
- patch := api.RemoveGPUIndexPatch()
- _, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
- if err != nil {
- klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
- return
- }
-
- nodeInfo, ok := ssn.Nodes[nodeName]
- if !ok {
- klog.Errorf("Failed to get node %s info from cache", nodeName)
- return
- }
- if dev, ok := nodeInfo.GPUDevices[id]; ok {
- delete(dev.PodMap, string(pod.UID))
- }
-
- klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, nodeName)
- }
-
- err := node.RemovePod(pod)
- if err != nil {
- klog.Errorf("predicates, remove pod %s/%s from node [%s] error: %v", pod.Namespace, pod.Name, nodeName, err)
- return
- }
- klog.V(4).Infof("predicates, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, nodeName)
- },
- })
-
- // Initialize k8s plugins
- // TODO: Add more predicates, k8s.io/kubernetes/pkg/scheduler/framework/plugins/legacy_registry.go
- handle := k8s.NewFrameworkHandle(nodeMap, ssn.KubeClient(), ssn.InformerFactory())
- // 1. NodeUnschedulable
- plugin, _ := nodeunschedulable.New(nil, handle)
- nodeUnscheduleFilter := plugin.(*nodeunschedulable.NodeUnschedulable)
- // 2. NodeAffinity
- plugin, _ = nodeaffinity.New(nil, handle)
- nodeAffinityFilter := plugin.(*nodeaffinity.NodeAffinity)
- // 3. NodePorts
- plugin, _ = nodeports.New(nil, handle)
- nodePortFilter := plugin.(*nodeports.NodePorts)
- // 4. TaintToleration
- plugin, _ = tainttoleration.New(nil, handle)
- tolerationFilter := plugin.(*tainttoleration.TaintToleration)
- // 5. InterPodAffinity
- plArgs := &config.InterPodAffinityArgs{}
- plugin, _ = interpodaffinity.New(plArgs, handle)
- podAffinityFilter := plugin.(*interpodaffinity.InterPodAffinity)
-
- ssn.AddPredicateFn(pp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) error {
- nodeInfo, found := nodeMap[node.Name]
- if !found {
-			return fmt.Errorf("predicates failed: node info for %s not found", node.Name)
- }
-
- if node.Allocatable.MaxTaskNum <= len(nodeInfo.Pods) {
- klog.V(4).Infof("NodePodNumber predicates Task <%s/%s> on Node <%s> failed",
- task.Namespace, task.Name, node.Name)
- return api.NewFitError(task, node, api.NodePodNumberExceeded)
- }
-
- state := k8sframework.NewCycleState()
- predicateByStablefilter := func(pod *v1.Pod, nodeInfo *k8sframework.NodeInfo) (bool, error) {
- // CheckNodeUnschedulable
- status := nodeUnscheduleFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return false, fmt.Errorf("plugin %s predicates failed %s", nodeunschedulable.Name, status.Message())
- }
-
- // Check NodeAffinity
- status = nodeAffinityFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return false, fmt.Errorf("plugin %s predicates failed %s", nodeaffinity.Name, status.Message())
- }
-
- // PodToleratesNodeTaints: TaintToleration
- status = tolerationFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return false, fmt.Errorf("plugin %s predicates failed %s", tainttoleration.Name, status.Message())
- }
-
- return true, nil
- }
-
- // Check PredicateWithCache
- {
- var err error
- var fit bool
- if predicate.cacheEnable {
- fit, err = pCache.PredicateWithCache(node.Name, task.Pod)
- if err != nil {
- fit, err = predicateByStablefilter(task.Pod, nodeInfo)
- pCache.UpdateCache(node.Name, task.Pod, fit)
- } else {
- if !fit {
- err = fmt.Errorf("plugin equivalence cache predicates failed")
- }
- }
- } else {
- fit, err = predicateByStablefilter(task.Pod, nodeInfo)
- }
-
- if !fit {
- return err
- }
- }
-
- // Check NodePorts
- nodePortFilter.PreFilter(context.TODO(), state, task.Pod)
- status := nodePortFilter.Filter(context.TODO(), state, nil, nodeInfo)
- if !status.IsSuccess() {
-			return fmt.Errorf("plugin %s predicates failed %s", nodeports.Name, status.Message())
- }
-
- // InterPodAffinity Predicate
- status = podAffinityFilter.PreFilter(context.TODO(), state, task.Pod)
- if !status.IsSuccess() {
- return fmt.Errorf("plugin %s pre-predicates failed %s", interpodaffinity.Name, status.Message())
- }
-
- status = podAffinityFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return fmt.Errorf("plugin %s predicates failed %s", interpodaffinity.Name, status.Message())
- }
-
- if predicate.gpuSharingEnable {
- // CheckGPUSharingPredicate
- fit, err := checkNodeGPUSharingPredicate(task.Pod, node)
- if err != nil {
- return err
- }
-
- klog.V(4).Infof("checkNodeGPUSharingPredicate predicates Task <%s/%s> on Node <%s>: fit %v",
- task.Namespace, task.Name, node.Name, fit)
- }
- if predicate.proportionalEnable {
- // Check ProportionalPredicate
- fit, err := checkNodeResourceIsProportional(task, node, predicate.proportional)
- if err != nil {
- return err
- }
- klog.V(4).Infof("checkNodeResourceIsProportional predicates Task <%s/%s> on Node <%s>: fit %v",
- task.Namespace, task.Name, node.Name, fit)
- }
- return nil
- })
-}
-
-func (pp *predicatesPlugin) OnSessionClose(ssn *framework.Session) {}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// checkNodeResourceIsProportional checks whether enough CPU and memory would remain on the node, in proportion to its idle scalar resources (e.g. GPU), after placing the task
-func checkNodeResourceIsProportional(task *api.TaskInfo, node *api.NodeInfo, proportional map[v1.ResourceName]baseResource) (bool, error) {
- for resourceName := range proportional {
- if value, found := task.Resreq.ScalarResources[resourceName]; found && value > 0 {
- return true, nil
- }
- }
- for resourceName, resourceRate := range proportional {
- if value, found := node.Idle.ScalarResources[resourceName]; found {
- cpuReserved := value * resourceRate.CPU
- memoryReserved := value * resourceRate.Memory * 1000 * 1000
- r := node.Idle.Clone()
- r = r.Sub(task.Resreq)
- if r.MilliCPU < cpuReserved || r.Memory < memoryReserved {
- return false, fmt.Errorf("proportional of resource %s check failed", resourceName)
- }
- }
- }
- return true, nil
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-type reqAction int
-
-const (
- reqSub reqAction = iota
- reqAdd
-)
-
-// Bucket is a struct used to classify tasks by affinity and anti-affinity
-type Bucket struct {
- index int
- tasks map[types.UID]*api.TaskInfo
- taskNameSet map[string]int
-
- // reqScore is score of resource
- // now, we regard 1 CPU and 1 GPU and 1Gi memory as the same score.
- reqScore float64
- request *api.Resource
-
- boundTask int
- node map[string]int
-}
-
-// NewBucket creates a new empty bucket
-func NewBucket() *Bucket {
- return &Bucket{
- index: 0,
- tasks: make(map[types.UID]*api.TaskInfo),
- taskNameSet: make(map[string]int),
-
- reqScore: 0,
- request: api.EmptyResource(),
-
- boundTask: 0,
- node: make(map[string]int),
- }
-}
-
-// CalcResReq calculates task resources request
-func (b *Bucket) CalcResReq(req *api.Resource, action reqAction) {
- if req == nil {
- return
- }
-
- cpu := req.MilliCPU
- // treat 1Mi the same as 1m cpu 1m gpu
- mem := req.Memory / 1024 / 1024
- score := cpu + mem
- for _, request := range req.ScalarResources {
- score += request
- }
-
- switch action {
- case reqSub:
- b.reqScore -= score
- b.request.Sub(req)
- case reqAdd:
- b.reqScore += score
- b.request.Add(req)
- default:
- klog.V(3).Infof("Invalid action <%v> for resource <%v>", action, req)
- }
-}
-
-// AddTask adds task into bucket
-func (b *Bucket) AddTask(taskName string, task *api.TaskInfo) {
- b.taskNameSet[taskName]++
- if task.NodeName != "" {
- b.node[task.NodeName]++
- b.boundTask++
- return
- }
-
- b.tasks[task.Pod.UID] = task
- b.CalcResReq(task.Resreq, reqAdd)
-}
-
-// TaskBound binds task to bucket
-func (b *Bucket) TaskBound(task *api.TaskInfo) {
- b.node[task.NodeName]++
- b.boundTask++
-
- delete(b.tasks, task.Pod.UID)
- b.CalcResReq(task.Resreq, reqSub)
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "fmt"
- "math"
- "sort"
- "strings"
-
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-type topologyType int
-
-const (
- selfAntiAffinity topologyType = iota
- interAntiAffinity
- selfAffinity
- interAffinity
-)
-
-// map[topologyType]priority, the larger number means the higher priority
-var affinityPriority = map[topologyType]int{
- selfAntiAffinity: 4,
- interAffinity: 3,
- selfAffinity: 2,
- interAntiAffinity: 1,
-}
-
-// JobManager is a struct used to save affinity info and buckets of a job
-type JobManager struct {
- jobID api.JobID
-
- buckets []*Bucket
- podInBucket map[types.UID]int
- podInTask map[types.UID]string
- taskOverPod map[string]map[types.UID]struct{}
-
- taskAffinityPriority map[string]int // [taskName] -> priority
- taskExistOrder map[string]int
- interAffinity map[string]map[string]struct{} // [taskName]->[taskName]
- selfAffinity map[string]struct{}
- interAntiAffinity map[string]map[string]struct{} // [taskName]->[taskName]
- selfAntiAffinity map[string]struct{}
-
- bucketMaxSize int
- nodeTaskSet map[string]map[string]int // [nodeName]->[taskName]
-}
-
-// NewJobManager creates a new job manager for job
-func NewJobManager(jobID api.JobID) *JobManager {
- return &JobManager{
- jobID: jobID,
-
- buckets: make([]*Bucket, 0),
- podInBucket: make(map[types.UID]int),
- podInTask: make(map[types.UID]string),
- taskOverPod: make(map[string]map[types.UID]struct{}),
-
- taskAffinityPriority: make(map[string]int),
- taskExistOrder: make(map[string]int),
- interAffinity: make(map[string]map[string]struct{}),
- interAntiAffinity: make(map[string]map[string]struct{}),
- selfAffinity: make(map[string]struct{}),
- selfAntiAffinity: make(map[string]struct{}),
-
- bucketMaxSize: 0,
- nodeTaskSet: make(map[string]map[string]int),
- }
-}
-
-// MarkOutOfBucket indicates task is outside of any bucket
-func (jm *JobManager) MarkOutOfBucket(uid types.UID) {
- jm.podInBucket[uid] = OutOfBucket
-}
-
-// MarkTaskHasTopology indicates task has topology settings
-func (jm *JobManager) MarkTaskHasTopology(taskName string, topoType topologyType) {
- priority := affinityPriority[topoType]
- if priority > jm.taskAffinityPriority[taskName] {
- jm.taskAffinityPriority[taskName] = priority
- }
-}
-
-// ApplyTaskTopology transforms taskTopology to matrix
-// affinity: [[a, b], [c]]
-// interAffinity:
-// a b c
-// a - x -
-// b x - -
-// c - - -
-// selfAffinity:
-// a b c
-// - - x
-func (jm *JobManager) ApplyTaskTopology(topo *TaskTopology) {
- for _, aff := range topo.Affinity {
- if len(aff) == 1 {
- taskName := aff[0]
- jm.selfAffinity[taskName] = struct{}{}
- jm.MarkTaskHasTopology(taskName, selfAffinity)
- continue
- }
- for index, src := range aff {
- for _, dst := range aff[:index] {
- addAffinity(jm.interAffinity, src, dst)
- addAffinity(jm.interAffinity, dst, src)
- }
- jm.MarkTaskHasTopology(src, interAffinity)
- }
- }
-
- for _, aff := range topo.AntiAffinity {
- if len(aff) == 1 {
- taskName := aff[0]
- jm.selfAntiAffinity[taskName] = struct{}{}
- jm.MarkTaskHasTopology(taskName, selfAntiAffinity)
- continue
- }
- for index, src := range aff {
- for _, dst := range aff[:index] {
- addAffinity(jm.interAntiAffinity, src, dst)
- addAffinity(jm.interAntiAffinity, dst, src)
- }
- jm.MarkTaskHasTopology(src, interAntiAffinity)
- }
- }
-
- length := len(topo.TaskOrder)
- for index, taskName := range topo.TaskOrder {
- jm.taskExistOrder[taskName] = length - index
- }
-}
-
-// NewBucket creates a new bucket
-func (jm *JobManager) NewBucket() *Bucket {
- bucket := NewBucket()
- bucket.index = len(jm.buckets)
- jm.buckets = append(jm.buckets, bucket)
- return bucket
-}
-
-// AddTaskToBucket adds task into bucket
-func (jm *JobManager) AddTaskToBucket(bucketIndex int, taskName string, task *api.TaskInfo) {
- bucket := jm.buckets[bucketIndex]
- jm.podInBucket[task.Pod.UID] = bucketIndex
- bucket.AddTask(taskName, task)
- if size := len(bucket.tasks) + bucket.boundTask; size > jm.bucketMaxSize {
- jm.bucketMaxSize = size
- }
-}
-
-// L compared with R, -1 for L < R, 0 for L == R, 1 for L > R
-func (jm *JobManager) taskAffinityOrder(L, R *api.TaskInfo) int {
- LTaskName := jm.podInTask[L.Pod.UID]
- RTaskName := jm.podInTask[R.Pod.UID]
-
-	// pods belonging to the same task are considered equal
- if LTaskName == RTaskName {
- return 0
- }
-
-	// use the user-defined order first
- LOrder := jm.taskExistOrder[LTaskName]
- ROrder := jm.taskExistOrder[RTaskName]
- if LOrder != ROrder {
- if LOrder > ROrder {
- return 1
- }
- return -1
- }
-
- LPriority := jm.taskAffinityPriority[LTaskName]
- RPriority := jm.taskAffinityPriority[RTaskName]
- if LPriority != RPriority {
- if LPriority > RPriority {
- return 1
- }
- return -1
- }
-
-	// all affinity settings of L and R are the same, so they are equal
- return 0
-}
-
-func (jm *JobManager) buildTaskInfo(tasks map[api.TaskID]*api.TaskInfo) []*api.TaskInfo {
- taskWithoutBucket := make([]*api.TaskInfo, 0, len(tasks))
- for _, task := range tasks {
- pod := task.Pod
-
- taskName := getTaskName(task)
- if taskName == "" {
- jm.MarkOutOfBucket(pod.UID)
- continue
- }
- if _, hasTopology := jm.taskAffinityPriority[taskName]; !hasTopology {
- jm.MarkOutOfBucket(pod.UID)
- continue
- }
-
- jm.podInTask[pod.UID] = taskName
- taskSet, ok := jm.taskOverPod[taskName]
- if !ok {
- taskSet = make(map[types.UID]struct{})
- jm.taskOverPod[taskName] = taskSet
- }
- taskSet[pod.UID] = struct{}{}
- taskWithoutBucket = append(taskWithoutBucket, task)
- }
- return taskWithoutBucket
-}
-
-func (jm *JobManager) checkTaskSetAffinity(taskName string, taskNameSet map[string]int, onlyAnti bool) int {
- bucketPodAff := 0
-
- if taskName == "" {
- return bucketPodAff
- }
-
- for taskNameInBucket, count := range taskNameSet {
- theSameTask := taskNameInBucket == taskName
-
- if !onlyAnti {
- affinity := false
- if theSameTask {
- _, affinity = jm.selfAffinity[taskName]
- } else {
- _, affinity = jm.interAffinity[taskName][taskNameInBucket]
- }
- if affinity {
- bucketPodAff += count
- }
- }
-
- antiAffinity := false
- if theSameTask {
- _, antiAffinity = jm.selfAntiAffinity[taskName]
- } else {
- _, antiAffinity = jm.interAntiAffinity[taskName][taskNameInBucket]
- }
- if antiAffinity {
- bucketPodAff -= count
- }
- }
-
- return bucketPodAff
-}
-
-func (jm *JobManager) buildBucket(taskWithOrder []*api.TaskInfo) {
- nodeBucketMapping := make(map[string]*Bucket)
-
- for _, task := range taskWithOrder {
- klog.V(5).Infof("jobID %s task with order task %s/%s", jm.jobID, task.Namespace, task.Name)
-
- var selectedBucket *Bucket
- maxAffinity := math.MinInt32
-
- taskName := getTaskName(task)
-
- if task.NodeName != "" {
- // generate bucket by node
- maxAffinity = 0
- selectedBucket = nodeBucketMapping[task.NodeName]
- } else {
- for _, bucket := range jm.buckets {
- bucketPodAff := jm.checkTaskSetAffinity(taskName, bucket.taskNameSet, false)
-
-				// choose the best-fit affinity, or balance resources between buckets
- if bucketPodAff > maxAffinity {
- maxAffinity = bucketPodAff
- selectedBucket = bucket
- } else if bucketPodAff == maxAffinity && selectedBucket != nil &&
- bucket.reqScore < selectedBucket.reqScore {
- selectedBucket = bucket
- }
- }
- }
-
- if maxAffinity < 0 || selectedBucket == nil {
- selectedBucket = jm.NewBucket()
- if task.NodeName != "" {
- nodeBucketMapping[task.NodeName] = selectedBucket
- }
- }
-
- jm.AddTaskToBucket(selectedBucket.index, taskName, task)
- }
-}
-
-// ConstructBucket builds bucket for tasks
-func (jm *JobManager) ConstructBucket(tasks map[api.TaskID]*api.TaskInfo) {
- taskWithoutBucket := jm.buildTaskInfo(tasks)
-
- o := TaskOrder{
- tasks: taskWithoutBucket,
-
- manager: jm,
- }
- sort.Sort(sort.Reverse(&o))
-
- jm.buildBucket(o.tasks)
-}
-
-// TaskBound binds task to bucket
-func (jm *JobManager) TaskBound(task *api.TaskInfo) {
- if taskName := getTaskName(task); taskName != "" {
- set, ok := jm.nodeTaskSet[task.NodeName]
- if !ok {
- set = make(map[string]int)
- jm.nodeTaskSet[task.NodeName] = set
- }
- set[taskName]++
- }
-
- bucket := jm.GetBucket(task)
- if bucket != nil {
- bucket.TaskBound(task)
- }
-}
-
-// GetBucket returns the bucket that the task has been placed into, or nil if it is out of any bucket
-func (jm *JobManager) GetBucket(task *api.TaskInfo) *Bucket {
- index, ok := jm.podInBucket[task.Pod.UID]
- if !ok || index == OutOfBucket {
- return nil
- }
-
- bucket := jm.buckets[index]
- return bucket
-}
-
-func (jm *JobManager) String() string {
- // saa: selfAntiAffinity
- // iaa: interAntiAffinity
- // sa: selfAffinity
- // ia: interAffinity
- msg := []string{
- fmt.Sprintf("%s - job %s max %d || saa: %v - iaa: %v - sa: %v - ia: %v || priority: %v - order: %v || ",
- PluginName, jm.jobID, jm.bucketMaxSize,
- jm.selfAntiAffinity, jm.interAntiAffinity,
- jm.selfAffinity, jm.interAffinity,
- jm.taskAffinityPriority, jm.taskExistOrder,
- ),
- }
-
- for _, bucket := range jm.buckets {
- bucketMsg := fmt.Sprintf("b:%d -- ", bucket.index)
- var info []string
- for _, task := range bucket.tasks {
- info = append(info, task.Pod.Name)
- }
- bucketMsg += strings.Join(info, ", ")
- bucketMsg += "|"
-
- info = nil
- for nodeName, count := range bucket.node {
- info = append(info, fmt.Sprintf("n%s-%d", nodeName, count))
- }
- bucketMsg += strings.Join(info, ", ")
-
- msg = append(msg, "["+bucketMsg+"]")
- }
- return strings.Join(msg, " ")
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "fmt"
- "strings"
- "time"
-
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
-)
-
-type taskTopologyPlugin struct {
- arguments framework.Arguments
-
- weight int
- managers map[api.JobID]*JobManager
-}
-
-// New function returns taskTopologyPlugin object
-func New(arguments framework.Arguments) framework.Plugin {
- return &taskTopologyPlugin{
- arguments: arguments,
-
- weight: calculateWeight(arguments),
- managers: make(map[api.JobID]*JobManager),
- }
-}
-
-func (p *taskTopologyPlugin) Name() string {
- return PluginName
-}
-
-// TaskOrderFn returns -1 to make l prior to r.
-//
-// for example:
-// A:
-// | bucket1 | bucket2 | out of bucket
-// | a1 a3 | a2 | a4
-// B:
-// | bucket1 | out of bucket
-// | b1 b2 | b3
-// the right task order should be:
-// a1 a3 a2 b1 b2 a4 b3
-func (p *taskTopologyPlugin) TaskOrderFn(l interface{}, r interface{}) int {
- lv, ok := l.(*api.TaskInfo)
- if !ok {
- klog.Errorf("Object is not a taskinfo")
- }
- rv, ok := r.(*api.TaskInfo)
- if !ok {
- klog.Errorf("Object is not a taskinfo")
- }
-
- lvJobManager := p.managers[lv.Job]
- rvJobManager := p.managers[rv.Job]
-
- var lvBucket, rvBucket *Bucket
- if lvJobManager != nil {
- lvBucket = lvJobManager.GetBucket(lv)
- } else {
- klog.V(4).Infof("No job manager for job <ID: %s>, do not return task order.", lv.Job)
- return 0
- }
- if rvJobManager != nil {
- rvBucket = rvJobManager.GetBucket(rv)
- } else {
- klog.V(4).Infof("No job manager for job <ID: %s>, do not return task order.", rv.Job)
- return 0
- }
-
-	// the one that has a bucket always takes priority over the other
- lvInBucket := lvBucket != nil
- rvInBucket := rvBucket != nil
- if lvInBucket != rvInBucket {
- if lvInBucket {
- return -1
- }
- return 1
- }
-
-	// comparison between jobs is not the duty of this plugin
- if lv.Job != rv.Job {
- return 0
- }
-
-	// tasks out of any bucket have no order
- if !lvInBucket && !rvInBucket {
- return 0
- }
-
-	// the bigger bucket should take priority over the smaller one
- lvHasTask := len(lvBucket.tasks)
- rvHasTask := len(rvBucket.tasks)
- if lvHasTask != rvHasTask {
- if lvHasTask > rvHasTask {
- return -1
- }
- return 1
- }
-
- lvBucketIndex := lvBucket.index
- rvBucketIndex := rvBucket.index
- // in the same bucket, the affinityOrder is ok
- if lvBucketIndex == rvBucketIndex {
- affinityOrder := lvJobManager.taskAffinityOrder(lv, rv)
- return -affinityOrder
- }
-
-	// the older bucket should take priority over the younger one
- if lvBucketIndex < rvBucketIndex {
- return -1
- }
- return 1
-}
-
-func (p *taskTopologyPlugin) calcBucketScore(task *api.TaskInfo, node *api.NodeInfo) (int, *JobManager, error) {
-	// the task could never fit on the node
- maxResource := node.Idle.Clone().Add(node.Releasing)
- if req := task.Resreq; req != nil && maxResource.LessPartly(req, api.Zero) {
- return 0, nil, nil
- }
-
- jobManager, hasManager := p.managers[task.Job]
- if !hasManager {
- return 0, nil, nil
- }
-
- bucket := jobManager.GetBucket(task)
- // task out of bucket
- if bucket == nil {
- return 0, jobManager, nil
- }
-
- // 1. bound task in bucket is the base score of this node
- score := bucket.node[node.Name]
-
- // 2. task inter/self anti-affinity should be calculated
- if nodeTaskSet := jobManager.nodeTaskSet[node.Name]; nodeTaskSet != nil {
- taskName := getTaskName(task)
- affinityScore := jobManager.checkTaskSetAffinity(taskName, nodeTaskSet, true)
- if affinityScore < 0 {
- score += affinityScore
- }
- }
- klog.V(4).Infof("task %s/%s, node %s, additional score %d, task %d",
- task.Namespace, task.Name, node.Name, score, len(bucket.tasks))
-
-	// 3. the other tasks in the bucket are taken into consideration
- score += len(bucket.tasks)
- if bucket.request == nil || bucket.request.LessEqual(maxResource, api.Zero) {
- return score, jobManager, nil
- }
-
- remains := bucket.request.Clone()
-	// randomly (by map iteration order) take tasks out until the bucket fits the node
- for bucketTaskID, bucketTask := range bucket.tasks {
-		// the current task should be kept in the bucket
- if bucketTaskID == task.Pod.UID || bucketTask.Resreq == nil {
- continue
- }
- remains.Sub(bucketTask.Resreq)
- score--
- if remains.LessEqual(maxResource, api.Zero) {
- break
- }
- }
-	// here, the bucket's remaining request will always fit maxResource
- return score, jobManager, nil
-}
-
-func (p *taskTopologyPlugin) NodeOrderFn(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- score, jobManager, err := p.calcBucketScore(task, node)
- if err != nil {
- return 0, err
- }
- fScore := float64(score * p.weight)
- if jobManager != nil && jobManager.bucketMaxSize != 0 {
- fScore = fScore * float64(v1alpha1.MaxNodeScore) / float64(jobManager.bucketMaxSize)
- }
- klog.V(4).Infof("task %s/%s at node %s has bucket score %d, score %f",
- task.Namespace, task.Name, node.Name, score, fScore)
- return fScore, nil
-}
-
-func (p *taskTopologyPlugin) AllocateFunc(event *framework.Event) {
- task := event.Task
-
- jobManager, hasManager := p.managers[task.Job]
- if !hasManager {
- return
- }
- jobManager.TaskBound(task)
-}
-
-func (p *taskTopologyPlugin) initBucket(ssn *framework.Session) {
- for jobID, job := range ssn.Jobs {
- if noPendingTasks(job) {
- klog.V(4).Infof("No pending tasks in job <%s/%s> by plugin %s.",
- job.Namespace, job.Name, PluginName)
- continue
- }
-
- jobTopology, err := readTopologyFromPgAnnotations(job)
- if err != nil {
- klog.V(4).Infof("Failed to read task topology from job <%s/%s> annotations, error: %s.",
- job.Namespace, job.Name, err.Error())
- continue
- }
- if jobTopology == nil {
- continue
- }
-
- manager := NewJobManager(jobID)
- manager.ApplyTaskTopology(jobTopology)
- manager.ConstructBucket(job.Tasks)
-
- p.managers[job.UID] = manager
- }
-}
-
-func affinityCheck(job *api.JobInfo, affinity [][]string) error {
- if job == nil || affinity == nil {
- return fmt.Errorf("empty input, job: %v, affinity: %v", job, affinity)
- }
-
- var taskNumber = len(job.Tasks)
- var taskRef = make(map[string]bool, taskNumber)
- for _, task := range job.Tasks {
- tmpStrings := strings.Split(task.Name, "-")
- if _, exist := taskRef[tmpStrings[len(tmpStrings)-2]]; !exist {
- taskRef[tmpStrings[len(tmpStrings)-2]] = true
- }
- }
-
- for _, aff := range affinity {
- affTasks := make(map[string]bool, len(aff))
- for _, task := range aff {
- if len(task) == 0 {
- continue
- }
- if _, exist := taskRef[task]; !exist {
-				return fmt.Errorf("task %s does not exist in job <%s/%s>", task, job.Namespace, job.Name)
- }
- if _, exist := affTasks[task]; exist {
- return fmt.Errorf("task %s is duplicated in job <%s/%s>", task, job.Namespace, job.Name)
- }
- affTasks[task] = true
- }
- }
-
- return nil
-}
-
-func splitAnnotations(job *api.JobInfo, annotation string) ([][]string, error) {
- affinityStr := strings.Split(annotation, ";")
- if len(affinityStr) == 0 {
- return nil, nil
- }
- var affinity = make([][]string, len(affinityStr))
- for i, str := range affinityStr {
- affinity[i] = strings.Split(str, ",")
- }
- if err := affinityCheck(job, affinity); err != nil {
- klog.V(4).Infof("Job <%s/%s> affinity key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- return affinity, nil
-}
-
-func readTopologyFromPgAnnotations(job *api.JobInfo) (*TaskTopology, error) {
- jobAffinityStr, affinityExist := job.PodGroup.Annotations[JobAffinityAnnotations]
- jobAntiAffinityStr, antiAffinityExist := job.PodGroup.Annotations[JobAntiAffinityAnnotations]
- taskOrderStr, taskOrderExist := job.PodGroup.Annotations[TaskOrderAnnotations]
-
- if !(affinityExist || antiAffinityExist || taskOrderExist) {
- return nil, nil
- }
-
- var jobTopology = TaskTopology{
- Affinity: nil,
- AntiAffinity: nil,
- TaskOrder: nil,
- }
-
- if affinityExist {
- affinities, err := splitAnnotations(job, jobAffinityStr)
- if err != nil {
- klog.V(4).Infof("Job <%s/%s> affinity key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- jobTopology.Affinity = affinities
- }
-
- if antiAffinityExist {
- affinities, err := splitAnnotations(job, jobAntiAffinityStr)
- if err != nil {
- klog.V(4).Infof("Job <%s/%s> anti affinity key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- jobTopology.AntiAffinity = affinities
- }
-
- if taskOrderExist {
- jobTopology.TaskOrder = strings.Split(taskOrderStr, ",")
- if err := affinityCheck(job, [][]string{jobTopology.TaskOrder}); err != nil {
- klog.V(4).Infof("Job <%s/%s> task order key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- }
-
- return &jobTopology, nil
-}
-
-func (p *taskTopologyPlugin) OnSessionOpen(ssn *framework.Session) {
- start := time.Now()
- klog.V(3).Infof("start to init task topology plugin, weight[%d], defined order %v", p.weight, affinityPriority)
-
- p.initBucket(ssn)
-
- ssn.AddTaskOrderFn(p.Name(), p.TaskOrderFn)
-
- ssn.AddNodeOrderFn(p.Name(), p.NodeOrderFn)
-
- ssn.AddEventHandler(&framework.EventHandler{
- AllocateFunc: p.AllocateFunc,
- })
-
- klog.V(3).Infof("finished to init task topology plugin, using time %v", time.Since(start))
-}
-
-func (p *taskTopologyPlugin) OnSessionClose(ssn *framework.Session) {
- p.managers = nil
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin
- PluginName = "task-topology"
- // PluginWeight is task-topology plugin weight in nodeOrderFn
- PluginWeight = "task-topology.weight"
- // JobAffinityKey is the key to read in task-topology arguments from job annotations
- JobAffinityKey = "volcano.sh/task-topology"
- // OutOfBucket indicates task is outside of any bucket
- OutOfBucket = -1
-
- // JobAffinityAnnotations is the key to read in task-topology affinity arguments from podgroup annotations
- JobAffinityAnnotations = "volcano.sh/task-topology-affinity"
- // JobAntiAffinityAnnotations is the key to read in task-topology anti-affinity arguments from podgroup annotations
- JobAntiAffinityAnnotations = "volcano.sh/task-topology-anti-affinity"
- // TaskOrderAnnotations is the key to read in task-topology task order arguments from podgroup annotations
- TaskOrderAnnotations = "volcano.sh/task-topology-task-order"
-)
-
-// TaskTopology is struct used to save affinity infos of a job read from job plugin or annotations
-type TaskTopology struct {
- Affinity [][]string `json:"affinity,omitempty"`
- AntiAffinity [][]string `json:"antiAffinity,omitempty"`
- TaskOrder []string `json:"taskOrder,omitempty"`
-}
-
-func calculateWeight(args framework.Arguments) int {
- /*
-	   Users should give the task-topology weight in this format (task-topology.weight).
-
- actions: "enqueue, reclaim, allocate, backfill, preempt"
- tiers:
- - plugins:
- - name: task-topology
- arguments:
- task-topology.weight: 10
- */
- // Values are initialized to 1.
- weight := 1
-
- args.GetInt(&weight, PluginWeight)
-
- return weight
-}
-
-func getTaskName(task *api.TaskInfo) string {
- return task.Pod.Annotations[v1alpha1.TaskSpecKey]
-}
-
-func addAffinity(m map[string]map[string]struct{}, src, dst string) {
- srcMap, ok := m[src]
- if !ok {
- srcMap = make(map[string]struct{})
- m[src] = srcMap
- }
- srcMap[dst] = struct{}{}
-}
-
-func noPendingTasks(job *api.JobInfo) bool {
- return len(job.TaskStatusIndex[api.Pending]) == 0
-}
-
-// TaskOrder is a struct used to save task order
-type TaskOrder struct {
- tasks []*api.TaskInfo
- manager *JobManager
-}
-
-func (p *TaskOrder) Len() int { return len(p.tasks) }
-
-func (p *TaskOrder) Swap(l, r int) {
- p.tasks[l], p.tasks[r] = p.tasks[r], p.tasks[l]
-}
-
-func (p *TaskOrder) Less(l, r int) bool {
- L := p.tasks[l]
- R := p.tasks[r]
-
- LHasNode := L.NodeName != ""
- RHasNode := R.NodeName != ""
- if LHasNode || RHasNode {
-		// the bound task has higher priority
- if LHasNode != RHasNode {
- return !LHasNode
- }
- // all bound, any order is alright
- return L.NodeName > R.NodeName
- }
-
- result := p.manager.taskAffinityOrder(L, R)
- // they have the same taskAffinity order, any order is alright
- if result == 0 {
- return L.Name > R.Name
- }
- return result < 0
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tdm
-
-import (
- "fmt"
- "strings"
- "time"
-
- "k8s.io/apimachinery/pkg/util/intstr"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- tutil "volcano.sh/volcano/pkg/scheduler/plugins/util"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin.
- PluginName = "tdm"
- // revocableZoneLayout revocable zone layout
- revocableZoneLayout = "15:04"
- revocableZoneLabelPrefix = "tdm.revocable-zone."
- evictPeriodLabel = "tdm.evict.period"
- defaultPodEvictNum = 1
-)
-
-var lastEvictAt time.Time
-
-/*
- actions: "enqueue, reclaim, allocate, preempt"
- tiers:
- - plugins:
- - name: tdm
- arguments:
- tdm.revocable-zone.rz1: 10:00-21:00
- tdm.revocable-zone.rz2: 12:00-14:00
- tdm.evict.period: 1m
-*/
-
-type tdmPlugin struct {
- revocableZone map[string]string
- // evictPeriod
- // default 1m
- evictPeriod time.Duration
-}
-
-// New function returns a tdmPlugin object
-func New(args framework.Arguments) framework.Plugin {
- revocableZone := make(map[string]string)
- evictPeriod := time.Minute
-
- for k, v := range args {
- if strings.Contains(k, revocableZoneLabelPrefix) {
- revocableZone[strings.Replace(k, revocableZoneLabelPrefix, "", 1)] = v
- }
- }
-
- if period, ok := args[evictPeriodLabel]; ok {
- if d, err := time.ParseDuration(period); err == nil {
- evictPeriod = d
- }
- }
-
- return &tdmPlugin{revocableZone, evictPeriod}
-}
-
-func (tp *tdmPlugin) Name() string {
- return PluginName
-}
-
-func parseRevocableZone(rzRaw string) (start, end time.Time, err error) {
- rzValues := strings.Split(strings.TrimSpace(rzRaw), "-")
-
- if len(rzValues) != 2 {
- err = fmt.Errorf("revocable zone %v format error", rzRaw)
- return
- }
-
- t1, err := time.Parse(revocableZoneLayout, rzValues[0])
- if err != nil {
- return
- }
-
- t2, err := time.Parse(revocableZoneLayout, rzValues[1])
- if err != nil {
- return
- }
-
- now := time.Now()
-
- start = time.Date(now.Year(), now.Month(), now.Day(), t1.Hour(), t1.Minute(), 0, 0, now.Location())
- if t1.After(t2) || t1.Equal(t2) {
- end = time.Date(now.Year(), now.Month(), now.Day()+1, t2.Hour(), t2.Minute(), 0, 0, now.Location())
- } else {
- end = time.Date(now.Year(), now.Month(), now.Day(), t2.Hour(), t2.Minute(), 0, 0, now.Location())
- }
-
- return
-}
-
-func (tp *tdmPlugin) availableRevocableZone(rz string) error {
- // rzRaw format 00:00-23:59
- rzRaw, ok := tp.revocableZone[rz]
- if !ok {
-		return fmt.Errorf("revocable zone %v is not supported", rz)
- }
-
- now := time.Now()
-
- start, end, err := parseRevocableZone(rzRaw)
- if err != nil {
- return err
- }
-
- if now.Unix() < start.Unix() || now.Unix() > end.Unix() {
-		return fmt.Errorf("current time is beyond revocable zone %v:%v", rz, rzRaw)
- }
-
- return nil
-}
-
-func (tp *tdmPlugin) OnSessionOpen(ssn *framework.Session) {
- klog.V(4).Infof("Enter tdm plugin ...")
- if klog.V(4) {
- defer func() {
- klog.V(4).Infof("Leaving tdm plugin.")
- }()
- }
-
-	// the tdm plugin only handles revocable nodes
- predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) error {
- if node.RevocableZone == "" {
- return nil
- }
-
- if err := tp.availableRevocableZone(node.RevocableZone); err != nil {
- return fmt.Errorf("plugin %s predicates %w", tp.Name(), err)
- }
-
- klog.V(4).Infof("TDM node %v revocable zone %v:%v is active", node.Name, node.RevocableZone, tp.revocableZone[node.RevocableZone])
-
- if len(task.RevocableZone) == 0 {
-			msg := fmt.Sprintf("task %s/%s is not allowed to dispatch to revocable node %s", task.Namespace, task.Name, node.Name)
- return fmt.Errorf("plugin %s predicates %s", tp.Name(), msg)
- }
-
- klog.V(4).Infof("TDM filter for Task %s/%s on node %s pass.", task.Namespace, task.Name, node.Name)
- return nil
- }
-
-	// the tdm plugin only handles revocable nodes
- nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- score := 0.0
-
- if node.RevocableZone == "" {
- return score, nil
- }
-
- if err := tp.availableRevocableZone(node.RevocableZone); err != nil {
- klog.V(4).Infof("TDM not available %s", err)
- return score, err
- }
-
- if len(task.RevocableZone) == 0 {
-			klog.V(4).Infof("TDM task %s/%s is not allowed to dispatch to revocable node %s", task.Namespace, task.Name, node.Name)
- return score, nil
- }
-
- score = float64(v1alpha1.MaxNodeScore)
-
- klog.V(4).Infof("TDM score for Task %s/%s on node %s is: %v", task.Namespace, task.Name, node.Name, score)
- return score, nil
- }
-
- preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
-		// preemptable workloads, or workloads that can use a revocable zone, cannot preempt other tasks.
- if preemptor.Preemptable || len(preemptor.RevocableZone) > 0 {
-			klog.V(4).Infof("TDM task %s/%s is preemptable, skip preemption", preemptor.Namespace, preemptor.Name)
- return nil, tutil.Reject
- }
-
- var victims []*api.TaskInfo
- tasksMap := make(map[api.JobID][]*api.TaskInfo)
-
-		// find preemptable tasks that are running on non-revocable nodes
- for _, task := range preemptees {
- if !task.Preemptable || task.Status != api.Running {
- continue
- }
-
- node, ok := ssn.Nodes[task.NodeName]
- if !ok {
- continue
- }
-
- if node.RevocableZone != "" {
- continue
- }
-
- tasksMap[task.Job] = append(tasksMap[task.Job], task)
- }
-
- for jobID, preemptableTasks := range tasksMap {
- if job, ok := ssn.Jobs[jobID]; ok {
- victims = append(victims, tp.maxVictims(job, preemptableTasks)...)
- }
- }
-
- klog.V(4).Infof("TDM victims are %+v", victims)
-
- return victims, tutil.Permit
- }
-
- victimsFn := func() []*api.TaskInfo {
- if lastEvictAt.Add(tp.evictPeriod).After(time.Now()) {
-			klog.V(4).Infof("TDM next evict time at %v", lastEvictAt.Add(tp.evictPeriod))
- return nil
- }
-
- klog.V(4).Infof("TDM start to find victims")
-
-		// find preemptable tasks on nodes whose revocable zone has timed out
- victims := make([]*api.TaskInfo, 0)
- for rz := range tp.revocableZone {
- if err := tp.availableRevocableZone(rz); err != nil {
-				klog.V(4).Infof("TDM revocable zone %v inactive, %v", rz, err)
-				// the revocable zone is inactive, so evict preemptable tasks, grouped by job, from its revocable nodes
-				for jobID, preemptableTasks := range tp.revocableNodePreemptableTask(rz, ssn) {
-					if job, ok := ssn.Jobs[jobID]; ok {
-						victims = append(victims, tp.maxVictims(job, preemptableTasks)...)
- }
- }
- }
- }
-
- // need to consider concurrency?
- lastEvictAt = time.Now()
-
- klog.V(4).Infof("TDM got %v victims", len(victims))
-
- return victims
- }
-
- jobOrderFn := func(l, r interface{}) int {
- lv := l.(*api.JobInfo)
- rv := r.(*api.JobInfo)
-
- if lv.Preemptable == rv.Preemptable {
- return 0
- }
-
- if !lv.Preemptable {
- return -1
- }
-
- return 1
- }
-
- jobPipelinedFn := func(obj interface{}) int {
- jobInfo := obj.(*api.JobInfo)
- occupied := jobInfo.WaitingTaskNum() + jobInfo.ReadyTaskNum()
- if occupied >= jobInfo.MinAvailable {
- return tutil.Permit
- }
- return tutil.Reject
- }
-
- jobStarvingFn := func(obj interface{}) bool {
- jobInfo := obj.(*api.JobInfo)
-		// allow non-preemptable elastic jobs (e.g. Deployments) to preempt tasks
- if jobInfo.Preemptable {
- return false
- }
- return len(jobInfo.TaskStatusIndex[api.Pending]) > 0
- }
-
- ssn.AddPredicateFn(tp.Name(), predicateFn)
- ssn.AddNodeOrderFn(tp.Name(), nodeOrderFn)
- ssn.AddPreemptableFn(tp.Name(), preemptableFn)
- ssn.AddVictimTasksFns(tp.Name(), victimsFn)
- ssn.AddJobOrderFn(tp.Name(), jobOrderFn)
- ssn.AddJobPipelinedFn(tp.Name(), jobPipelinedFn)
- ssn.AddJobStarvingFns(tp.Name(), jobStarvingFn)
-}
-
-func (tp *tdmPlugin) maxVictims(job *api.JobInfo, victims []*api.TaskInfo) []*api.TaskInfo {
- maxPodEvictNum := tp.getMaxPodEvictNum(job)
- targetNum := util.GetMinInt(maxPodEvictNum, len(victims))
- klog.V(3).Infof("Job <%s/%s> max evict:%v, potential victims number:%v, max victims number:%v",
- job.Namespace, job.Name, maxPodEvictNum, len(victims), targetNum)
-
- return victims[:targetNum]
-}
-
-// get the max pod eviction number from the job's budget configuration
-func (tp *tdmPlugin) getMaxPodEvictNum(job *api.JobInfo) int {
- jobRunningTaskNum := len(job.TaskStatusIndex[api.Running])
- if job.Budget.MaxUnavilable != "" {
- maxUnavilable := tp.parseIntStr(job.Budget.MaxUnavilable, len(job.Tasks))
- finalTaskNum := len(job.TaskStatusIndex[api.Succeeded]) + len(job.TaskStatusIndex[api.Failed])
- realUnavilable := len(job.Tasks) - finalTaskNum - jobRunningTaskNum
- if realUnavilable >= maxUnavilable {
- return 0
- }
- return maxUnavilable - realUnavilable
- }
-
- if job.Budget.MinAvailable != "" {
- minAvailable := tp.parseIntStr(job.Budget.MinAvailable, len(job.Tasks))
- if jobRunningTaskNum >= minAvailable {
- return jobRunningTaskNum - minAvailable
- }
- }
-
- return defaultPodEvictNum
-}
-
-func (tp *tdmPlugin) parseIntStr(input string, taskNum int) int {
- resultValue := 0
- tmp := intstr.Parse(input)
- switch tmp.Type {
- case intstr.Int:
- resultValue = tmp.IntValue()
- case intstr.String:
- if v, err := intstr.GetValueFromIntOrPercent(&tmp, taskNum, true); err == nil {
- resultValue = v
- } else {
- klog.Warningf("TDM get percent value err: %v", err)
- }
- }
-
- return resultValue
-}
-
-func (tp *tdmPlugin) revocableNodePreemptableTask(rz string, ssn *framework.Session) map[api.JobID][]*api.TaskInfo {
- tasksMap := make(map[api.JobID][]*api.TaskInfo)
- for _, node := range ssn.RevocableNodes {
- if node.RevocableZone != rz {
- continue
- }
-
- for _, task := range node.Tasks {
- if task.Preemptable {
- if task.Status == api.Running {
- tasksMap[task.Job] = append(tasksMap[task.Job], task)
- }
- }
- }
- }
-
- return tasksMap
-}
-
-func (tp *tdmPlugin) OnSessionClose(ssn *framework.Session) {}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package scheduler
-
-import (
- "fmt"
- "path/filepath"
- "sync"
- "time"
-
- "github.com/fsnotify/fsnotify"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/rest"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/filewatcher"
- schedcache "volcano.sh/volcano/pkg/scheduler/cache"
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-// Scheduler watches for new unscheduled pods for volcano. It attempts to find
-// nodes that they fit on and writes bindings back to the api server.
-type Scheduler struct {
- cache schedcache.Cache
- schedulerConf string
- fileWatcher filewatcher.FileWatcher
- schedulePeriod time.Duration
- once sync.Once
-
- mutex sync.Mutex
- actions []framework.Action
- plugins []conf.Tier
- configurations []conf.Configuration
-}
-
-// NewScheduler returns a scheduler
-func NewScheduler(
- config *rest.Config,
- schedulerName string,
- schedulerConf string,
- period time.Duration,
- defaultQueue string,
-) (*Scheduler, error) {
- var watcher filewatcher.FileWatcher
- if schedulerConf != "" {
- var err error
- path := filepath.Dir(schedulerConf)
- watcher, err = filewatcher.NewFileWatcher(path)
- if err != nil {
- return nil, fmt.Errorf("failed creating filewatcher for %s: %v", schedulerConf, err)
- }
- }
-
- scheduler := &Scheduler{
- schedulerConf: schedulerConf,
- fileWatcher: watcher,
- cache: schedcache.New(config, schedulerName, defaultQueue),
- schedulePeriod: period,
- }
-
- return scheduler, nil
-}
-
-// Run runs the Scheduler
-func (pc *Scheduler) Run(stopCh <-chan struct{}) {
- pc.loadSchedulerConf()
- go pc.watchSchedulerConf(stopCh)
- // Start cache for policy.
- go pc.cache.Run(stopCh)
- pc.cache.WaitForCacheSync(stopCh)
- go wait.Until(pc.runOnce, pc.schedulePeriod, stopCh)
-}
-
-func (pc *Scheduler) runOnce() {
- klog.V(4).Infof("Start scheduling ...")
- scheduleStartTime := time.Now()
- defer klog.V(4).Infof("End scheduling ...")
-
- pc.mutex.Lock()
- actions := pc.actions
- plugins := pc.plugins
- configurations := pc.configurations
- pc.mutex.Unlock()
-
- ssn := framework.OpenSession(pc.cache, plugins, configurations)
- defer framework.CloseSession(ssn)
-
- for _, action := range actions {
- actionStartTime := time.Now()
- action.Execute(ssn)
- metrics.UpdateActionDuration(action.Name(), metrics.Duration(actionStartTime))
- }
- metrics.UpdateE2eDuration(metrics.Duration(scheduleStartTime))
-}
-
-func (pc *Scheduler) loadSchedulerConf() {
- var err error
- pc.once.Do(func() {
- pc.actions, pc.plugins, pc.configurations, err = unmarshalSchedulerConf(defaultSchedulerConf)
- if err != nil {
- klog.Errorf("unmarshal scheduler config %s failed: %v", defaultSchedulerConf, err)
- panic("invalid default configuration")
- }
- })
-
- var config string
- if len(pc.schedulerConf) != 0 {
- if config, err = readSchedulerConf(pc.schedulerConf); err != nil {
- klog.Errorf("Failed to read scheduler configuration '%s', using previous configuration: %v",
- pc.schedulerConf, err)
- return
- }
- }
-
- actions, plugins, configurations, err := unmarshalSchedulerConf(config)
- if err != nil {
- klog.Errorf("scheduler config %s is invalid: %v", config, err)
- return
- }
-
- pc.mutex.Lock()
- // If it is valid, use the new configuration
- pc.actions = actions
- pc.plugins = plugins
- pc.configurations = configurations
- pc.mutex.Unlock()
-}
-
-func (pc *Scheduler) watchSchedulerConf(stopCh <-chan struct{}) {
- if pc.fileWatcher == nil {
- return
- }
- eventCh := pc.fileWatcher.Events()
- errCh := pc.fileWatcher.Errors()
- for {
- select {
- case event, ok := <-eventCh:
- if !ok {
- return
- }
- klog.V(4).Infof("watch %s event: %v", pc.schedulerConf, event)
- if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create {
- pc.loadSchedulerConf()
- }
- case err, ok := <-errCh:
- if !ok {
- return
- }
- klog.Infof("watch %s error: %v", pc.schedulerConf, err)
- case <-stopCh:
- return
- }
- }
-}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package scheduler
-
-import (
- "fmt"
- "io/ioutil"
- "strings"
-
- "gopkg.in/yaml.v2"
-
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/plugins"
-)
-
-var defaultSchedulerConf = `
-actions: "enqueue, allocate, backfill"
-tiers:
-- plugins:
- - name: priority
- - name: gang
- - name: conformance
-- plugins:
- - name: overcommit
- - name: drf
- - name: predicates
- - name: proportion
- - name: nodeorder
-`
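-
-// Illustrative note (not part of the original file): a user-supplied scheduler
-// configuration follows the same SchedulerConfiguration YAML shape as
-// defaultSchedulerConf above, for example:
-//
-//	actions: "enqueue, allocate, backfill"
-//	tiers:
-//	- plugins:
-//	  - name: priority
-//	  - name: gang
-//	- plugins:
-//	  - name: drf
-//	  - name: proportion
-//	  - name: nodeorder
-//
-// unmarshalSchedulerConf below parses such a document into actions, tiers and
-// configurations; the concrete action list and plugin selection here are only an example.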
-
-func unmarshalSchedulerConf(confStr string) ([]framework.Action, []conf.Tier, []conf.Configuration, error) {
- var actions []framework.Action
-
- schedulerConf := &conf.SchedulerConfiguration{}
-
- if err := yaml.Unmarshal([]byte(confStr), schedulerConf); err != nil {
- return nil, nil, nil, err
- }
- // Set default settings for each plugin if not set
- for i, tier := range schedulerConf.Tiers {
- // drf with hierarchy enabled
- hdrf := false
- // proportion enabled
- proportion := false
- for j := range tier.Plugins {
- if tier.Plugins[j].Name == "drf" &&
- tier.Plugins[j].EnabledHierarchy != nil &&
- *tier.Plugins[j].EnabledHierarchy {
- hdrf = true
- }
- if tier.Plugins[j].Name == "proportion" {
- proportion = true
- }
- plugins.ApplyPluginConfDefaults(&schedulerConf.Tiers[i].Plugins[j])
- }
- if hdrf && proportion {
-			return nil, nil, nil, fmt.Errorf("the proportion plugin and drf with hierarchy enabled conflict")
- }
- }
-
- actionNames := strings.Split(schedulerConf.Actions, ",")
- for _, actionName := range actionNames {
- if action, found := framework.GetAction(strings.TrimSpace(actionName)); found {
- actions = append(actions, action)
- } else {
-			return nil, nil, nil, fmt.Errorf("failed to find action %s", actionName)
- }
- }
-
- return actions, schedulerConf.Tiers, schedulerConf.Configurations, nil
-}
-
-func readSchedulerConf(confPath string) (string, error) {
- dat, err := ioutil.ReadFile(confPath)
- if err != nil {
- return "", err
- }
- return string(dat), nil
-}
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "container/heap"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// PriorityQueue implements a scheduling queue.
-type PriorityQueue struct {
- queue priorityQueue
-}
-
-type priorityQueue struct {
- items []interface{}
- lessFn api.LessFn
-}
-
-// NewPriorityQueue returns a PriorityQueue
-func NewPriorityQueue(lessFn api.LessFn) *PriorityQueue {
- return &PriorityQueue{
- queue: priorityQueue{
- items: make([]interface{}, 0),
- lessFn: lessFn,
- },
- }
-}
-
-// Push pushes an element into the priority queue
-func (q *PriorityQueue) Push(it interface{}) {
- heap.Push(&q.queue, it)
-}
-
-// Pop pops an element from the priority queue
-func (q *PriorityQueue) Pop() interface{} {
- if q.Len() == 0 {
- return nil
- }
-
- return heap.Pop(&q.queue)
-}
-
-// Empty checks whether the queue is empty
-func (q *PriorityQueue) Empty() bool {
- return q.queue.Len() == 0
-}
-
-// Len returns the length of the priority queue
-func (q *PriorityQueue) Len() int {
- return q.queue.Len()
-}
-
-func (pq *priorityQueue) Len() int { return len(pq.items) }
-
-func (pq *priorityQueue) Less(i, j int) bool {
- if pq.lessFn == nil {
- return i < j
- }
-
- // We want Pop to give us the highest, not lowest, priority so we use greater than here.
- return pq.lessFn(pq.items[i], pq.items[j])
-}
-
-func (pq priorityQueue) Swap(i, j int) {
- pq.items[i], pq.items[j] = pq.items[j], pq.items[i]
-}
-
-func (pq *priorityQueue) Push(x interface{}) {
- (*pq).items = append((*pq).items, x)
-}
-
-func (pq *priorityQueue) Pop() interface{} {
- old := (*pq).items
- n := len(old)
- item := old[n-1]
- (*pq).items = old[0 : n-1]
- return item
-}
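-
-// Illustrative usage sketch (not part of the original file); the int elements and the
-// ascending LessFn below are assumptions chosen only for demonstration.
-//
-//	q := NewPriorityQueue(func(l, r interface{}) bool {
-//		return l.(int) < r.(int) // smaller values pop first with this LessFn
-//	})
-//	q.Push(3)
-//	q.Push(1)
-//	first := q.Pop() // 1, given the LessFn above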
-
-
-
/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "context"
- "fmt"
- "math"
- "math/rand"
- "sort"
- "sync"
- "sync/atomic"
-
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/cmd/scheduler/app/options"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-const baselinePercentageOfNodesToFind = 50
-
-var lastProcessedNodeIndex int
-
-// Reservation is used to record target job and locked nodes
-var Reservation *ResourceReservation
-
-func init() {
- Reservation = NewResourceReservation()
-}
-
-// CalculateNumOfFeasibleNodesToFind returns the number of feasible nodes that, once found,
-// causes the scheduler to stop searching for more feasible nodes.
-func CalculateNumOfFeasibleNodesToFind(numAllNodes int32) (numNodes int32) {
- opts := options.ServerOpts
- if numAllNodes <= opts.MinNodesToFind || opts.PercentageOfNodesToFind >= 100 {
- return numAllNodes
- }
-
- adaptivePercentage := opts.PercentageOfNodesToFind
- if adaptivePercentage <= 0 {
- adaptivePercentage = baselinePercentageOfNodesToFind - numAllNodes/125
- if adaptivePercentage < opts.MinPercentageOfNodesToFind {
- adaptivePercentage = opts.MinPercentageOfNodesToFind
- }
- }
-
- numNodes = numAllNodes * adaptivePercentage / 100
- if numNodes < opts.MinNodesToFind {
- numNodes = opts.MinNodesToFind
- }
- return numNodes
-}
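-
-// Worked example (illustrative, with assumed option values): for a 5000-node cluster with
-// MinNodesToFind=100, PercentageOfNodesToFind=0 and MinPercentageOfNodesToFind=5, the
-// adaptive percentage is 50 - 5000/125 = 10, so the search stops once
-// 5000 * 10 / 100 = 500 feasible nodes have been found.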
-
-// PredicateNodes returns the specified number of nodes that fit a task
-func PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn) ([]*api.NodeInfo, *api.FitErrors) {
- //var workerLock sync.Mutex
-
- var errorLock sync.Mutex
- fe := api.NewFitErrors()
-
- allNodes := len(nodes)
- if allNodes == 0 {
- return make([]*api.NodeInfo, 0), fe
- }
- numNodesToFind := CalculateNumOfFeasibleNodesToFind(int32(allNodes))
-
- //allocate enough space to avoid growing it
- predicateNodes := make([]*api.NodeInfo, numNodesToFind)
-
- numFoundNodes := int32(0)
- processedNodes := int32(0)
-
- //create a context with cancellation
- ctx, cancel := context.WithCancel(context.Background())
-
- checkNode := func(index int) {
-		// Check the nodes starting from where the previous scheduling cycle left off,
-		// to make sure all nodes have the same chance of being examined across pods.
- node := nodes[(lastProcessedNodeIndex+index)%allNodes]
- atomic.AddInt32(&processedNodes, 1)
- klog.V(4).Infof("Considering Task <%v/%v> on node <%v>: <%v> vs. <%v>",
- task.Namespace, task.Name, node.Name, task.Resreq, node.Idle)
-
- // TODO (k82cn): Enable eCache for performance improvement.
- if err := fn(task, node); err != nil {
- klog.V(3).Infof("Predicates failed for task <%s/%s> on node <%s>: %v",
- task.Namespace, task.Name, node.Name, err)
- errorLock.Lock()
- fe.SetNodeError(node.Name, err)
- errorLock.Unlock()
- return
- }
-
-		// check whether the number of found nodes exceeds numNodesToFind
- length := atomic.AddInt32(&numFoundNodes, 1)
- if length > numNodesToFind {
- cancel()
- atomic.AddInt32(&numFoundNodes, -1)
- } else {
- predicateNodes[length-1] = node
- }
- }
-
- //workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), checkNode)
- workqueue.ParallelizeUntil(ctx, 16, allNodes, checkNode)
-
- //processedNodes := int(numFoundNodes) + len(filteredNodesStatuses) + len(failedPredicateMap)
- lastProcessedNodeIndex = (lastProcessedNodeIndex + int(processedNodes)) % allNodes
- predicateNodes = predicateNodes[:numFoundNodes]
- return predicateNodes, fe
-}
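-
-// Illustrative sketch (not part of the original file): PredicateNodes is driven by a
-// predicate callback; the trivial callback below is an assumption used only to show the
-// call shape.
-//
-//	feasible, fitErrors := PredicateNodes(task, nodes,
-//		func(t *api.TaskInfo, n *api.NodeInfo) error {
-//			return nil // accept every node in this sketch
-//		})
-//	_, _ = feasible, fitErrors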
-
-// PrioritizeNodes returns a map whose keys are node scores and whose values are the corresponding nodes
-func PrioritizeNodes(task *api.TaskInfo, nodes []*api.NodeInfo, batchFn api.BatchNodeOrderFn, mapFn api.NodeOrderMapFn, reduceFn api.NodeOrderReduceFn) map[float64][]*api.NodeInfo {
- pluginNodeScoreMap := map[string]k8sframework.NodeScoreList{}
- nodeOrderScoreMap := map[string]float64{}
- nodeScores := map[float64][]*api.NodeInfo{}
- var workerLock sync.Mutex
- scoreNode := func(index int) {
- node := nodes[index]
- mapScores, orderScore, err := mapFn(task, node)
- if err != nil {
- klog.Errorf("Error in Calculating Priority for the node:%v", err)
- return
- }
-
- workerLock.Lock()
- for plugin, score := range mapScores {
- nodeScoreMap, ok := pluginNodeScoreMap[plugin]
- if !ok {
- nodeScoreMap = k8sframework.NodeScoreList{}
- }
- hp := k8sframework.NodeScore{}
- hp.Name = node.Name
- hp.Score = int64(math.Floor(score))
- pluginNodeScoreMap[plugin] = append(nodeScoreMap, hp)
- }
- nodeOrderScoreMap[node.Name] = orderScore
- workerLock.Unlock()
- }
- workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), scoreNode)
- reduceScores, err := reduceFn(task, pluginNodeScoreMap)
- if err != nil {
- klog.Errorf("Error in Calculating Priority for the node:%v", err)
- return nodeScores
- }
-
- batchNodeScore, err := batchFn(task, nodes)
- if err != nil {
- klog.Errorf("Error in Calculating batch Priority for the node, err %v", err)
- return nodeScores
- }
-
- for _, node := range nodes {
- if score, found := reduceScores[node.Name]; found {
- if orderScore, ok := nodeOrderScoreMap[node.Name]; ok {
- score += orderScore
- }
- if batchScore, ok := batchNodeScore[node.Name]; ok {
- score += batchScore
- }
- nodeScores[score] = append(nodeScores[score], node)
- } else {
- // If no plugin is applied to this node, the default is 0.0
- score = 0.0
- if orderScore, ok := nodeOrderScoreMap[node.Name]; ok {
- score += orderScore
- }
- if batchScore, ok := batchNodeScore[node.Name]; ok {
- score += batchScore
- }
- nodeScores[score] = append(nodeScores[score], node)
- }
- }
- return nodeScores
-}
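-
-// Worked example (illustrative, with assumed scores): if the reduce phase gives a node
-// 10.0, its nodeOrderScoreMap entry is 2.0 and its batch score is 3.0, the node is stored
-// in nodeScores under the key 15.0; SortNodes and SelectBestNode below consume that map.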
-
-// SortNodes returns nodes in descending order of score
-func SortNodes(nodeScores map[float64][]*api.NodeInfo) []*api.NodeInfo {
- var nodesInorder []*api.NodeInfo
- var keys []float64
- for key := range nodeScores {
- keys = append(keys, key)
- }
- sort.Sort(sort.Reverse(sort.Float64Slice(keys)))
- for _, key := range keys {
- nodes := nodeScores[key]
- nodesInorder = append(nodesInorder, nodes...)
- }
- return nodesInorder
-}
-
-// SelectBestNode returns the node with the highest score, picking one randomly if several nodes share that score.
-func SelectBestNode(nodeScores map[float64][]*api.NodeInfo) *api.NodeInfo {
- var bestNodes []*api.NodeInfo
- maxScore := -1.0
- for score, nodes := range nodeScores {
- if score > maxScore {
- maxScore = score
- bestNodes = nodes
- }
- }
-
- if len(bestNodes) == 0 {
- return nil
- }
-
- return bestNodes[rand.Intn(len(bestNodes))]
-}
-
-// GetNodeList returns the NodeInfo values in 'nodes' for the node names listed in nodeList
-func GetNodeList(nodes map[string]*api.NodeInfo, nodeList []string) []*api.NodeInfo {
- result := make([]*api.NodeInfo, 0, len(nodeList))
- for _, nodename := range nodeList {
- if ni, ok := nodes[nodename]; ok {
- result = append(result, ni)
- }
- }
- return result
-}
-
-// ValidateVictims returns an error if the resources of the victims can't satisfy the preemptor
-func ValidateVictims(preemptor *api.TaskInfo, node *api.NodeInfo, victims []*api.TaskInfo) error {
- if len(victims) == 0 {
- return fmt.Errorf("no victims")
- }
- futureIdle := node.FutureIdle()
- for _, victim := range victims {
- futureIdle.Add(victim.Resreq)
- }
-	// Every resource of the preemptor needs to be less than or equal to the corresponding
-	// idle resource after preemption.
- if !preemptor.InitResreq.LessEqual(futureIdle, api.Zero) {
- return fmt.Errorf("not enough resources: requested <%v>, but future idle <%v>",
- preemptor.InitResreq, futureIdle)
- }
- return nil
-}
-
-// ResourceReservation is a struct used for resource reservation
-type ResourceReservation struct {
- TargetJob *api.JobInfo
- LockedNodes map[string]*api.NodeInfo
-}
-
-// NewResourceReservation is used to create the global instance
-func NewResourceReservation() *ResourceReservation {
- return &ResourceReservation{
- TargetJob: nil,
- LockedNodes: map[string]*api.NodeInfo{},
- }
-}
-
-// GetMinInt returns the minimum int from vals
-func GetMinInt(vals ...int) int {
- if len(vals) == 0 {
- return 0
- }
-
- min := vals[0]
- for _, val := range vals {
- if val <= min {
- min = val
- }
- }
- return min
-}
-
-
-
/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "fmt"
- "sync"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// BuildResourceList builds a resource list object
-func BuildResourceList(cpu string, memory string) v1.ResourceList {
- return v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- api.GPUResourceName: resource.MustParse("0"),
- }
-}
-
-// BuildResourceListWithGPU builds a resource list with GPU
-func BuildResourceListWithGPU(cpu string, memory string, GPU string) v1.ResourceList {
- return v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- api.GPUResourceName: resource.MustParse(GPU),
- }
-}
-
-// BuildNode builds a node object
-func BuildNode(name string, alloc v1.ResourceList, labels map[string]string) *v1.Node {
- return &v1.Node{
- ObjectMeta: metav1.ObjectMeta{
- Name: name,
- Labels: labels,
- Annotations: map[string]string{},
- },
- Status: v1.NodeStatus{
- Capacity: alloc,
- Allocatable: alloc,
- },
- }
-}
-
-// BuildPod builds a Pod object
-func BuildPod(namespace, name, nodename string, p v1.PodPhase, req v1.ResourceList, groupName string, labels map[string]string, selector map[string]string) *v1.Pod {
- return &v1.Pod{
- ObjectMeta: metav1.ObjectMeta{
- UID: types.UID(fmt.Sprintf("%v-%v", namespace, name)),
- Name: name,
- Namespace: namespace,
- Labels: labels,
- Annotations: map[string]string{
- schedulingv2.KubeGroupNameAnnotationKey: groupName,
- },
- },
- Status: v1.PodStatus{
- Phase: p,
- },
- Spec: v1.PodSpec{
- NodeName: nodename,
- NodeSelector: selector,
- Containers: []v1.Container{
- {
- Resources: v1.ResourceRequirements{
- Requests: req,
- },
- },
- },
- },
- }
-}
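-
-// Illustrative usage sketch (not part of the original file); the names, resource
-// quantities and pod group below are assumptions chosen only for demonstration.
-//
-//	node := BuildNode("n1", BuildResourceList("4", "8Gi"), map[string]string{"zone": "z1"})
-//	pod := BuildPod("default", "p1", "", v1.PodPending, BuildResourceList("1", "1Gi"),
-//		"pg1", map[string]string{}, map[string]string{})
-//	_, _ = node, pod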
-
-// FakeBinder is used as a fake binder
-type FakeBinder struct {
- sync.Mutex
- Binds map[string]string
- Channel chan string
-}
-
-// Bind is used by the fake binder struct to bind pods
-func (fb *FakeBinder) Bind(p *v1.Pod, hostname string) error {
- fb.Lock()
- defer fb.Unlock()
-
- key := fmt.Sprintf("%v/%v", p.Namespace, p.Name)
- fb.Binds[key] = hostname
-
- fb.Channel <- key
-
- return nil
-}
-
-// FakeEvictor is used as a fake evictor
-type FakeEvictor struct {
- sync.Mutex
- evicts []string
- Channel chan string
-}
-
-// Evicts returns a copy of the evicted pods.
-func (fe *FakeEvictor) Evicts() []string {
- fe.Lock()
- defer fe.Unlock()
- return append([]string{}, fe.evicts...)
-}
-
-// Evict is used by the fake evictor to evict pods
-func (fe *FakeEvictor) Evict(p *v1.Pod, reason string) error {
- fe.Lock()
- defer fe.Unlock()
-
- fmt.Println("PodName: ", p.Name)
- key := fmt.Sprintf("%v/%v", p.Namespace, p.Name)
- fe.evicts = append(fe.evicts, key)
-
- fe.Channel <- key
-
- return nil
-}
-
-// FakeStatusUpdater is used for fake status update
-type FakeStatusUpdater struct {
-}
-
-// UpdatePodCondition is an empty function
-func (ftsu *FakeStatusUpdater) UpdatePodCondition(pod *v1.Pod, podCondition *v1.PodCondition) (*v1.Pod, error) {
- // do nothing here
- return nil, nil
-}
-
-// UpdatePodGroup is an empty function
-func (ftsu *FakeStatusUpdater) UpdatePodGroup(pg *api.PodGroup) (*api.PodGroup, error) {
- // do nothing here
- return nil, nil
-}
-
-// FakeVolumeBinder is used as a fake volume binder
-type FakeVolumeBinder struct {
-}
-
-// AllocateVolumes is an empty function
-func (fvb *FakeVolumeBinder) AllocateVolumes(task *api.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
- return nil
-}
-
-// BindVolumes is an empty function
-func (fvb *FakeVolumeBinder) BindVolumes(task *api.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
- return nil
-}
-
-// GetPodVolumes is an empty function
-func (fvb *FakeVolumeBinder) GetPodVolumes(task *api.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
- return nil, nil
-}
-
-
-
/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "encoding/json"
- "fmt"
- "strconv"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-const (
- // DefaultQueue constant stores the name of the queue as "default"
- DefaultQueue = "default"
- // DefaultMaxRetry is the default number of retries.
- DefaultMaxRetry = 3
-
- defaultSchedulerName = "volcano"
-
- defaultMaxRetry int32 = 3
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/jobs/mutate",
- Func: Jobs,
-
- MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
- Webhooks: []whv1beta1.MutatingWebhook{{
- Name: "mutatejob.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{"batch.volcano.sh"},
- APIVersions: []string{"v1alpha1"},
- Resources: []string{"jobs"},
- },
- },
- },
- }},
- },
-}
-
-type patchOperation struct {
- Op string `json:"op"`
- Path string `json:"path"`
- Value interface{} `json:"value,omitempty"`
-}
-
-// Jobs mutates jobs.
-func Jobs(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("mutating jobs")
-
- job, err := schema.DecodeJob(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- var patchBytes []byte
- switch ar.Request.Operation {
- case v1beta1.Create:
- patchBytes, _ = createPatch(job)
- default:
- err = fmt.Errorf("expect operation to be 'CREATE' ")
- return util.ToAdmissionResponse(err)
- }
-
- klog.V(3).Infof("AdmissionResponse: patch=%v", string(patchBytes))
- reviewResponse := v1beta1.AdmissionResponse{
- Allowed: true,
- Patch: patchBytes,
- }
- pt := v1beta1.PatchTypeJSONPatch
- reviewResponse.PatchType = &pt
-
- return &reviewResponse
-}
-
-func createPatch(job *v1alpha1.Job) ([]byte, error) {
- var patch []patchOperation
- pathQueue := patchDefaultQueue(job)
- if pathQueue != nil {
- patch = append(patch, *pathQueue)
- }
- pathScheduler := patchDefaultScheduler(job)
- if pathScheduler != nil {
- patch = append(patch, *pathScheduler)
- }
- pathMaxRetry := patchDefaultMaxRetry(job)
- if pathMaxRetry != nil {
- patch = append(patch, *pathMaxRetry)
- }
- pathSpec := mutateSpec(job.Spec.Tasks, "/spec/tasks")
- if pathSpec != nil {
- patch = append(patch, *pathSpec)
- }
- pathMinAvailable := patchDefaultMinAvailable(job)
- if pathMinAvailable != nil {
- patch = append(patch, *pathMinAvailable)
- }
- return json.Marshal(patch)
-}
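-
-// Illustrative example (not part of the original file): for a Job created with an empty
-// queue, scheduler name and maxRetry, the generated patch contains roughly
-//
-//	[{"op":"add","path":"/spec/queue","value":"default"},
-//	 {"op":"add","path":"/spec/schedulerName","value":"volcano"},
-//	 {"op":"add","path":"/spec/maxRetry","value":3}]
-//
-// plus the task and minAvailable operations produced by mutateSpec and
-// patchDefaultMinAvailable below.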
-
-func patchDefaultQueue(job *v1alpha1.Job) *patchOperation {
- //Add default queue if not specified.
- if job.Spec.Queue == "" {
- return &patchOperation{Op: "add", Path: "/spec/queue", Value: DefaultQueue}
- }
- return nil
-}
-
-func patchDefaultScheduler(job *v1alpha1.Job) *patchOperation {
- // Add default scheduler name if not specified.
- if job.Spec.SchedulerName == "" {
- return &patchOperation{Op: "add", Path: "/spec/schedulerName", Value: defaultSchedulerName}
- }
- return nil
-}
-
-func patchDefaultMaxRetry(job *v1alpha1.Job) *patchOperation {
- // Add default maxRetry if maxRetry is zero.
- if job.Spec.MaxRetry == 0 {
- return &patchOperation{Op: "add", Path: "/spec/maxRetry", Value: DefaultMaxRetry}
- }
- return nil
-}
-
-func patchDefaultMinAvailable(job *v1alpha1.Job) *patchOperation {
- // Add default minAvailable if minAvailable is zero.
- if job.Spec.MinAvailable == 0 {
- var jobMinAvailable int32
- for _, task := range job.Spec.Tasks {
- if task.MinAvailable != nil {
- jobMinAvailable += *task.MinAvailable
- } else {
- jobMinAvailable += task.Replicas
- }
- }
-
- return &patchOperation{Op: "add", Path: "/spec/minAvailable", Value: jobMinAvailable}
- }
- return nil
-}
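-
-// Worked example (illustrative): for a job with two tasks, one with replicas=3 and no
-// minAvailable and one with replicas=2 and minAvailable=1, the computed job-level value
-// is 3 + 1 = 4, patched as {"op":"add","path":"/spec/minAvailable","value":4}.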
-
-func mutateSpec(tasks []v1alpha1.TaskSpec, basePath string) *patchOperation {
- patched := false
- for index := range tasks {
- // add default task name
- taskName := tasks[index].Name
- if len(taskName) == 0 {
- patched = true
- tasks[index].Name = v1alpha1.DefaultTaskSpec + strconv.Itoa(index)
- }
-
- if tasks[index].Template.Spec.HostNetwork && tasks[index].Template.Spec.DNSPolicy == "" {
- patched = true
- tasks[index].Template.Spec.DNSPolicy = v1.DNSClusterFirstWithHostNet
- }
-
- if tasks[index].MinAvailable == nil {
- patched = true
- minAvailable := tasks[index].Replicas
- tasks[index].MinAvailable = &minAvailable
- }
-
- if tasks[index].MaxRetry == 0 {
- patched = true
- tasks[index].MaxRetry = defaultMaxRetry
- }
- }
- if !patched {
- return nil
- }
- return &patchOperation{
- Op: "replace",
- Path: basePath,
- Value: tasks,
- }
-}
-
-
-
/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "context"
- "fmt"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- apieequality "k8s.io/apimachinery/pkg/api/equality"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/validation"
- "k8s.io/apimachinery/pkg/util/validation/field"
- "k8s.io/klog"
- k8score "k8s.io/kubernetes/pkg/apis/core"
- k8scorev1 "k8s.io/kubernetes/pkg/apis/core/v1"
- v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
- k8scorevalid "k8s.io/kubernetes/pkg/apis/core/validation"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- "volcano.sh/volcano/pkg/controllers/job/plugins"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/jobs/validate",
- Func: AdmitJobs,
-
- Config: config,
-
- ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
- Webhooks: []whv1beta1.ValidatingWebhook{{
- Name: "validatejob.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create, whv1beta1.Update},
- Rule: whv1beta1.Rule{
- APIGroups: []string{"batch.volcano.sh"},
- APIVersions: []string{"v1alpha1"},
- Resources: []string{"jobs"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// AdmitJobs admits jobs and returns the response.
-func AdmitJobs(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("admitting jobs -- %s", ar.Request.Operation)
-
- job, err := schema.DecodeJob(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
- var msg string
- reviewResponse := v1beta1.AdmissionResponse{}
- reviewResponse.Allowed = true
-
- switch ar.Request.Operation {
- case v1beta1.Create:
- msg = validateJobCreate(job, &reviewResponse)
- case v1beta1.Update:
- oldJob, err := schema.DecodeJob(ar.Request.OldObject, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
- err = validateJobUpdate(oldJob, job)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
- default:
- err := fmt.Errorf("expect operation to be 'CREATE' or 'UPDATE'")
- return util.ToAdmissionResponse(err)
- }
-
- if !reviewResponse.Allowed {
- reviewResponse.Result = &metav1.Status{Message: strings.TrimSpace(msg)}
- }
- return &reviewResponse
-}
-
-func validateJobCreate(job *v1alpha1.Job, reviewResponse *v1beta1.AdmissionResponse) string {
- var msg string
- taskNames := map[string]string{}
- var totalReplicas int32
-
- if job.Spec.MinAvailable < 0 {
- reviewResponse.Allowed = false
- return "job 'minAvailable' must be >= 0."
- }
-
- if job.Spec.MaxRetry < 0 {
- reviewResponse.Allowed = false
- return "'maxRetry' cannot be less than zero."
- }
-
- if job.Spec.TTLSecondsAfterFinished != nil && *job.Spec.TTLSecondsAfterFinished < 0 {
- reviewResponse.Allowed = false
- return "'ttlSecondsAfterFinished' cannot be less than zero."
- }
-
- if len(job.Spec.Tasks) == 0 {
- reviewResponse.Allowed = false
- return "No task specified in job spec"
- }
-
- for index, task := range job.Spec.Tasks {
- if task.Replicas < 0 {
- msg += fmt.Sprintf(" 'replicas' < 0 in task: %s;", task.Name)
- }
-
- if task.MinAvailable != nil && *task.MinAvailable > task.Replicas {
- msg += fmt.Sprintf(" 'minAvailable' is greater than 'replicas' in task: %s, job: %s", task.Name, job.Name)
- }
-
- // count replicas
- totalReplicas += task.Replicas
-
- // validate task name
- if errMsgs := validation.IsDNS1123Label(task.Name); len(errMsgs) > 0 {
- msg += fmt.Sprintf(" %v;", errMsgs)
- }
-
- // duplicate task name
- if _, found := taskNames[task.Name]; found {
- msg += fmt.Sprintf(" duplicated task name %s;", task.Name)
- break
- } else {
- taskNames[task.Name] = task.Name
- }
-
- if err := validatePolicies(task.Policies, field.NewPath("spec.tasks.policies")); err != nil {
- msg += err.Error() + fmt.Sprintf(" valid events are %v, valid actions are %v",
- getValidEvents(), getValidActions())
- }
- podName := jobhelpers.MakePodName(job.Name, task.Name, index)
- msg += validateK8sPodNameLength(podName)
- msg += validateTaskTemplate(task, job, index)
- }
-
- msg += validateJobName(job)
-
- if totalReplicas < job.Spec.MinAvailable {
- msg += "job 'minAvailable' should not be greater than total replicas in tasks;"
- }
-
- if err := validatePolicies(job.Spec.Policies, field.NewPath("spec.policies")); err != nil {
- msg = msg + err.Error() + fmt.Sprintf(" valid events are %v, valid actions are %v;",
- getValidEvents(), getValidActions())
- }
-
- // invalid job plugins
- if len(job.Spec.Plugins) != 0 {
- for name := range job.Spec.Plugins {
- if _, found := plugins.GetPluginBuilder(name); !found {
- msg += fmt.Sprintf(" unable to find job plugin: %s", name)
- }
- }
- }
-
- if err := validateIO(job.Spec.Volumes); err != nil {
- msg += err.Error()
- }
-
- queue, err := config.VolcanoClient.SchedulingV1beta1().Queues().Get(context.TODO(), job.Spec.Queue, metav1.GetOptions{})
- if err != nil {
- msg += fmt.Sprintf(" unable to find job queue: %v", err)
- } else if queue.Status.State != schedulingv1beta1.QueueStateOpen {
- msg += fmt.Sprintf("can only submit job to queue with state `Open`, "+
- "queue `%s` status is `%s`", queue.Name, queue.Status.State)
- }
-
- if msg != "" {
- reviewResponse.Allowed = false
- }
-
- return msg
-}
-
-func validateJobUpdate(old, new *v1alpha1.Job) error {
- var totalReplicas int32
- for _, task := range new.Spec.Tasks {
- if task.Replicas < 0 {
- return fmt.Errorf("'replicas' must be >= 0 in task: %s", task.Name)
- }
-
- if task.MinAvailable != nil && *task.MinAvailable > task.Replicas {
- return fmt.Errorf("'minAvailable' must be <= 'replicas' in task: %s;", task.Name)
- }
- // count replicas
- totalReplicas += task.Replicas
- }
- if new.Spec.MinAvailable > totalReplicas {
- return fmt.Errorf("job 'minAvailable' must not be greater than total replicas")
- }
- if new.Spec.MinAvailable < 0 {
- return fmt.Errorf("job 'minAvailable' must be >= 0")
- }
-
- if len(old.Spec.Tasks) != len(new.Spec.Tasks) {
- return fmt.Errorf("job updates may not add or remove tasks")
- }
-	// other fields under spec are not allowed to be mutated
- new.Spec.MinAvailable = old.Spec.MinAvailable
- new.Spec.PriorityClassName = old.Spec.PriorityClassName
- for i := range new.Spec.Tasks {
- new.Spec.Tasks[i].Replicas = old.Spec.Tasks[i].Replicas
- new.Spec.Tasks[i].MinAvailable = old.Spec.Tasks[i].MinAvailable
- }
-
- // job controller will update the pvc name if not provided
- for i := range new.Spec.Volumes {
- if new.Spec.Volumes[i].VolumeClaim != nil {
- new.Spec.Volumes[i].VolumeClaimName = ""
- }
- }
- for i := range old.Spec.Volumes {
- if old.Spec.Volumes[i].VolumeClaim != nil {
- old.Spec.Volumes[i].VolumeClaimName = ""
- }
- }
-
- if !apieequality.Semantic.DeepEqual(new.Spec, old.Spec) {
-		return fmt.Errorf("job updates may not change fields under spec other than `minAvailable` and `tasks[*].replicas`")
- }
-
- return nil
-}
-
-func validateTaskTemplate(task v1alpha1.TaskSpec, job *v1alpha1.Job, index int) string {
- var v1PodTemplate v1.PodTemplate
- v1PodTemplate.Template = *task.Template.DeepCopy()
- k8scorev1.SetObjectDefaults_PodTemplate(&v1PodTemplate)
-
- var coreTemplateSpec k8score.PodTemplateSpec
- k8scorev1.Convert_v1_PodTemplateSpec_To_core_PodTemplateSpec(&v1PodTemplate.Template, &coreTemplateSpec, nil)
-
-	// Skip verifying container SecurityContext.Privileged as it depends on
-	// the kube-apiserver `allow-privileged` flag.
- for i, container := range coreTemplateSpec.Spec.Containers {
- if container.SecurityContext != nil && container.SecurityContext.Privileged != nil {
- coreTemplateSpec.Spec.Containers[i].SecurityContext.Privileged = nil
- }
- }
-
- corePodTemplate := k8score.PodTemplate{
- ObjectMeta: metav1.ObjectMeta{
- Name: task.Name,
- Namespace: job.Namespace,
- },
- Template: coreTemplateSpec,
- }
-
- if allErrs := k8scorevalid.ValidatePodTemplate(&corePodTemplate); len(allErrs) > 0 {
- msg := fmt.Sprintf("spec.task[%d].", index)
- for index := range allErrs {
- msg += allErrs[index].Error() + ". "
- }
- return msg
- }
-
- msg := validateTaskTopoPolicy(task, index)
- if msg != "" {
- return msg
- }
-
- return ""
-}
-
-func validateK8sPodNameLength(podName string) string {
- if errMsgs := validation.IsQualifiedName(podName); len(errMsgs) > 0 {
-		return fmt.Sprintf("validation of pod name %s failed: %v;", podName, errMsgs)
- }
- return ""
-}
-
-func validateJobName(job *v1alpha1.Job) string {
- if errMsgs := validation.IsQualifiedName(job.Name); len(errMsgs) > 0 {
-		return fmt.Sprintf("validation of job name %s failed: %v", job.Name, errMsgs)
- }
- return ""
-}
-
-func validateTaskTopoPolicy(task v1alpha1.TaskSpec, index int) string {
- if task.TopologyPolicy == "" || task.TopologyPolicy == v1alpha1.None {
- return ""
- }
-
- template := task.Template.DeepCopy()
-
- for id, container := range template.Spec.Containers {
- if len(container.Resources.Requests) == 0 {
- template.Spec.Containers[id].Resources.Requests = container.Resources.Limits.DeepCopy()
- }
- }
-
- for id, container := range template.Spec.InitContainers {
- if len(container.Resources.Requests) == 0 {
- template.Spec.InitContainers[id].Resources.Requests = container.Resources.Limits.DeepCopy()
- }
- }
-
- pod := &v1.Pod{
- Spec: template.Spec,
- }
-
- if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
- return fmt.Sprintf("spec.task[%d] isn't Guaranteed pod, kind=%v", index, v1qos.GetPodQOS(pod))
- }
-
- for id, container := range append(template.Spec.Containers, template.Spec.InitContainers...) {
- requestNum := guaranteedCPUs(container)
- if requestNum == 0 {
- return fmt.Sprintf("the cpu request isn't an integer in spec.task[%d] container[%d].",
- index, id)
- }
- }
-
- return ""
-}
-
-func guaranteedCPUs(container v1.Container) int {
- cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
- if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
- return 0
- }
-
- return int(cpuQuantity.Value())
-}
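-
-// Worked example (illustrative): a CPU request of "2" yields guaranteedCPUs == 2, while
-// "1500m" is not a whole number of CPUs (Value()*1000 != MilliValue()), so the function
-// returns 0 and validateTaskTopoPolicy rejects the task.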
-
-
-
/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "fmt"
-
- "github.com/hashicorp/go-multierror"
-
- "k8s.io/apimachinery/pkg/util/validation/field"
- "k8s.io/kubernetes/pkg/apis/core/validation"
-
- batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
-)
-
-// policyEventMap defines all policy events and whether to allow external use.
-var policyEventMap = map[busv1alpha1.Event]bool{
- busv1alpha1.AnyEvent: true,
- busv1alpha1.PodFailedEvent: true,
- busv1alpha1.PodEvictedEvent: true,
- busv1alpha1.JobUnknownEvent: true,
- busv1alpha1.TaskCompletedEvent: true,
- busv1alpha1.TaskFailedEvent: true,
- busv1alpha1.OutOfSyncEvent: false,
- busv1alpha1.CommandIssuedEvent: false,
- busv1alpha1.JobUpdatedEvent: true,
-}
-
-// policyActionMap defines all policy actions and whether to allow external use.
-var policyActionMap = map[busv1alpha1.Action]bool{
- busv1alpha1.AbortJobAction: true,
- busv1alpha1.RestartJobAction: true,
- busv1alpha1.RestartTaskAction: true,
- busv1alpha1.TerminateJobAction: true,
- busv1alpha1.CompleteJobAction: true,
- busv1alpha1.ResumeJobAction: true,
- busv1alpha1.SyncJobAction: false,
- busv1alpha1.EnqueueAction: false,
- busv1alpha1.SyncQueueAction: false,
- busv1alpha1.OpenQueueAction: false,
- busv1alpha1.CloseQueueAction: false,
-}
-
-func validatePolicies(policies []batchv1alpha1.LifecyclePolicy, fldPath *field.Path) error {
- var err error
- policyEvents := map[busv1alpha1.Event]struct{}{}
- exitCodes := map[int32]struct{}{}
-
- for _, policy := range policies {
- if (policy.Event != "" || len(policy.Events) != 0) && policy.ExitCode != nil {
- err = multierror.Append(err, fmt.Errorf("must not specify event and exitCode simultaneously"))
- break
- }
-
- if policy.Event == "" && len(policy.Events) == 0 && policy.ExitCode == nil {
-			err = multierror.Append(err, fmt.Errorf("either event or exitCode should be specified"))
- break
- }
-
- if len(policy.Event) != 0 || len(policy.Events) != 0 {
- bFlag := false
- policyEventsList := getEventList(policy)
- for _, event := range policyEventsList {
- if allow, ok := policyEventMap[event]; !ok || !allow {
- err = multierror.Append(err, field.Invalid(fldPath, event, "invalid policy event"))
- bFlag = true
- break
- }
-
- if allow, ok := policyActionMap[policy.Action]; !ok || !allow {
- err = multierror.Append(err, field.Invalid(fldPath, policy.Action, "invalid policy action"))
- bFlag = true
- break
- }
- if _, found := policyEvents[event]; found {
- err = multierror.Append(err, fmt.Errorf("duplicate event %v across different policy", event))
- bFlag = true
- break
- } else {
- policyEvents[event] = struct{}{}
- }
- }
- if bFlag {
- break
- }
- } else {
- if *policy.ExitCode == 0 {
- err = multierror.Append(err, fmt.Errorf("0 is not a valid error code"))
- break
- }
- if _, found := exitCodes[*policy.ExitCode]; found {
- err = multierror.Append(err, fmt.Errorf("duplicate exitCode %v", *policy.ExitCode))
- break
- } else {
- exitCodes[*policy.ExitCode] = struct{}{}
- }
- }
- }
-
- if _, found := policyEvents[busv1alpha1.AnyEvent]; found && len(policyEvents) > 1 {
-		err = multierror.Append(err, fmt.Errorf("if the wildcard event '*' is specified, no other event policies may be specified"))
- }
-
- return err
-}
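-
-// Illustrative example (not part of the original file): a LifecyclePolicy that sets both
-// an Event (e.g. PodFailedEvent) and a non-nil ExitCode is rejected with "must not specify
-// event and exitCode simultaneously", and listing the same event in two policies is
-// rejected as a duplicate.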
-
-func getEventList(policy batchv1alpha1.LifecyclePolicy) []busv1alpha1.Event {
- policyEventsList := policy.Events
- if len(policy.Event) > 0 {
- policyEventsList = append(policyEventsList, policy.Event)
- }
- uniquePolicyEventlist := removeDuplicates(policyEventsList)
- return uniquePolicyEventlist
-}
-
-func removeDuplicates(eventList []busv1alpha1.Event) []busv1alpha1.Event {
- keys := make(map[busv1alpha1.Event]bool)
- list := []busv1alpha1.Event{}
- for _, val := range eventList {
- if _, value := keys[val]; !value {
- keys[val] = true
- list = append(list, val)
- }
- }
- return list
-}
-
-func getValidEvents() []busv1alpha1.Event {
- var events []busv1alpha1.Event
- for e, allow := range policyEventMap {
- if allow {
- events = append(events, e)
- }
- }
-
- return events
-}
-
-func getValidActions() []busv1alpha1.Action {
- var actions []busv1alpha1.Action
- for a, allow := range policyActionMap {
- if allow {
- actions = append(actions, a)
- }
- }
-
- return actions
-}
-
-// validateIO validates IO configuration.
-func validateIO(volumes []batchv1alpha1.VolumeSpec) error {
- volumeMap := map[string]bool{}
- for _, volume := range volumes {
- if len(volume.MountPath) == 0 {
- return fmt.Errorf(" mountPath is required;")
- }
- if _, found := volumeMap[volume.MountPath]; found {
- return fmt.Errorf(" duplicated mountPath: %s;", volume.MountPath)
- }
- if volume.VolumeClaim == nil && volume.VolumeClaimName == "" {
- return fmt.Errorf(" either VolumeClaim or VolumeClaimName must be specified;")
- }
- if len(volume.VolumeClaimName) != 0 {
- if volume.VolumeClaim != nil {
-				return fmt.Errorf("conflict: If you want to use an existing PVC, just specify VolumeClaimName. " +
-					"If you want to create a new PVC, you do not need to specify VolumeClaimName")
- }
- if errMsgs := validation.ValidatePersistentVolumeName(volume.VolumeClaimName, false); len(errMsgs) > 0 {
- return fmt.Errorf("invalid VolumeClaimName %s : %v", volume.VolumeClaimName, errMsgs)
- }
- }
-
- volumeMap[volume.MountPath] = true
- }
- return nil
-}
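-
-// Illustrative example (not part of the original file): a VolumeSpec with mountPath
-// "/data" and only VolumeClaimName "my-pvc" passes validation, while a spec that sets
-// both VolumeClaim and VolumeClaimName, or reuses a mountPath, is rejected; the names
-// here are assumptions for demonstration.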
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "github.com/imdario/mergo"
- "gopkg.in/yaml.v2"
- "k8s.io/klog"
-
- v1 "k8s.io/api/core/v1"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
-)
-
-type annotationResGroup struct{}
-
-const (
- // defaultAnnotationKey: default annotation key
- defaultAnnotationKey = "volcano.sh/resource-group"
-)
-
-// NewAnnotationResGroup creates a new annotation-based resource group
-func NewAnnotationResGroup() ResGroup {
- return &annotationResGroup{}
-}
-
-// getAnnotation gets the annotations from the resource group config
-func getAnnotation(resGroupConfig wkconfig.ResGroupConfig) map[string]string {
- annotations := make(map[string]string)
- for _, val := range resGroupConfig.Object.Value {
- tmp := make(map[string]string)
- err := yaml.Unmarshal([]byte(val), &tmp)
- if err != nil {
- continue
- }
-
- if err := mergo.Merge(&annotations, &tmp); err != nil {
- klog.Errorf("annotations merge failed, err=%v", err)
- continue
- }
- }
-
- return annotations
-}
-
-// IsBelongResGroup reports whether the pod belongs to the resource group
-func (resGroup *annotationResGroup) IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool {
- if resGroupConfig.Object.Key != "" && resGroupConfig.Object.Key != "annotation" {
- return false
- }
-
- annotations := getAnnotation(resGroupConfig)
- klog.V(3).Infof("annotations : %v", annotations)
- for key, annotation := range annotations {
- if pod.Annotations[key] == annotation {
- return true
- }
- }
-
- if resGroupConfig.Object.Key == "" && pod.Annotations[defaultAnnotationKey] == resGroupConfig.ResourceGroup {
- return true
- }
-
- return false
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- v1 "k8s.io/api/core/v1"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
-)
-
-// ResGroup is the interface for resource groups
-type ResGroup interface {
- IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool
-}
-
-// GetResGroup returns the ResGroup implementation based on resourceGroup.Object.Key
-func GetResGroup(resourceGroup wkconfig.ResGroupConfig) ResGroup {
- switch resourceGroup.Object.Key {
- case "namespace":
- return NewNamespaceResGroup()
- case "annotation":
- return NewAnnotationResGroup()
- }
- return NewAnnotationResGroup()
-}
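-
-// Illustrative example (not part of the original file): a ResGroupConfig whose Object.Key
-// is "namespace" is handled by namespaceResGroup, while "annotation" or an empty key
-// falls back to the annotation-based matcher defined in this package.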
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "encoding/json"
- "fmt"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-// patchOperation defines the patch operation structure
-type patchOperation struct {
- Op string `json:"op"`
- Path string `json:"path"`
- Value interface{} `json:"value,omitempty"`
-}
-
-// init registers the pod mutating webhook
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/pods/mutate",
- Func: Pods,
- Config: config,
- MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
- Webhooks: []whv1beta1.MutatingWebhook{{
- Name: "mutatepod.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{""},
- APIVersions: []string{"v1"},
- Resources: []string{"pods"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// Pods mutates pods.
-func Pods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("mutating pods -- %s", ar.Request.Operation)
- pod, err := schema.DecodePod(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- if pod.Namespace == "" {
- pod.Namespace = ar.Request.Namespace
- }
-
- var patchBytes []byte
- switch ar.Request.Operation {
- case v1beta1.Create:
- patchBytes, _ = createPatch(pod)
- default:
- err = fmt.Errorf("expect operation to be 'CREATE' ")
- return util.ToAdmissionResponse(err)
- }
-
- reviewResponse := v1beta1.AdmissionResponse{
- Allowed: true,
- Patch: patchBytes,
- }
- pt := v1beta1.PatchTypeJSONPatch
- reviewResponse.PatchType = &pt
-
- return &reviewResponse
-}
-
-// createPatch builds the JSON patch for the pod
-func createPatch(pod *v1.Pod) ([]byte, error) {
- if config.ConfigData == nil {
- klog.V(5).Infof("admission configuration is empty.")
- return nil, nil
- }
-
- var patch []patchOperation
- config.ConfigData.Lock()
- defer config.ConfigData.Unlock()
-
- for _, resourceGroup := range config.ConfigData.ResGroupsConfig {
- klog.V(3).Infof("resourceGroup %s", resourceGroup.ResourceGroup)
- group := GetResGroup(resourceGroup)
- if !group.IsBelongResGroup(pod, resourceGroup) {
- continue
- }
-
- patchLabel := patchLabels(pod, resourceGroup)
- if patchLabel != nil {
- patch = append(patch, *patchLabel)
- }
-
- patchToleration := patchTaintToleration(pod, resourceGroup)
- if patchToleration != nil {
- patch = append(patch, *patchToleration)
- }
- patchScheduler := patchSchedulerName(resourceGroup)
- if patchScheduler != nil {
- patch = append(patch, *patchScheduler)
- }
-
- klog.V(5).Infof("pod patch %v", patch)
- return json.Marshal(patch)
- }
-
- return json.Marshal(patch)
-}
-
-// patchLabels patches the pod's node selector with the resource group labels
-func patchLabels(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
- if len(resGroupConfig.Labels) == 0 {
- return nil
- }
-
- nodeSelector := make(map[string]string)
- for key, label := range pod.Spec.NodeSelector {
- nodeSelector[key] = label
- }
-
- for key, label := range resGroupConfig.Labels {
- nodeSelector[key] = label
- }
-
- return &patchOperation{Op: "add", Path: "/spec/nodeSelector", Value: nodeSelector}
-}
-
-// patchTaintToleration appends the resource group tolerations to the pod
-func patchTaintToleration(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
- if len(resGroupConfig.Tolerations) == 0 {
- return nil
- }
-
- var dst []v1.Toleration
- dst = append(dst, pod.Spec.Tolerations...)
- dst = append(dst, resGroupConfig.Tolerations...)
-
- return &patchOperation{Op: "add", Path: "/spec/tolerations", Value: dst}
-}
-
-// patchSchedulerName patches the scheduler name
-func patchSchedulerName(resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
- if resGroupConfig.SchedulerName == "" {
- return nil
- }
-
- return &patchOperation{Op: "add", Path: "/spec/schedulerName", Value: resGroupConfig.SchedulerName}
-}
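-
-// Illustrative example (not part of the original file): for a resource group that defines
-// Labels {"accelerator": "gpu"}, one toleration and SchedulerName "volcano", createPatch
-// emits three operations adding /spec/nodeSelector, /spec/tolerations and
-// /spec/schedulerName to the matching pod; the concrete values are assumptions chosen for
-// demonstration.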
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- v1 "k8s.io/api/core/v1"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
-)
-
-type namespaceResGroup struct{}
-
-// NewNamespaceResGroup creates a new namespace-based resource group
-func NewNamespaceResGroup() ResGroup {
- return &namespaceResGroup{}
-}
-
-// IsBelongResGroup reports whether the pod belongs to the resource group
-func (resGroup *namespaceResGroup) IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool {
- if resGroupConfig.Object.Key != "namespace" {
- return false
- }
-
- for _, val := range resGroupConfig.Object.Value {
- if pod.Namespace == val {
- return true
- }
- }
-
- return false
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "context"
- "fmt"
- "strconv"
-
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/intstr"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/helpers"
- vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/pods/validate",
- Func: AdmitPods,
-
- Config: config,
-
- ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
- Webhooks: []whv1beta1.ValidatingWebhook{{
- Name: "validatepod.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{""},
- APIVersions: []string{"v1"},
- Resources: []string{"pods"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// AdmitPods admits pods and returns the response.
-func AdmitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("admitting pods -- %s", ar.Request.Operation)
-
- pod, err := schema.DecodePod(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- var msg string
- reviewResponse := v1beta1.AdmissionResponse{}
- reviewResponse.Allowed = true
-
- switch ar.Request.Operation {
- case v1beta1.Create:
- msg = validatePod(pod, &reviewResponse)
- default:
- err := fmt.Errorf("expect operation to be 'CREATE'")
- return util.ToAdmissionResponse(err)
- }
-
- if !reviewResponse.Allowed {
- reviewResponse.Result = &metav1.Status{Message: strings.TrimSpace(msg)}
- }
- return &reviewResponse
-}
-
-/*
-allow pods to be created when
-1. the pod's schedulerName isn't volcano
-2. the pod has a PodGroup whose phase isn't Pending
-3. a normal pod whose schedulerName is volcano has no PodGroup
-4. the pod's budget annotations are correctly configured
-*/
-func validatePod(pod *v1.Pod, reviewResponse *v1beta1.AdmissionResponse) string {
- if pod.Spec.SchedulerName != config.SchedulerName {
- return ""
- }
-
- pgName := ""
- msg := ""
-
- // vc-job, SN == volcano
- if pod.Annotations != nil {
- pgName = pod.Annotations[vcv1beta1.KubeGroupNameAnnotationKey]
- }
- if pgName != "" {
- if err := checkPGPhase(pod, pgName, true); err != nil {
- msg = err.Error()
- reviewResponse.Allowed = false
- }
- return msg
- }
-
- // normal pod, SN == volcano
- pgName = helpers.GeneratePodgroupName(pod)
- if err := checkPGPhase(pod, pgName, false); err != nil {
- msg = err.Error()
- reviewResponse.Allowed = false
- }
-
-	// check pod annotations
- if err := validateAnnotation(pod); err != nil {
- msg = err.Error()
- reviewResponse.Allowed = false
- }
-
- return msg
-}
-
-func checkPGPhase(pod *v1.Pod, pgName string, isVCJob bool) error {
- pg, err := config.VolcanoClient.SchedulingV1beta1().PodGroups(pod.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{})
- if err != nil {
- if isVCJob || (!isVCJob && !apierrors.IsNotFound(err)) {
- return fmt.Errorf("failed to get PodGroup for pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
- }
- return nil
- }
- if pg.Status.Phase != vcv1beta1.PodGroupPending {
- return nil
- }
- return fmt.Errorf("failed to create pod <%s/%s> as the podgroup phase is Pending",
- pod.Namespace, pod.Name)
-}
-
-func validateAnnotation(pod *v1.Pod) error {
- num := 0
- if len(pod.Annotations) > 0 {
- keys := []string{
- vcv1beta1.JDBMinAvailable,
- vcv1beta1.JDBMaxUnavailable,
- }
- for _, key := range keys {
- if value, found := pod.Annotations[key]; found {
- num++
- if err := validateIntPercentageStr(key, value); err != nil {
- recordEvent(err)
- return err
- }
- }
- }
- if num > 1 {
-			return fmt.Errorf("configuring multiple annotations <%v> at the same time is not allowed", keys)
- }
- }
- return nil
-}
-
-func recordEvent(err error) {
- config.Recorder.Eventf(nil, v1.EventTypeWarning, "Admit", "Create pod failed due to %v", err)
-}
-
-func validateIntPercentageStr(key, value string) error {
- tmp := intstr.Parse(value)
- switch tmp.Type {
- case intstr.Int:
- if tmp.IntValue() <= 0 {
- return fmt.Errorf("invalid value <%q> for %v, it must be a positive integer", value, key)
- }
- return nil
- case intstr.String:
- s := strings.Replace(tmp.StrVal, "%", "", -1)
- v, err := strconv.Atoi(s)
- if err != nil {
- return fmt.Errorf("invalid value %v for %v", err, key)
- }
- if v <= 0 || v >= 100 {
- return fmt.Errorf("invalid value <%q> for %v, it must be a valid percentage which between 1%% ~ 99%%", tmp.StrVal, key)
- }
- return nil
- }
- return fmt.Errorf("invalid type: neither int nor percentage for %v", key)
-}
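-
-// Worked examples (illustrative): "2" and "30%" are accepted, while "0", "-1", "0%" and
-// "100%" are rejected, because integers must be positive and percentages must fall
-// strictly between 1% and 99%.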
-
-
-
/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "encoding/json"
- "fmt"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/klog"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/queues/mutate",
- Func: Queues,
-
- MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
- Webhooks: []whv1beta1.MutatingWebhook{{
- Name: "mutatequeue.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{schedulingv1beta1.SchemeGroupVersion.Group},
- APIVersions: []string{schedulingv1beta1.SchemeGroupVersion.Version},
- Resources: []string{"queues"},
- },
- },
- },
- }},
- },
-}
-
-type patchOperation struct {
- Op string `json:"op"`
- Path string `json:"path"`
- Value interface{} `json:"value,omitempty"`
-}
-
-// Queues mutate queues.
-func Queues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("Mutating %s queue %s.", ar.Request.Operation, ar.Request.Name)
-
- queue, err := schema.DecodeQueue(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- var patchBytes []byte
- switch ar.Request.Operation {
- case v1beta1.Create:
- patchBytes, err = createQueuePatch(queue)
- default:
- return util.ToAdmissionResponse(fmt.Errorf("invalid operation `%s`, "+
- "expect operation to be `CREATE`", ar.Request.Operation))
- }
-
- if err != nil {
- return &v1beta1.AdmissionResponse{
- Allowed: false,
- Result: &metav1.Status{Message: err.Error()},
- }
- }
-
- pt := v1beta1.PatchTypeJSONPatch
- return &v1beta1.AdmissionResponse{
- Allowed: true,
- Patch: patchBytes,
- PatchType: &pt,
- }
-}
-
-func createQueuePatch(queue *schedulingv1beta1.Queue) ([]byte, error) {
- var patch []patchOperation
-
-	// add the root node if the root node is not specified
- hierarchy := queue.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
- hierarchicalWeights := queue.Annotations[schedulingv1beta1.KubeHierarchyWeightAnnotationKey]
-
- if hierarchy != "" && hierarchicalWeights != "" && !strings.HasPrefix(hierarchy, "root") {
- // based on https://tools.ietf.org/html/rfc6901#section-3
- // escape "/" with "~1"
- patch = append(patch, patchOperation{
- Op: "add",
- Path: fmt.Sprintf("/metadata/annotations/%s", strings.ReplaceAll(schedulingv1beta1.KubeHierarchyAnnotationKey, "/", "~1")),
- Value: fmt.Sprintf("root/%s", hierarchy),
- })
- patch = append(patch, patchOperation{
- Op: "add",
- Path: fmt.Sprintf("/metadata/annotations/%s", strings.ReplaceAll(schedulingv1beta1.KubeHierarchyWeightAnnotationKey, "/", "~1")),
- Value: fmt.Sprintf("1/%s", hierarchicalWeights),
- })
- }
-
- trueValue := true
- if queue.Spec.Reclaimable == nil {
- patch = append(patch, patchOperation{
- Op: "add",
- Path: "/spec/reclaimable",
- Value: &trueValue,
- })
- }
-
- defaultWeight := 1
- if queue.Spec.Weight == 0 {
- patch = append(patch, patchOperation{
- Op: "add",
- Path: "/spec/weight",
- Value: &defaultWeight,
- })
- }
-
- return json.Marshal(patch)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "context"
- "fmt"
- "strconv"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/validation/field"
- "k8s.io/klog"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/queues/validate",
- Func: AdmitQueues,
-
- Config: config,
-
- ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
- Webhooks: []whv1beta1.ValidatingWebhook{{
- Name: "validatequeue.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create, whv1beta1.Update, whv1beta1.Delete},
- Rule: whv1beta1.Rule{
- APIGroups: []string{schedulingv1beta1.SchemeGroupVersion.Group},
- APIVersions: []string{schedulingv1beta1.SchemeGroupVersion.Version},
- Resources: []string{"queues"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// AdmitQueues is to admit queues and return response.
-func AdmitQueues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("Admitting %s queue %s.", ar.Request.Operation, ar.Request.Name)
-
- queue, err := schema.DecodeQueue(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- switch ar.Request.Operation {
- case v1beta1.Create, v1beta1.Update:
- err = validateQueue(queue)
- case v1beta1.Delete:
- err = validateQueueDeleting(ar.Request.Name)
- default:
- return util.ToAdmissionResponse(fmt.Errorf("invalid operation `%s`, "+
- "expect operation to be `CREATE`, `UPDATE` or `DELETE`", ar.Request.Operation))
- }
-
- if err != nil {
- return &v1beta1.AdmissionResponse{
- Allowed: false,
- Result: &metav1.Status{Message: err.Error()},
- }
- }
-
- return &v1beta1.AdmissionResponse{
- Allowed: true,
- }
-}
-
-func validateQueue(queue *schedulingv1beta1.Queue) error {
- errs := field.ErrorList{}
- resourcePath := field.NewPath("requestBody")
-
- errs = append(errs, validateStateOfQueue(queue.Status.State, resourcePath.Child("spec").Child("state"))...)
- errs = append(errs, validateWeightOfQueue(queue.Spec.Weight, resourcePath.Child("spec").Child("weight"))...)
- errs = append(errs, validateHierarchicalAttributes(queue, resourcePath.Child("metadata").Child("annotations"))...)
-
- if len(errs) > 0 {
- return errs.ToAggregate()
- }
-
- return nil
-}
-func validateHierarchicalAttributes(queue *schedulingv1beta1.Queue, fldPath *field.Path) field.ErrorList {
- errs := field.ErrorList{}
- hierarchy := queue.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
- hierarchicalWeights := queue.Annotations[schedulingv1beta1.KubeHierarchyWeightAnnotationKey]
- if hierarchy != "" || hierarchicalWeights != "" {
- paths := strings.Split(hierarchy, "/")
- weights := strings.Split(hierarchicalWeights, "/")
- // path length must be the same with weights length
- if len(paths) != len(weights) {
- return append(errs, field.Invalid(fldPath, hierarchy,
- fmt.Sprintf("%s must have the same length with %s",
- schedulingv1beta1.KubeHierarchyAnnotationKey,
- schedulingv1beta1.KubeHierarchyWeightAnnotationKey,
- )))
- }
-
- // check weights format
- for _, weight := range weights {
- weightFloat, err := strconv.ParseFloat(weight, 64)
- if err != nil {
- return append(errs, field.Invalid(fldPath, hierarchicalWeights,
- fmt.Sprintf("%s in the %s is invalid number: %v",
- weight, hierarchicalWeights, err,
- )))
- }
- if weightFloat <= 0 {
- return append(errs, field.Invalid(fldPath, hierarchicalWeights,
- fmt.Sprintf("%s in the %s must be larger than 0",
- weight, hierarchicalWeights,
- )))
- }
- }
-
- // The node is not allowed to be in the sub path of a node.
- // For example, a queue with "root/sci" conflicts with a queue with "root/sci/dev"
- queueList, err := config.VolcanoClient.SchedulingV1beta1().Queues().List(context.TODO(), metav1.ListOptions{})
- if err != nil {
- return append(errs, field.Invalid(fldPath, hierarchy,
- fmt.Sprintf("checking %s, list queues failed: %v",
- schedulingv1beta1.KubeHierarchyAnnotationKey,
- err,
- )))
- }
- for _, queueInTree := range queueList.Items {
- hierarchyInTree := queueInTree.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
- if hierarchyInTree != "" && queue.Name != queueInTree.Name &&
- strings.HasPrefix(hierarchyInTree, hierarchy) {
- return append(errs, field.Invalid(fldPath, hierarchy,
- fmt.Sprintf("%s is not allowed to be in the sub path of %s of queue %s",
- hierarchy, hierarchyInTree, queueInTree.Name)))
- }
- }
- }
- return errs
-}
-
-func validateStateOfQueue(value schedulingv1beta1.QueueState, fldPath *field.Path) field.ErrorList {
- errs := field.ErrorList{}
-
- if len(value) == 0 {
- return errs
- }
-
- validQueueStates := []schedulingv1beta1.QueueState{
- schedulingv1beta1.QueueStateOpen,
- schedulingv1beta1.QueueStateClosed,
- }
-
- for _, validQueue := range validQueueStates {
- if value == validQueue {
- return errs
- }
- }
-
- return append(errs, field.Invalid(fldPath, value, fmt.Sprintf("queue state must be in %v", validQueueStates)))
-}
-
-func validateWeightOfQueue(value int32, fldPath *field.Path) field.ErrorList {
- errs := field.ErrorList{}
- if value > 0 {
- return errs
- }
- return append(errs, field.Invalid(fldPath, value, "queue weight must be a positive integer"))
-}
-
-func validateQueueDeleting(queue string) error {
- if queue == "default" {
- return fmt.Errorf("`%s` queue can not be deleted", "default")
- }
-
- q, err := config.VolcanoClient.SchedulingV1beta1().Queues().Get(context.TODO(), queue, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- if q.Status.State != schedulingv1beta1.QueueStateClosed {
- return fmt.Errorf("only queue with state `%s` can be deleted, queue `%s` state is `%s`",
- schedulingv1beta1.QueueStateClosed, q.Name, q.Status.State)
- }
-
- return nil
-}
-
-
-
-
-
-
diff --git a/docs/ut_coverage/UT_coverage_v1.5.0.html b/docs/ut_coverage/UT_coverage_v1.5.0.html
deleted file mode 100644
index 1eaaa796fe..0000000000
--- a/docs/ut_coverage/UT_coverage_v1.5.0.html
+++ /dev/null
@@ -1,21819 +0,0 @@
-
-
-
-
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "os"
- "path/filepath"
-
- "github.com/spf13/cobra"
-)
-
-type commonFlags struct {
- Master string
- Kubeconfig string
-}
-
-func initFlags(cmd *cobra.Command, cf *commonFlags) {
- cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
-
- kubeConfFile := os.Getenv("KUBECONFIG")
- if kubeConfFile == "" {
- if home := homeDir(); home != "" {
- kubeConfFile = filepath.Join(home, ".kube", "config")
- }
- }
- cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type deleteFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-var deleteJobFlags = &deleteFlags{}
-
-// InitDeleteFlags init the delete command flags.
-func InitDeleteFlags(cmd *cobra.Command) {
- initFlags(cmd, &deleteJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&deleteJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&deleteJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// DeleteJob delete the job.
-func DeleteJob() error {
- config, err := util.BuildConfig(deleteJobFlags.Master, deleteJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if deleteJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to delete a particular job")
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- err = jobClient.BatchV1alpha1().Jobs(deleteJobFlags.Namespace).Delete(context.TODO(), deleteJobFlags.JobName, metav1.DeleteOptions{})
- if err != nil {
- return err
- }
- fmt.Printf("delete job %v successfully\n", deleteJobFlags.JobName)
- return nil
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "io"
- "os"
- "strings"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type listFlags struct {
- commonFlags
-
- Namespace string
- SchedulerName string
- allNamespace bool
- selector string
-}
-
-const (
-
-	// Name and the other keywords below are used in the job print format
- Name string = "Name"
- // Creation create
- Creation string = "Creation"
- // Phase phase
- Phase string = "Phase"
- // Replicas replicas
- Replicas string = "Replicas"
- // Min minimum
- Min string = "Min"
- // Scheduler scheduler
- Scheduler string = "Scheduler"
- // Pending pending
- Pending string = "Pending"
- // Running running
- Running string = "Running"
- // Succeeded success
- Succeeded string = "Succeeded"
- // Terminating terminating
- Terminating string = "Terminating"
- // Version version
- Version string = "Version"
- // Failed failed
- Failed string = "Failed"
- // Unknown pod
- Unknown string = "Unknown"
- // RetryCount retry count
- RetryCount string = "RetryCount"
- // JobType job type
- JobType string = "JobType"
- // Namespace job namespace
- Namespace string = "Namespace"
-)
-
-var listJobFlags = &listFlags{}
-
-// InitListFlags init list command flags.
-func InitListFlags(cmd *cobra.Command) {
- initFlags(cmd, &listJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&listJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&listJobFlags.SchedulerName, "scheduler", "S", "", "list job with specified scheduler name")
- cmd.Flags().BoolVarP(&listJobFlags.allNamespace, "all-namespaces", "", false, "list jobs in all namespaces")
- cmd.Flags().StringVarP(&listJobFlags.selector, "selector", "", "", "fuzzy matching jobName")
-}
-
-// ListJobs lists all jobs details.
-func ListJobs() error {
- config, err := util.BuildConfig(listJobFlags.Master, listJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if listJobFlags.allNamespace {
- listJobFlags.Namespace = ""
- }
- jobClient := versioned.NewForConfigOrDie(config)
- jobs, err := jobClient.BatchV1alpha1().Jobs(listJobFlags.Namespace).List(context.TODO(), metav1.ListOptions{})
- if err != nil {
- return err
- }
-
- if len(jobs.Items) == 0 {
- fmt.Printf("No resources found\n")
- return nil
- }
- PrintJobs(jobs, os.Stdout)
-
- return nil
-}
-
-// PrintJobs prints all jobs details.
-func PrintJobs(jobs *v1alpha1.JobList, writer io.Writer) {
- maxLenInfo := getMaxLen(jobs)
-
- titleFormat := "%%-%ds%%-15s%%-12s%%-12s%%-12s%%-6s%%-10s%%-10s%%-12s%%-10s%%-12s%%-10s\n"
- contentFormat := "%%-%ds%%-15s%%-12s%%-12s%%-12d%%-6d%%-10d%%-10d%%-12d%%-10d%%-12d%%-10d\n"
-
- var err error
- if listJobFlags.allNamespace {
- _, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds"+titleFormat, maxLenInfo[1], maxLenInfo[0]),
- Namespace, Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
- } else {
- _, err = fmt.Fprintf(writer, fmt.Sprintf(titleFormat, maxLenInfo[0]),
- Name, Creation, Phase, JobType, Replicas, Min, Pending, Running, Succeeded, Failed, Unknown, RetryCount)
- }
- if err != nil {
- fmt.Printf("Failed to print list command result: %s.\n", err)
- }
-
- for _, job := range jobs.Items {
- if listJobFlags.SchedulerName != "" && listJobFlags.SchedulerName != job.Spec.SchedulerName {
- continue
- }
- if !strings.Contains(job.Name, listJobFlags.selector) {
- continue
- }
- replicas := int32(0)
- for _, ts := range job.Spec.Tasks {
- replicas += ts.Replicas
- }
- jobType := job.ObjectMeta.Labels[v1alpha1.JobTypeKey]
- if jobType == "" {
- jobType = "Batch"
- }
-
- if listJobFlags.allNamespace {
- _, err = fmt.Fprintf(writer, fmt.Sprintf("%%-%ds"+contentFormat, maxLenInfo[1], maxLenInfo[0]),
- job.Namespace, job.Name, job.CreationTimestamp.Format("2006-01-02"), job.Status.State.Phase, jobType, replicas,
- job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
- } else {
- _, err = fmt.Fprintf(writer, fmt.Sprintf(contentFormat, maxLenInfo[0]),
- job.Name, job.CreationTimestamp.Format("2006-01-02"), job.Status.State.Phase, jobType, replicas,
- job.Status.MinAvailable, job.Status.Pending, job.Status.Running, job.Status.Succeeded, job.Status.Failed, job.Status.Unknown, job.Status.RetryCount)
- }
- if err != nil {
- fmt.Printf("Failed to print list command result: %s.\n", err)
- }
- }
-}
-
-func getMaxLen(jobs *v1alpha1.JobList) []int {
- maxNameLen := len(Name)
- maxNamespaceLen := len(Namespace)
- for _, job := range jobs.Items {
- if len(job.Name) > maxNameLen {
- maxNameLen = len(job.Name)
- }
- if len(job.Namespace) > maxNamespaceLen {
- maxNamespaceLen = len(job.Namespace)
- }
- }
-
- return []int{maxNameLen + 3, maxNamespaceLen + 3}
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type resumeFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-var resumeJobFlags = &resumeFlags{}
-
-// InitResumeFlags init resume command flags.
-func InitResumeFlags(cmd *cobra.Command) {
- initFlags(cmd, &resumeJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&resumeJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&resumeJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// ResumeJob resumes the job.
-func ResumeJob() error {
- config, err := util.BuildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if resumeJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to resume a particular job")
- return err
- }
-
- return createJobCommand(config,
- resumeJobFlags.Namespace, resumeJobFlags.JobName,
- v1alpha1.ResumeJobAction)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "io/ioutil"
- "strings"
-
- "github.com/spf13/cobra"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "sigs.k8s.io/yaml"
-
- vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type runFlags struct {
- commonFlags
-
- Name string
- Namespace string
- Image string
-
- MinAvailable int
- Replicas int
- Requests string
- Limits string
- SchedulerName string
- FileName string
-}
-
-var launchJobFlags = &runFlags{}
-
-// InitRunFlags init the run flags.
-func InitRunFlags(cmd *cobra.Command) {
- initFlags(cmd, &launchJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&launchJobFlags.Image, "image", "i", "busybox", "the container image of job")
- cmd.Flags().StringVarP(&launchJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&launchJobFlags.Name, "name", "N", "", "the name of job")
- cmd.Flags().IntVarP(&launchJobFlags.MinAvailable, "min", "m", 1, "the minimal available tasks of job")
- cmd.Flags().IntVarP(&launchJobFlags.Replicas, "replicas", "r", 1, "the total tasks of job")
- cmd.Flags().StringVarP(&launchJobFlags.Requests, "requests", "R", "cpu=1000m,memory=100Mi", "the resource request of the task")
- cmd.Flags().StringVarP(&launchJobFlags.Limits, "limits", "L", "cpu=1000m,memory=100Mi", "the resource limit of the task")
- cmd.Flags().StringVarP(&launchJobFlags.SchedulerName, "scheduler", "S", "volcano", "the scheduler for this job")
- cmd.Flags().StringVarP(&launchJobFlags.FileName, "filename", "f", "", "the yaml file of job")
-}
-
-var jobName = "job.volcano.sh"
-
-// RunJob creates the job.
-func RunJob() error {
- config, err := util.BuildConfig(launchJobFlags.Master, launchJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if launchJobFlags.Name == "" && launchJobFlags.FileName == "" {
- err = fmt.Errorf("job name cannot be left blank")
- return err
- }
-
- req, err := populateResourceListV1(launchJobFlags.Requests)
- if err != nil {
- return err
- }
-
- limit, err := populateResourceListV1(launchJobFlags.Limits)
- if err != nil {
- return err
- }
-
- job, err := readFile(launchJobFlags.FileName)
- if err != nil {
- return err
- }
-
- if job == nil {
- job = constructLaunchJobFlagsJob(launchJobFlags, req, limit)
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- newJob, err := jobClient.BatchV1alpha1().Jobs(launchJobFlags.Namespace).Create(context.TODO(), job, metav1.CreateOptions{})
- if err != nil {
- return err
- }
-
- if newJob.Spec.Queue == "" {
- newJob.Spec.Queue = "default"
- }
-
- fmt.Printf("run job %v successfully\n", newJob.Name)
-
- return nil
-}
-
-func readFile(filename string) (*vcbatch.Job, error) {
- if filename == "" {
- return nil, nil
- }
-
- if !strings.Contains(filename, ".yaml") && !strings.Contains(filename, ".yml") {
- return nil, fmt.Errorf("only support yaml file")
- }
-
- file, err := ioutil.ReadFile(filename)
- if err != nil {
- return nil, fmt.Errorf("failed to read file, err: %v", err)
- }
-
- var job vcbatch.Job
- if err := yaml.Unmarshal(file, &job); err != nil {
- return nil, fmt.Errorf("failed to unmarshal file, err: %v", err)
- }
-
- return &job, nil
-}
-
-func constructLaunchJobFlagsJob(launchJobFlags *runFlags, req, limit v1.ResourceList) *vcbatch.Job {
- return &vcbatch.Job{
- ObjectMeta: metav1.ObjectMeta{
- Name: launchJobFlags.Name,
- Namespace: launchJobFlags.Namespace,
- },
- Spec: vcbatch.JobSpec{
- MinAvailable: int32(launchJobFlags.MinAvailable),
- SchedulerName: launchJobFlags.SchedulerName,
- Tasks: []vcbatch.TaskSpec{
- {
- Replicas: int32(launchJobFlags.Replicas),
-
- Template: v1.PodTemplateSpec{
- ObjectMeta: metav1.ObjectMeta{
- Name: launchJobFlags.Name,
- Labels: map[string]string{jobName: launchJobFlags.Name},
- },
- Spec: v1.PodSpec{
- RestartPolicy: v1.RestartPolicyNever,
- Containers: []v1.Container{
- {
- Image: launchJobFlags.Image,
- Name: launchJobFlags.Name,
- ImagePullPolicy: v1.PullIfNotPresent,
- Resources: v1.ResourceRequirements{
- Limits: limit,
- Requests: req,
- },
- },
- },
- },
- },
- },
- },
- },
- }
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type suspendFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-var suspendJobFlags = &suspendFlags{}
-
-// InitSuspendFlags init suspend related flags.
-func InitSuspendFlags(cmd *cobra.Command) {
- initFlags(cmd, &suspendJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&suspendJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&suspendJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// SuspendJob suspends the job.
-func SuspendJob() error {
- config, err := util.BuildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if suspendJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to suspend a particular job")
- return err
- }
-
- return createJobCommand(config,
- suspendJobFlags.Namespace, suspendJobFlags.JobName,
- v1alpha1.AbortJobAction)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "os"
- "strings"
- "time"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- "k8s.io/client-go/rest"
-
- vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-func homeDir() string {
- if h := os.Getenv("HOME"); h != "" {
- return h
- }
- return os.Getenv("USERPROFILE") // windows
-}
-
-// populateResourceListV1 takes strings of the form <resourceName1>=<value1>,<resourceName2>=<value2>
-// and returns ResourceList.
-func populateResourceListV1(spec string) (v1.ResourceList, error) {
- // empty input gets a nil response to preserve generator test expected behaviors
- if spec == "" {
- return nil, nil
- }
-
- result := v1.ResourceList{}
- resourceStatements := strings.Split(spec, ",")
- for _, resourceStatement := range resourceStatements {
- parts := strings.Split(resourceStatement, "=")
- if len(parts) != 2 {
- return nil, fmt.Errorf("invalid argument syntax %v, expected <resource>=<value>", resourceStatement)
- }
- resourceName := v1.ResourceName(parts[0])
- resourceQuantity, err := resource.ParseQuantity(parts[1])
- if err != nil {
- return nil, err
- }
- result[resourceName] = resourceQuantity
- }
- return result, nil
-}
-
-func createJobCommand(config *rest.Config, ns, name string, action vcbus.Action) error {
- jobClient := versioned.NewForConfigOrDie(config)
- job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
- cmd := &vcbus.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- job.Name, strings.ToLower(string(action))),
- Namespace: job.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := jobClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-func translateTimestampSince(timestamp metav1.Time) string {
- if timestamp.IsZero() {
- return "<unknown>"
- }
- return HumanDuration(time.Since(timestamp.Time))
-}
-
-// HumanDuration translate time.Duration to human readable time string.
-func HumanDuration(d time.Duration) string {
-	// Allow a deviation of no more than 2 seconds (exclusive) to tolerate machine time
-	// inconsistency; such a duration can be considered as almost now.
- if seconds := int(d.Seconds()); seconds < -1 {
- return "<invalid>"
- } else if seconds < 0 {
- return "0s"
- } else if seconds < 60*2 {
- return fmt.Sprintf("%ds", seconds)
- }
- minutes := int(d / time.Minute)
- if minutes < 10 {
- s := int(d/time.Second) % 60
- if s == 0 {
- return fmt.Sprintf("%dm", minutes)
- }
- return fmt.Sprintf("%dm%ds", minutes, s)
- } else if minutes < 60*3 {
- return fmt.Sprintf("%dm", minutes)
- }
- hours := int(d / time.Hour)
- if hours < 8 {
- m := int(d/time.Minute) % 60
- if m == 0 {
- return fmt.Sprintf("%dh", hours)
- }
- return fmt.Sprintf("%dh%dm", hours, m)
- } else if hours < 48 {
- return fmt.Sprintf("%dh", hours)
- } else if hours < 24*8 {
- h := hours % 24
- if h == 0 {
- return fmt.Sprintf("%dd", hours/24)
- }
- return fmt.Sprintf("%dd%dh", hours/24, h)
- } else if hours < 24*365*2 {
- return fmt.Sprintf("%dd", hours/24)
- } else if hours < 24*365*8 {
- return fmt.Sprintf("%dy%dd", hours/24/365, (hours/24)%365)
- }
- return fmt.Sprintf("%dy", hours/24/365)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "encoding/json"
- "fmt"
- "io"
- "os"
- "strings"
-
- "github.com/spf13/cobra"
-
- coreV1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/kubernetes"
- "k8s.io/client-go/rest"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type viewFlags struct {
- commonFlags
-
- Namespace string
- JobName string
-}
-
-// level of print indent.
-const (
- Level0 = iota
- Level1
- Level2
-)
-
-var viewJobFlags = &viewFlags{}
-
-// InitViewFlags init the view command flags.
-func InitViewFlags(cmd *cobra.Command) {
- initFlags(cmd, &viewJobFlags.commonFlags)
-
- cmd.Flags().StringVarP(&viewJobFlags.Namespace, "namespace", "n", "default", "the namespace of job")
- cmd.Flags().StringVarP(&viewJobFlags.JobName, "name", "N", "", "the name of job")
-}
-
-// ViewJob gives full details of the job.
-func ViewJob() error {
- config, err := util.BuildConfig(viewJobFlags.Master, viewJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if viewJobFlags.JobName == "" {
- err := fmt.Errorf("job name (specified by --name or -N) is mandatory to view a particular job")
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- job, err := jobClient.BatchV1alpha1().Jobs(viewJobFlags.Namespace).Get(context.TODO(), viewJobFlags.JobName, metav1.GetOptions{})
- if err != nil {
- return err
- }
- if job == nil {
- fmt.Printf("No resources found\n")
- return nil
- }
- PrintJobInfo(job, os.Stdout)
- PrintEvents(GetEvents(config, job), os.Stdout)
- return nil
-}
-
-// PrintJobInfo print the job detailed info into writer.
-func PrintJobInfo(job *v1alpha1.Job, writer io.Writer) {
- WriteLine(writer, Level0, "Name: \t%s\n", job.Name)
- WriteLine(writer, Level0, "Namespace: \t%s\n", job.Namespace)
- if len(job.Labels) > 0 {
- label, _ := json.Marshal(job.Labels)
- WriteLine(writer, Level0, "Labels: \t%s\n", string(label))
- } else {
- WriteLine(writer, Level0, "Labels: \t<none>\n")
- }
- if len(job.Annotations) > 0 {
- annotation, _ := json.Marshal(job.Annotations)
- WriteLine(writer, Level0, "Annotations:\t%s\n", string(annotation))
- } else {
- WriteLine(writer, Level0, "Annotations:\t<none>\n")
- }
- WriteLine(writer, Level0, "API Version:\t%s\n", job.APIVersion)
- WriteLine(writer, Level0, "Kind: \t%s\n", job.Kind)
-
- WriteLine(writer, Level0, "Metadata:\n")
- WriteLine(writer, Level1, "Creation Timestamp:\t%s\n", job.CreationTimestamp)
- WriteLine(writer, Level1, "Generate Name: \t%s\n", job.GenerateName)
- WriteLine(writer, Level1, "Generation: \t%d\n", job.Generation)
- WriteLine(writer, Level1, "Resource Version: \t%s\n", job.ResourceVersion)
- WriteLine(writer, Level1, "Self Link: \t%s\n", job.SelfLink)
- WriteLine(writer, Level1, "UID: \t%s\n", job.UID)
-
- WriteLine(writer, Level0, "Spec:\n")
- WriteLine(writer, Level1, "Min Available: \t%d\n", job.Spec.MinAvailable)
- WriteLine(writer, Level1, "Plugins:\n")
- WriteLine(writer, Level2, "Env:\t%v\n", job.Spec.Plugins["env"])
- WriteLine(writer, Level2, "Ssh:\t%v\n", job.Spec.Plugins["ssh"])
- WriteLine(writer, Level1, "Scheduler Name: \t%s\n", job.Spec.SchedulerName)
- WriteLine(writer, Level1, "Tasks:\n")
- for i := 0; i < len(job.Spec.Tasks); i++ {
- WriteLine(writer, Level2, "Name:\t%s\n", job.Spec.Tasks[i].Name)
- WriteLine(writer, Level2, "Replicas:\t%d\n", job.Spec.Tasks[i].Replicas)
- WriteLine(writer, Level2, "Template:\n")
- WriteLine(writer, Level2+1, "Metadata:\n")
- WriteLine(writer, Level2+2, "Annotations:\n")
- WriteLine(writer, Level2+3, "Cri . Cci . Io / Container - Type: \t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["cri.cci.io/container-type"])
- WriteLine(writer, Level2+3, "Kubernetes . Io / Availablezone: \t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["kubernetes.io/availablezone"])
- WriteLine(writer, Level2+3, "Network . Alpha . Kubernetes . Io / Network:\t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.Annotations["network.alpha.kubernetes.io/network"])
- WriteLine(writer, Level2+2, "Creation Timestamp:\t%s\n", job.Spec.Tasks[i].Template.ObjectMeta.CreationTimestamp)
-
- WriteLine(writer, Level2+1, "Spec:\n")
- WriteLine(writer, Level2+2, "Containers:\n")
- for j := 0; j < len(job.Spec.Tasks[i].Template.Spec.Containers); j++ {
- WriteLine(writer, Level2+3, "Command:\n")
- for k := 0; k < len(job.Spec.Tasks[i].Template.Spec.Containers[j].Command); k++ {
- WriteLine(writer, Level2+4, "%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Command[k])
- }
- WriteLine(writer, Level2+3, "Image:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Image)
- WriteLine(writer, Level2+3, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Name)
- WriteLine(writer, Level2+3, "Ports:\n")
- for k := 0; k < len(job.Spec.Tasks[i].Template.Spec.Containers[j].Ports); k++ {
- WriteLine(writer, Level2+4, "Container Port:\t%d\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Ports[k].ContainerPort)
- WriteLine(writer, Level2+4, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Ports[k].Name)
- }
- WriteLine(writer, Level2+3, "Resources:\n")
- WriteLine(writer, Level2+4, "Limits:\n")
- WriteLine(writer, Level2+5, "Cpu: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Limits.Cpu())
- WriteLine(writer, Level2+5, "Memory:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Limits.Memory())
- WriteLine(writer, Level2+4, "Requests:\n")
- WriteLine(writer, Level2+5, "Cpu: \t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Requests.Cpu())
- WriteLine(writer, Level2+5, "Memory:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].Resources.Requests.Memory())
- WriteLine(writer, Level2+4, "Working Dir:\t%s\n", job.Spec.Tasks[i].Template.Spec.Containers[j].WorkingDir)
- }
- WriteLine(writer, Level2+2, "Image Pull Secrets:\n")
- for j := 0; j < len(job.Spec.Tasks[i].Template.Spec.ImagePullSecrets); j++ {
- WriteLine(writer, Level2+3, "Name: \t%s\n", job.Spec.Tasks[i].Template.Spec.ImagePullSecrets[j].Name)
- }
- WriteLine(writer, Level2+2, "Restart Policy: \t%s\n", job.Spec.Tasks[i].Template.Spec.RestartPolicy)
- }
-
- WriteLine(writer, Level0, "Status:\n")
- if job.Status.Succeeded > 0 {
- WriteLine(writer, Level1, "Succeeded: \t%d\n", job.Status.Succeeded)
- }
- if job.Status.Pending > 0 {
- WriteLine(writer, Level1, "Pending: \t%d\n", job.Status.Pending)
- }
- if job.Status.Running > 0 {
- WriteLine(writer, Level1, "Running: \t%d\n", job.Status.Running)
- }
- if job.Status.Failed > 0 {
- WriteLine(writer, Level1, "Failed: \t%d\n", job.Status.Failed)
- }
- if job.Status.Terminating > 0 {
- WriteLine(writer, Level1, "Terminating: \t%d\n", job.Status.Terminating)
- }
- if job.Status.Unknown > 0 {
- WriteLine(writer, Level1, "Unknown: \t%d\n", job.Status.Unknown)
- }
- if job.Status.RetryCount > 0 {
- WriteLine(writer, Level1, "RetryCount: \t%d\n", job.Status.RetryCount)
- }
- if job.Status.MinAvailable > 0 {
- WriteLine(writer, Level1, "Min Available:\t%d\n", job.Status.MinAvailable)
- }
- if job.Status.Version > 0 {
- WriteLine(writer, Level1, "Version: \t%d\n", job.Status.Version)
- }
-
- WriteLine(writer, Level1, "State:\n")
- WriteLine(writer, Level2, "Phase:\t%s\n", job.Status.State.Phase)
- if len(job.Status.ControlledResources) > 0 {
- WriteLine(writer, Level1, "Controlled Resources:\n")
- for key, value := range job.Status.ControlledResources {
- WriteLine(writer, Level2, "%s: \t%s\n", key, value)
- }
- }
- if len(job.Status.Conditions) > 0 {
- WriteLine(writer, Level1, "Conditions:\n Status\tTransitionTime\n")
- for _, c := range job.Status.Conditions {
- WriteLine(writer, Level2, "%v \t%v \n",
- c.Status,
- c.LastTransitionTime)
- }
- }
-}
-
-// PrintEvents print event info to writer.
-func PrintEvents(events []coreV1.Event, writer io.Writer) {
- if len(events) > 0 {
- WriteLine(writer, Level0, "%s:\n%-15s\t%-40s\t%-30s\t%-40s\t%s\n", "Events", "Type", "Reason", "Age", "Form", "Message")
- WriteLine(writer, Level0, "%-15s\t%-40s\t%-30s\t%-40s\t%s\n", "-------", "-------", "-------", "-------", "-------")
- for _, e := range events {
- var interval string
- if e.Count > 1 {
- interval = fmt.Sprintf("%s (x%d over %s)", translateTimestampSince(e.LastTimestamp), e.Count, translateTimestampSince(e.FirstTimestamp))
- } else {
- interval = translateTimestampSince(e.FirstTimestamp)
- }
- EventSourceString := []string{e.Source.Component}
- if len(e.Source.Host) > 0 {
- EventSourceString = append(EventSourceString, e.Source.Host)
- }
- WriteLine(writer, Level0, "%-15v\t%-40v\t%-30s\t%-40s\t%v\n",
- e.Type,
- e.Reason,
- interval,
- strings.Join(EventSourceString, ", "),
- strings.TrimSpace(e.Message),
- )
- }
- } else {
- WriteLine(writer, Level0, "Events: \t<none>\n")
- }
-}
-
-// GetEvents get the job event by config.
-func GetEvents(config *rest.Config, job *v1alpha1.Job) []coreV1.Event {
- kubernetes, err := kubernetes.NewForConfig(config)
- if err != nil {
- fmt.Printf("%v\n", err)
- return nil
- }
- events, _ := kubernetes.CoreV1().Events(viewJobFlags.Namespace).List(context.TODO(), metav1.ListOptions{})
- var jobEvents []coreV1.Event
- for _, v := range events.Items {
- if strings.HasPrefix(v.ObjectMeta.Name, job.Name+".") {
- jobEvents = append(jobEvents, v)
- }
- }
- return jobEvents
-}
-
-// WriteLine write lines with specified indent.
-func WriteLine(writer io.Writer, spaces int, content string, params ...interface{}) {
- prefix := ""
- for i := 0; i < spaces; i++ {
- prefix += " "
- }
- fmt.Fprintf(writer, prefix+content, params...)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "os"
- "path/filepath"
-
- "github.com/spf13/cobra"
-)
-
-type commonFlags struct {
- Master string
- Kubeconfig string
- SchedulerName string
-}
-
-func initFlags(cmd *cobra.Command, cf *commonFlags) {
- cmd.Flags().StringVarP(&cf.SchedulerName, "scheduler", "", "volcano", "the scheduler for this job")
- cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
-
- kubeConfFile := os.Getenv("KUBECONFIG")
- if kubeConfFile == "" {
- if home := homeDir(); home != "" {
- kubeConfFile = filepath.Join(home, ".kube", "config")
- }
- }
- cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-type createFlags struct {
- commonFlags
-
- Name string
- Weight int32
- // State is state of Queue
- State string
-}
-
-var createQueueFlags = &createFlags{}
-
-// InitCreateFlags is used to init all flags during queue creating.
-func InitCreateFlags(cmd *cobra.Command) {
- initFlags(cmd, &createQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&createQueueFlags.Name, "name", "n", "test", "the name of queue")
- cmd.Flags().Int32VarP(&createQueueFlags.Weight, "weight", "w", 1, "the weight of the queue")
-
- cmd.Flags().StringVarP(&createQueueFlags.State, "state", "S", "Open", "the state of queue")
-}
-
-// CreateQueue create queue.
-func CreateQueue() error {
- config, err := buildConfig(createQueueFlags.Master, createQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- queue := &schedulingv1beta1.Queue{
- ObjectMeta: metav1.ObjectMeta{
- Name: createQueueFlags.Name,
- },
- Spec: schedulingv1beta1.QueueSpec{
- Weight: createQueueFlags.Weight,
- },
- Status: schedulingv1beta1.QueueStatus{
- State: schedulingv1beta1.QueueState(createQueueFlags.State),
- },
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- if _, err := queueClient.SchedulingV1beta1().Queues().Create(context.TODO(), queue, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
-
- "volcano.sh/apis/pkg/client/clientset/versioned"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-)
-
-type deleteFlags struct {
- commonFlags
-
- // Name is name of queue
- Name string
-}
-
-var deleteQueueFlags = &deleteFlags{}
-
-// InitDeleteFlags is used to init all flags during queue deleting.
-func InitDeleteFlags(cmd *cobra.Command) {
- initFlags(cmd, &deleteQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&deleteQueueFlags.Name, "name", "n", "", "the name of queue")
-}
-
-// DeleteQueue delete queue.
-func DeleteQueue() error {
- config, err := buildConfig(deleteQueueFlags.Master, deleteQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if len(deleteQueueFlags.Name) == 0 {
- return fmt.Errorf("queue name must be specified")
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- return queueClient.SchedulingV1beta1().Queues().Delete(context.TODO(), deleteQueueFlags.Name, metav1.DeleteOptions{})
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "io"
- "os"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-type getFlags struct {
- commonFlags
-
- Name string
-}
-
-var getQueueFlags = &getFlags{}
-
-// InitGetFlags is used to init all flags.
-func InitGetFlags(cmd *cobra.Command) {
- initFlags(cmd, &getQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&getQueueFlags.Name, "name", "n", "", "the name of queue")
-}
-
-// GetQueue gets a queue.
-func GetQueue() error {
- config, err := buildConfig(getQueueFlags.Master, getQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if getQueueFlags.Name == "" {
- err := fmt.Errorf("name is mandatory to get the particular queue details")
- return err
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- queue, err := queueClient.SchedulingV1beta1().Queues().Get(context.TODO(), getQueueFlags.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- PrintQueue(queue, os.Stdout)
-
- return nil
-}
-
-// PrintQueue prints queue information.
-func PrintQueue(queue *v1beta1.Queue, writer io.Writer) {
- _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
- Name, Weight, State, Inqueue, Pending, Running, Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
- _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
- queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
- queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "io"
- "os"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-type listFlags struct {
- commonFlags
-}
-
-const (
- // Weight of the queue
- Weight string = "Weight"
-
- // Name of queue
- Name string = "Name"
-
- // Pending status of the queue
- Pending string = "Pending"
-
- // Running status of the queue
- Running string = "Running"
-
- // Unknown status of the queue
- Unknown string = "Unknown"
-
- // Inqueue status of queue
- Inqueue string = "Inqueue"
-
- // State is state of queue
- State string = "State"
-)
-
-var listQueueFlags = &listFlags{}
-
-// InitListFlags inits all flags.
-func InitListFlags(cmd *cobra.Command) {
- initFlags(cmd, &listQueueFlags.commonFlags)
-}
-
-// ListQueue lists all the queue.
-func ListQueue() error {
- config, err := buildConfig(listQueueFlags.Master, listQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- queues, err := jobClient.SchedulingV1beta1().Queues().List(context.TODO(), metav1.ListOptions{})
- if err != nil {
- return err
- }
-
- if len(queues.Items) == 0 {
- fmt.Printf("No resources found\n")
- return nil
- }
- PrintQueues(queues, os.Stdout)
-
- return nil
-}
-
-// PrintQueues prints queue information.
-func PrintQueues(queues *v1beta1.QueueList, writer io.Writer) {
- _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
- Name, Weight, State, Inqueue, Pending, Running, Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
- for _, queue := range queues.Items {
- _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
- queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
- queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
- if err != nil {
- fmt.Printf("Failed to print queue command result: %s.\n", err)
- }
- }
-}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
-
- "github.com/spf13/cobra"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "k8s.io/apimachinery/pkg/types"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-const (
- // ActionOpen is `open` action
- ActionOpen = "open"
- // ActionClose is `close` action
- ActionClose = "close"
- // ActionUpdate is `update` action
- ActionUpdate = "update"
-)
-
-type operateFlags struct {
- commonFlags
-
- // Name is name of queue
- Name string
- // Weight is weight of queue
- Weight int32
- // Action is operation action of queue
- Action string
-}
-
-var operateQueueFlags = &operateFlags{}
-
-// InitOperateFlags is used to init all flags during queue operating
-func InitOperateFlags(cmd *cobra.Command) {
- initFlags(cmd, &operateQueueFlags.commonFlags)
-
- cmd.Flags().StringVarP(&operateQueueFlags.Name, "name", "n", "", "the name of queue")
- cmd.Flags().Int32VarP(&operateQueueFlags.Weight, "weight", "w", 0, "the weight of the queue")
- cmd.Flags().StringVarP(&operateQueueFlags.Action, "action", "a", "",
- "operate action to queue, valid actions are open, close, update")
-}
-
-// OperateQueue operates queue
-func OperateQueue() error {
- config, err := buildConfig(operateQueueFlags.Master, operateQueueFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if len(operateQueueFlags.Name) == 0 {
- return fmt.Errorf("queue name must be specified")
- }
-
- var action v1alpha1.Action
-
- switch operateQueueFlags.Action {
- case ActionOpen:
- action = v1alpha1.OpenQueueAction
- case ActionClose:
- action = v1alpha1.CloseQueueAction
- case ActionUpdate:
- if operateQueueFlags.Weight == 0 {
- return fmt.Errorf("when %s queue %s, weight must be specified, "+
- "the value must be greater than 0", ActionUpdate, operateQueueFlags.Name)
- }
-
- queueClient := versioned.NewForConfigOrDie(config)
- patchBytes := []byte(fmt.Sprintf(`{"spec":{"weight":%d}}`, operateQueueFlags.Weight))
- _, err := queueClient.SchedulingV1beta1().Queues().Patch(context.TODO(),
- operateQueueFlags.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{})
-
- return err
- case "":
- return fmt.Errorf("action can not be null")
- default:
- return fmt.Errorf("action %s invalid, valid actions are %s, %s and %s",
- operateQueueFlags.Action, ActionOpen, ActionClose, ActionUpdate)
- }
-
- return createQueueCommand(config, action)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "os"
- "strings"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- // Initialize client auth plugin.
- _ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
- "k8s.io/client-go/rest"
- "k8s.io/client-go/tools/clientcmd"
-
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-func homeDir() string {
- if h := os.Getenv("HOME"); h != "" {
- return h
- }
- return os.Getenv("USERPROFILE") // windows
-}
-
-func buildConfig(master, kubeconfig string) (*rest.Config, error) {
- return clientcmd.BuildConfigFromFlags(master, kubeconfig)
-}
-
-func createQueueCommand(config *rest.Config, action busv1alpha1.Action) error {
- queueClient := versioned.NewForConfigOrDie(config)
- queue, err := queueClient.SchedulingV1beta1().Queues().Get(context.TODO(), operateQueueFlags.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- ctrlRef := metav1.NewControllerRef(queue, helpers.V1beta1QueueKind)
- cmd := &busv1alpha1.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- queue.Name, strings.ToLower(string(action))),
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := queueClient.BusV1alpha1().Commands("default").Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "context"
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "time"
-
- "github.com/spf13/cobra"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- "k8s.io/client-go/rest"
- "k8s.io/client-go/tools/clientcmd"
-
- vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- "volcano.sh/apis/pkg/client/clientset/versioned"
-)
-
-// CommonFlags are the flags that most command lines have.
-type CommonFlags struct {
- Master string
- Kubeconfig string
-}
-
-// InitFlags initializes the common flags for most command lines.
-func InitFlags(cmd *cobra.Command, cf *CommonFlags) {
- cmd.Flags().StringVarP(&cf.Master, "master", "s", "", "the address of apiserver")
-
- kubeConfFile := os.Getenv("KUBECONFIG")
- if kubeConfFile == "" {
- if home := HomeDir(); home != "" {
- kubeConfFile = filepath.Join(home, ".kube", "config")
- }
- }
- cmd.Flags().StringVarP(&cf.Kubeconfig, "kubeconfig", "k", kubeConfFile, "(optional) absolute path to the kubeconfig file")
-}
-
-// HomeDir gets the env $HOME.
-func HomeDir() string {
- if h := os.Getenv("HOME"); h != "" {
- return h
- }
- return os.Getenv("USERPROFILE") // windows
-}
-
-// BuildConfig builds the configure file for command lines.
-func BuildConfig(master, kubeconfig string) (*rest.Config, error) {
- return clientcmd.BuildConfigFromFlags(master, kubeconfig)
-}
-
-// PopulateResourceListV1 takes strings of form <resourceName1>=<value1>,<resourceName1>=<value2> and returns ResourceList.
-func PopulateResourceListV1(spec string) (v1.ResourceList, error) {
- // empty input gets a nil response to preserve generator test expected behaviors
- if spec == "" {
- return nil, nil
- }
-
- result := v1.ResourceList{}
- resourceStatements := strings.Split(spec, ",")
- for _, resourceStatement := range resourceStatements {
- parts := strings.Split(resourceStatement, "=")
- if len(parts) != 2 {
- return nil, fmt.Errorf("invalid argument syntax %v, expected <resource>=<value>", resourceStatement)
- }
- resourceName := v1.ResourceName(parts[0])
- resourceQuantity, err := resource.ParseQuantity(parts[1])
- if err != nil {
- return nil, err
- }
- result[resourceName] = resourceQuantity
- }
- return result, nil
-}
-
-// CreateQueueCommand executes a command such as open/close
-func CreateQueueCommand(vcClient *versioned.Clientset, ns, name string, action vcbus.Action) error {
- queue, err := vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), name, metav1.GetOptions{})
- if err != nil {
- return err
- }
- ctrlRef := metav1.NewControllerRef(queue, helpers.V1beta1QueueKind)
- cmd := &vcbus.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- queue.Name, strings.ToLower(string(action))),
- Namespace: queue.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := vcClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-// CreateJobCommand executes a command such as resume/suspend.
-func CreateJobCommand(config *rest.Config, ns, name string, action vcbus.Action) error {
- jobClient := versioned.NewForConfigOrDie(config)
- job, err := jobClient.BatchV1alpha1().Jobs(ns).Get(context.TODO(), name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- ctrlRef := metav1.NewControllerRef(job, helpers.JobKind)
- cmd := &vcbus.Command{
- ObjectMeta: metav1.ObjectMeta{
- GenerateName: fmt.Sprintf("%s-%s-",
- job.Name, strings.ToLower(string(action))),
- Namespace: job.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *ctrlRef,
- },
- },
- TargetObject: ctrlRef,
- Action: string(action),
- }
-
- if _, err := jobClient.BusV1alpha1().Commands(ns).Create(context.TODO(), cmd, metav1.CreateOptions{}); err != nil {
- return err
- }
-
- return nil
-}
-
-// TranslateTimestampSince translates the time stamp.
-func TranslateTimestampSince(timestamp metav1.Time) string {
- if timestamp.IsZero() {
- return "<unknown>"
- }
- return HumanDuration(time.Since(timestamp.Time))
-}
-
-// HumanDuration translate time.Duration to human readable time string.
-func HumanDuration(d time.Duration) string {
-	// Allow a deviation of no more than 2 seconds (exclusive) to tolerate machine
-	// time inconsistency; such a small negative duration can be treated as "now".
- if seconds := int(d.Seconds()); seconds < -1 {
- return "<invalid>"
- } else if seconds < 0 {
- return "0s"
- } else if seconds < 60*2 {
- return fmt.Sprintf("%ds", seconds)
- }
- minutes := int(d / time.Minute)
- if minutes < 10 {
- s := int(d/time.Second) % 60
- if s == 0 {
- return fmt.Sprintf("%dm", minutes)
- }
- return fmt.Sprintf("%dm%ds", minutes, s)
- } else if minutes < 60*3 {
- return fmt.Sprintf("%dm", minutes)
- }
- hours := int(d / time.Hour)
- if hours < 8 {
- m := int(d/time.Minute) % 60
- if m == 0 {
- return fmt.Sprintf("%dh", hours)
- }
- return fmt.Sprintf("%dh%dm", hours, m)
- } else if hours < 48 {
- return fmt.Sprintf("%dh", hours)
- } else if hours < 24*8 {
- h := hours % 24
- if h == 0 {
- return fmt.Sprintf("%dd", hours/24)
- }
- return fmt.Sprintf("%dd%dh", hours/24, h)
- } else if hours < 24*365*2 {
- return fmt.Sprintf("%dd", hours/24)
- } else if hours < 24*365*8 {
- return fmt.Sprintf("%dy%dd", hours/24/365, (hours/24)%365)
- }
- return fmt.Sprintf("%dy", hours/24/365)
-}
-
-
-
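
The `util` helpers above parse resource specs of the form `<resource>=<value>,...` and render durations in the short form used by CLI output. A minimal usage sketch, assuming the `volcano.sh/volcano/pkg/cli/util` import path shown in the CLI packages here; the spec string and durations are hypothetical:

```go
package main

import (
	"fmt"
	"time"

	"volcano.sh/volcano/pkg/cli/util"
)

func main() {
	// PopulateResourceListV1 splits on "," and "=", then parses each value
	// with resource.ParseQuantity.
	rl, err := util.PopulateResourceListV1("cpu=2,memory=4Gi")
	if err != nil {
		panic(err)
	}
	fmt.Println(rl.Cpu().String(), rl.Memory().String()) // 2 4Gi

	// HumanDuration keeps full seconds below two minutes, then coarsens
	// to minutes, hours, days and years as the duration grows.
	fmt.Println(util.HumanDuration(90 * time.Second)) // 90s
	fmt.Println(util.HumanDuration(26 * time.Hour))   // 26h
}
```
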
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vcancel
-
-import (
- "context"
- "fmt"
-
- "github.com/spf13/cobra"
-
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type cancelFlags struct {
- util.CommonFlags
-
- Namespace string
- JobName string
-}
-
-var cancelJobFlags = &cancelFlags{}
-
-// InitCancelFlags init the cancel command flags.
-func InitCancelFlags(cmd *cobra.Command) {
- util.InitFlags(cmd, &cancelJobFlags.CommonFlags)
-
- cmd.Flags().StringVarP(&cancelJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
- cmd.Flags().StringVarP(&cancelJobFlags.JobName, "name", "n", "", "the name of job")
-}
-
-// CancelJob cancel the job.
-func CancelJob() error {
- config, err := util.BuildConfig(cancelJobFlags.Master, cancelJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if cancelJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to cancel a particular job")
- return err
- }
-
- jobClient := versioned.NewForConfigOrDie(config)
- err = jobClient.BatchV1alpha1().Jobs(cancelJobFlags.Namespace).Delete(context.TODO(), cancelJobFlags.JobName, metav1.DeleteOptions{})
- if err != nil {
- return err
- }
- fmt.Printf("cancel job %v successfully\n", cancelJobFlags.JobName)
- return nil
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vresume
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type resumeFlags struct {
- util.CommonFlags
-
- Namespace string
- JobName string
-}
-
-var resumeJobFlags = &resumeFlags{}
-
-// InitResumeFlags init resume command flags.
-func InitResumeFlags(cmd *cobra.Command) {
- util.InitFlags(cmd, &resumeJobFlags.CommonFlags)
-
- cmd.Flags().StringVarP(&resumeJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
- cmd.Flags().StringVarP(&resumeJobFlags.JobName, "name", "n", "", "the name of job")
-}
-
-// ResumeJob resumes the job.
-func ResumeJob() error {
- config, err := util.BuildConfig(resumeJobFlags.Master, resumeJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
- if resumeJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to resume a particular job")
- return err
- }
-
- return util.CreateJobCommand(config,
- resumeJobFlags.Namespace, resumeJobFlags.JobName,
- v1alpha1.ResumeJobAction)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package vsuspend
-
-import (
- "fmt"
-
- "github.com/spf13/cobra"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/volcano/pkg/cli/util"
-)
-
-type suspendFlags struct {
- util.CommonFlags
-
- Namespace string
- JobName string
-}
-
-var suspendJobFlags = &suspendFlags{}
-
-// InitSuspendFlags init suspend related flags.
-func InitSuspendFlags(cmd *cobra.Command) {
- util.InitFlags(cmd, &suspendJobFlags.CommonFlags)
-
- cmd.Flags().StringVarP(&suspendJobFlags.Namespace, "namespace", "N", "default", "the namespace of job")
- cmd.Flags().StringVarP(&suspendJobFlags.JobName, "name", "n", "", "the name of job")
-}
-
-// SuspendJob suspends the job.
-func SuspendJob() error {
- config, err := util.BuildConfig(suspendJobFlags.Master, suspendJobFlags.Kubeconfig)
- if err != nil {
- return err
- }
-
- if suspendJobFlags.JobName == "" {
- err := fmt.Errorf("job name is mandatory to suspend a particular job")
- return err
- }
-
- return util.CreateJobCommand(config,
- suspendJobFlags.Namespace, suspendJobFlags.JobName,
- v1alpha1.AbortJobAction)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package apis
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
-)
-
-//JobInfo struct.
-type JobInfo struct {
- Namespace string
- Name string
-
- Job *batch.Job
- Pods map[string]map[string]*v1.Pod
-}
-
-// Clone returns a copy of the JobInfo with its two-level Pods map duplicated.
-func (ji *JobInfo) Clone() *JobInfo {
- job := &JobInfo{
- Namespace: ji.Namespace,
- Name: ji.Name,
- Job: ji.Job,
-
- Pods: make(map[string]map[string]*v1.Pod),
- }
-
- for key, pods := range ji.Pods {
- job.Pods[key] = make(map[string]*v1.Pod)
- for pn, pod := range pods {
- job.Pods[key][pn] = pod
- }
- }
-
- return job
-}
-
-//SetJob sets the volcano jobs values to the JobInfo struct.
-func (ji *JobInfo) SetJob(job *batch.Job) {
- ji.Name = job.Name
- ji.Namespace = job.Namespace
- ji.Job = job
-}
-
-// AddPod records the Pod under its task in the Pods field of the JobInfo.
-// It returns an error if the required annotations are missing or the Pod is already recorded.
-func (ji *JobInfo) AddPod(pod *v1.Pod) error {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- _, found = pod.Annotations[batch.JobVersion]
- if !found {
- return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- if _, found := ji.Pods[taskName]; !found {
- ji.Pods[taskName] = make(map[string]*v1.Pod)
- }
- if _, found := ji.Pods[taskName][pod.Name]; found {
- return fmt.Errorf("duplicated pod")
- }
- ji.Pods[taskName][pod.Name] = pod
-
- return nil
-}
-
-// UpdatePod replaces the cached copy of the given Pod; the Pod must already be present in the JobInfo.
-func (ji *JobInfo) UpdatePod(pod *v1.Pod) error {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
- _, found = pod.Annotations[batch.JobVersion]
- if !found {
- return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- if _, found := ji.Pods[taskName]; !found {
- return fmt.Errorf("can not find task %s in cache", taskName)
- }
- if _, found := ji.Pods[taskName][pod.Name]; !found {
- return fmt.Errorf("can not find pod <%s/%s> in cache",
- pod.Namespace, pod.Name)
- }
- ji.Pods[taskName][pod.Name] = pod
-
- return nil
-}
-
-//DeletePod deletes the given k8s pod from the JobInfo struct.
-func (ji *JobInfo) DeletePod(pod *v1.Pod) error {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return fmt.Errorf("failed to find taskName of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
- _, found = pod.Annotations[batch.JobVersion]
- if !found {
- return fmt.Errorf("failed to find jobVersion of Pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- if pods, found := ji.Pods[taskName]; found {
- delete(pods, pod.Name)
- if len(pods) == 0 {
- delete(ji.Pods, taskName)
- }
- }
-
- return nil
-}
-
-
-
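
`JobInfo` is the unit stored in the controller cache: a two-level map from task name to pod name to Pod. A minimal sketch of registering a Pod and reading it back, assuming the `batch.TaskSpecKey` and `batch.JobVersion` annotation keys used above; the object names and values are hypothetical:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
	"volcano.sh/volcano/pkg/controllers/apis"
)

func main() {
	ji := &apis.JobInfo{Pods: map[string]map[string]*v1.Pod{}}

	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Namespace: "default",
		Name:      "demo-worker-0",
		Annotations: map[string]string{
			batch.TaskSpecKey: "worker", // task this pod belongs to
			batch.JobVersion:  "0",      // job version the pod was created for
		},
	}}

	if err := ji.AddPod(pod); err != nil { // pods without the annotations are rejected
		panic(err)
	}

	snapshot := ji.Clone() // copies the two-level map for safe reading
	fmt.Println(len(snapshot.Pods["worker"])) // 1
}
```
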
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package apis
-
-import (
- "fmt"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
-)
-
-//Request struct.
-type Request struct {
- Namespace string
- JobName string
- TaskName string
- QueueName string
-
- Event v1alpha1.Event
- ExitCode int32
- Action v1alpha1.Action
- JobVersion int32
-}
-
-// String function returns the request in string format.
-func (r Request) String() string {
- return fmt.Sprintf(
- "Queue: %s, Job: %s/%s, Task:%s, Event:%s, ExitCode:%d, Action:%s, JobVersion: %d",
- r.QueueName, r.Namespace, r.JobName, r.TaskName, r.Event, r.ExitCode, r.Action, r.JobVersion)
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cache
-
-import (
- "fmt"
- "sync"
- "time"
-
- "golang.org/x/time/rate"
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/apis"
-)
-
-type jobCache struct {
- sync.Mutex
-
- jobs map[string]*apis.JobInfo
- deletedJobs workqueue.RateLimitingInterface
-}
-
-func keyFn(ns, name string) string {
- return fmt.Sprintf("%s/%s", ns, name)
-}
-
-//JobKeyByName gets the key for the job name.
-func JobKeyByName(namespace string, name string) string {
- return keyFn(namespace, name)
-}
-
-//JobKeyByReq gets the key for the job request.
-func JobKeyByReq(req *apis.Request) string {
- return keyFn(req.Namespace, req.JobName)
-}
-
-//JobKey gets the "ns"/"name" format of the given job.
-func JobKey(job *v1alpha1.Job) string {
- return keyFn(job.Namespace, job.Name)
-}
-
-func jobTerminated(job *apis.JobInfo) bool {
- return job.Job == nil && len(job.Pods) == 0
-}
-
-func jobKeyOfPod(pod *v1.Pod) (string, error) {
- jobName, found := pod.Annotations[v1alpha1.JobNameKey]
- if !found {
- return "", fmt.Errorf("failed to find job name of pod <%s/%s>",
- pod.Namespace, pod.Name)
- }
-
- return keyFn(pod.Namespace, jobName), nil
-}
-
-// New gets the job Cache.
-func New() Cache {
- queue := workqueue.NewMaxOfRateLimiter(
- workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
- // 10 qps, 100 bucket size. This is only for retry speed and its only the overall factor (not per item)
- &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
- )
-
- return &jobCache{
- jobs: map[string]*apis.JobInfo{},
- deletedJobs: workqueue.NewRateLimitingQueue(queue),
- }
-}
-
-func (jc *jobCache) Get(key string) (*apis.JobInfo, error) {
- jc.Lock()
- defer jc.Unlock()
-
- job, found := jc.jobs[key]
- if !found {
- return nil, fmt.Errorf("failed to find job <%s>", key)
- }
-
- if job.Job == nil {
- return nil, fmt.Errorf("job <%s> is not ready", key)
- }
-
- return job.Clone(), nil
-}
-
-func (jc *jobCache) GetStatus(key string) (*v1alpha1.JobStatus, error) {
- jc.Lock()
- defer jc.Unlock()
-
- job, found := jc.jobs[key]
- if !found {
- return nil, fmt.Errorf("failed to find job <%s>", key)
- }
-
- if job.Job == nil {
- return nil, fmt.Errorf("job <%s> is not ready", key)
- }
-
- status := job.Job.Status
-
- return &status, nil
-}
-
-func (jc *jobCache) Add(job *v1alpha1.Job) error {
- jc.Lock()
- defer jc.Unlock()
- key := JobKey(job)
- if jobInfo, found := jc.jobs[key]; found {
- if jobInfo.Job == nil {
- jobInfo.SetJob(job)
-
- return nil
- }
- return fmt.Errorf("duplicated jobInfo <%v>", key)
- }
-
- jc.jobs[key] = &apis.JobInfo{
- Name: job.Name,
- Namespace: job.Namespace,
-
- Job: job,
- Pods: make(map[string]map[string]*v1.Pod),
- }
-
- return nil
-}
-
-func (jc *jobCache) Update(obj *v1alpha1.Job) error {
- jc.Lock()
- defer jc.Unlock()
-
- key := JobKey(obj)
- job, found := jc.jobs[key]
- if !found {
- return fmt.Errorf("failed to find job <%v>", key)
- }
- job.Job = obj
-
- return nil
-}
-
-func (jc *jobCache) Delete(obj *v1alpha1.Job) error {
- jc.Lock()
- defer jc.Unlock()
-
- key := JobKey(obj)
- jobInfo, found := jc.jobs[key]
- if !found {
- return fmt.Errorf("failed to find job <%v>", key)
- }
- jobInfo.Job = nil
- jc.deleteJob(jobInfo)
-
- return nil
-}
-
-func (jc *jobCache) AddPod(pod *v1.Pod) error {
- jc.Lock()
- defer jc.Unlock()
-
- key, err := jobKeyOfPod(pod)
- if err != nil {
- return err
- }
-
- job, found := jc.jobs[key]
- if !found {
- job = &apis.JobInfo{
- Pods: make(map[string]map[string]*v1.Pod),
- }
- jc.jobs[key] = job
- }
-
- return job.AddPod(pod)
-}
-
-func (jc *jobCache) UpdatePod(pod *v1.Pod) error {
- jc.Lock()
- defer jc.Unlock()
-
- key, err := jobKeyOfPod(pod)
- if err != nil {
- return err
- }
-
- job, found := jc.jobs[key]
- if !found {
- job = &apis.JobInfo{
- Pods: make(map[string]map[string]*v1.Pod),
- }
- jc.jobs[key] = job
- }
-
- return job.UpdatePod(pod)
-}
-
-func (jc *jobCache) DeletePod(pod *v1.Pod) error {
- jc.Lock()
- defer jc.Unlock()
-
- key, err := jobKeyOfPod(pod)
- if err != nil {
- return err
- }
-
- job, found := jc.jobs[key]
- if !found {
- job = &apis.JobInfo{
- Pods: make(map[string]map[string]*v1.Pod),
- }
- jc.jobs[key] = job
- }
-
- if err := job.DeletePod(pod); err != nil {
- return err
- }
-
- if jc.jobs[key].Job == nil {
- jc.deleteJob(job)
- }
-
- return nil
-}
-
-func (jc *jobCache) Run(stopCh <-chan struct{}) {
- wait.Until(jc.worker, 0, stopCh)
-}
-
-func (jc *jobCache) TaskCompleted(jobKey, taskName string) bool {
- jc.Lock()
- defer jc.Unlock()
-
- var taskReplicas, completed int32
-
- jobInfo, found := jc.jobs[jobKey]
- if !found {
- return false
- }
-
- taskPods, found := jobInfo.Pods[taskName]
-
- if !found {
- return false
- }
-
- if jobInfo.Job == nil {
- return false
- }
-
- for _, task := range jobInfo.Job.Spec.Tasks {
- if task.Name == taskName {
- taskReplicas = task.Replicas
- break
- }
- }
- if taskReplicas <= 0 {
- return false
- }
-
- for _, pod := range taskPods {
- if pod.Status.Phase == v1.PodSucceeded {
- completed++
- }
- }
- return completed >= taskReplicas
-}
-
-func (jc *jobCache) TaskFailed(jobKey, taskName string) bool {
- jc.Lock()
- defer jc.Unlock()
-
- var taskReplicas, retried, maxRetry int32
-
- jobInfo, found := jc.jobs[jobKey]
- if !found {
- return false
- }
-
- taskPods, found := jobInfo.Pods[taskName]
-
- if !found || jobInfo.Job == nil {
- return false
- }
-
- for _, task := range jobInfo.Job.Spec.Tasks {
- if task.Name == taskName {
- maxRetry = task.MaxRetry
- taskReplicas = task.Replicas
- break
- }
- }
-
- // maxRetry == -1 means no limit
- if taskReplicas == 0 || maxRetry == -1 {
- return false
- }
-
- // Compatible with existing job
- if maxRetry == 0 {
- maxRetry = 3
- }
-
- for _, pod := range taskPods {
- if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodPending {
- for j := range pod.Status.InitContainerStatuses {
- stat := pod.Status.InitContainerStatuses[j]
- retried += stat.RestartCount
- }
- for j := range pod.Status.ContainerStatuses {
- stat := pod.Status.ContainerStatuses[j]
- retried += stat.RestartCount
- }
- }
- }
- return retried > maxRetry
-}
-
-func (jc *jobCache) worker() {
- for jc.processCleanupJob() {
- }
-}
-
-func (jc *jobCache) processCleanupJob() bool {
- obj, shutdown := jc.deletedJobs.Get()
- if shutdown {
- return false
- }
- defer jc.deletedJobs.Done(obj)
-
- job, ok := obj.(*apis.JobInfo)
- if !ok {
- klog.Errorf("failed to convert %v to *apis.JobInfo", obj)
- return true
- }
-
- jc.Mutex.Lock()
- defer jc.Mutex.Unlock()
-
- if jobTerminated(job) {
- jc.deletedJobs.Forget(obj)
- key := keyFn(job.Namespace, job.Name)
- delete(jc.jobs, key)
- klog.V(3).Infof("Job <%s> was deleted.", key)
- } else {
- // Retry
- jc.deleteJob(job)
- }
- return true
-}
-
-func (jc *jobCache) deleteJob(job *apis.JobInfo) {
- klog.V(3).Infof("Try to delete Job <%v/%v>",
- job.Namespace, job.Name)
-
- jc.deletedJobs.AddRateLimited(job)
-}
-
-
-
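
The job cache above keys jobs by `namespace/name` and routes pods to their owning job through the `v1alpha1.JobNameKey` annotation. A minimal sketch of the Add/AddPod/Get flow, assuming the `Cache` interface returned by `New` exposes the `Add`, `AddPod`, and `Get` methods implemented above; names and annotation values are hypothetical:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
	jobcache "volcano.sh/volcano/pkg/controllers/cache"
)

func main() {
	c := jobcache.New()

	// Register the Job first so later lookups see it as "ready".
	job := &batch.Job{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "demo"}}
	if err := c.Add(job); err != nil {
		panic(err)
	}

	// Pods are attached to their Job via JobNameKey and grouped by task name.
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Namespace: "default",
		Name:      "demo-worker-0",
		Annotations: map[string]string{
			batch.JobNameKey:  "demo",
			batch.TaskSpecKey: "worker",
			batch.JobVersion:  "0",
		},
	}}
	if err := c.AddPod(pod); err != nil {
		panic(err)
	}

	info, err := c.Get(jobcache.JobKeyByName("default", "demo")) // returns a clone
	if err != nil {
		panic(err)
	}
	fmt.Println(len(info.Pods["worker"])) // 1
}
```
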
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package garbagecollector
-
-import (
- "context"
- "fmt"
- "time"
-
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- batchinformers "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
- batchlisters "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/framework"
-)
-
-func init() {
- framework.RegisterController(&gccontroller{})
-}
-
-// gccontroller runs reflectors to watch for changes of managed API
-// objects. Currently it only watches Jobs. Triggered by Job creation
-// and updates, it enqueues Jobs that have non-nil `.spec.ttlSecondsAfterFinished`
-// to the `queue`. The gccontroller has workers who consume `queue`, check whether
-// the Job TTL has expired or not; if the Job TTL hasn't expired, it will add the
-// Job to the queue after the TTL is expected to expire; if the TTL has expired, the
-// worker will send requests to the API server to delete the Jobs accordingly.
-// This is implemented outside of Job controller for separation of concerns, and
-// because it will be extended to handle other finishable resource types.
-type gccontroller struct {
- vcClient vcclientset.Interface
-
- jobInformer batchinformers.JobInformer
-
- // A store of jobs
- jobLister batchlisters.JobLister
- jobSynced func() bool
-
- // queues that need to be updated.
- queue workqueue.RateLimitingInterface
-}
-
-func (gc *gccontroller) Name() string {
- return "gc-controller"
-}
-
-// Initialize creates an instance of gccontroller.
-func (gc *gccontroller) Initialize(opt *framework.ControllerOption) error {
- gc.vcClient = opt.VolcanoClient
- jobInformer := informerfactory.NewSharedInformerFactory(gc.vcClient, 0).Batch().V1alpha1().Jobs()
-
- gc.jobInformer = jobInformer
- gc.jobLister = jobInformer.Lister()
- gc.jobSynced = jobInformer.Informer().HasSynced
- gc.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
-
- jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: gc.addJob,
- UpdateFunc: gc.updateJob,
- })
-
- return nil
-}
-
-// Run starts the worker to clean up Jobs.
-func (gc *gccontroller) Run(stopCh <-chan struct{}) {
- defer gc.queue.ShutDown()
-
- klog.Infof("Starting garbage collector")
- defer klog.Infof("Shutting down garbage collector")
-
- go gc.jobInformer.Informer().Run(stopCh)
- if !cache.WaitForCacheSync(stopCh, gc.jobSynced) {
- return
- }
-
- go wait.Until(gc.worker, time.Second, stopCh)
-
- <-stopCh
-}
-
-func (gc *gccontroller) addJob(obj interface{}) {
- job := obj.(*v1alpha1.Job)
- klog.V(4).Infof("Adding job %s/%s", job.Namespace, job.Name)
-
- if job.DeletionTimestamp == nil && needsCleanup(job) {
- gc.enqueue(job)
- }
-}
-
-func (gc *gccontroller) updateJob(old, cur interface{}) {
- job := cur.(*v1alpha1.Job)
- klog.V(4).Infof("Updating job %s/%s", job.Namespace, job.Name)
-
- if job.DeletionTimestamp == nil && needsCleanup(job) {
- gc.enqueue(job)
- }
-}
-
-func (gc *gccontroller) enqueue(job *v1alpha1.Job) {
- klog.V(4).Infof("Add job %s/%s to cleanup", job.Namespace, job.Name)
- key, err := cache.MetaNamespaceKeyFunc(job)
- if err != nil {
- klog.Errorf("couldn't get key for object %#v: %v", job, err)
- return
- }
-
- gc.queue.Add(key)
-}
-
-func (gc *gccontroller) enqueueAfter(job *v1alpha1.Job, after time.Duration) {
- key, err := cache.MetaNamespaceKeyFunc(job)
- if err != nil {
- klog.Errorf("couldn't get key for object %#v: %v", job, err)
- return
- }
-
- gc.queue.AddAfter(key, after)
-}
-
-func (gc *gccontroller) worker() {
- for gc.processNextWorkItem() {
- }
-}
-
-func (gc *gccontroller) processNextWorkItem() bool {
- key, quit := gc.queue.Get()
- if quit {
- return false
- }
- defer gc.queue.Done(key)
-
- err := gc.processJob(key.(string))
- gc.handleErr(err, key)
-
- return true
-}
-
-func (gc *gccontroller) handleErr(err error, key interface{}) {
- if err == nil {
- gc.queue.Forget(key)
- return
- }
-
- klog.Errorf("error cleaning up Job %v, will retry: %v", key, err)
- gc.queue.AddRateLimited(key)
-}
-
-// processJob will check the Job's state and TTL and delete the Job when it
-// finishes and its TTL after finished has expired. If the Job hasn't finished or
-// its TTL hasn't expired, it will be added to the queue after the TTL is expected
-// to expire.
-// This function is not meant to be invoked concurrently with the same key.
-func (gc *gccontroller) processJob(key string) error {
- namespace, name, err := cache.SplitMetaNamespaceKey(key)
- if err != nil {
- return err
- }
-
- klog.V(4).Infof("Checking if Job %s/%s is ready for cleanup", namespace, name)
- // Ignore the Jobs that are already deleted or being deleted, or the ones that don't need clean up.
- job, err := gc.jobLister.Jobs(namespace).Get(name)
- if errors.IsNotFound(err) {
- return nil
- }
- if err != nil {
- return err
- }
-
- if expired, err := gc.processTTL(job); err != nil {
- return err
- } else if !expired {
- return nil
- }
-
- // The Job's TTL is assumed to have expired, but the Job TTL might be stale.
- // Before deleting the Job, do a final sanity check.
- // If TTL is modified before we do this check, we cannot be sure if the TTL truly expires.
- // The latest Job may have a different UID, but it's fine because the checks will be run again.
- fresh, err := gc.vcClient.BatchV1alpha1().Jobs(namespace).Get(context.TODO(), name, metav1.GetOptions{})
- if errors.IsNotFound(err) {
- return nil
- }
- if err != nil {
- return err
- }
- // Use the latest Job TTL to see if the TTL truly expires.
- if expired, err := gc.processTTL(fresh); err != nil {
- return err
- } else if !expired {
- return nil
- }
- // Cascade deletes the Jobs if TTL truly expires.
- policy := metav1.DeletePropagationForeground
- options := metav1.DeleteOptions{
- PropagationPolicy: &policy,
- Preconditions: &metav1.Preconditions{UID: &fresh.UID},
- }
- klog.V(4).Infof("Cleaning up Job %s/%s", namespace, name)
- return gc.vcClient.BatchV1alpha1().Jobs(fresh.Namespace).Delete(context.TODO(), fresh.Name, options)
-}
-
-// processTTL checks whether a given Job's TTL has expired; if it has not expired yet,
-// the Job is re-enqueued to be checked again once the TTL is expected to expire.
-func (gc *gccontroller) processTTL(job *v1alpha1.Job) (expired bool, err error) {
- // We don't care about the Jobs that are going to be deleted, or the ones that don't need clean up.
- if job.DeletionTimestamp != nil || !needsCleanup(job) {
- return false, nil
- }
-
- now := time.Now()
- t, err := timeLeft(job, &now)
- if err != nil {
- return false, err
- }
-
- // TTL has expired
- if *t <= 0 {
- return true, nil
- }
-
- gc.enqueueAfter(job, *t)
- return false, nil
-}
-
-// needsCleanup checks whether a Job has finished and has a TTL set.
-func needsCleanup(j *v1alpha1.Job) bool {
- return j.Spec.TTLSecondsAfterFinished != nil && isJobFinished(j)
-}
-
-func isJobFinished(job *v1alpha1.Job) bool {
- return job.Status.State.Phase == v1alpha1.Completed ||
- job.Status.State.Phase == v1alpha1.Failed ||
- job.Status.State.Phase == v1alpha1.Terminated
-}
-
-func getFinishAndExpireTime(j *v1alpha1.Job) (*time.Time, *time.Time, error) {
- if !needsCleanup(j) {
- return nil, nil, fmt.Errorf("job %s/%s should not be cleaned up", j.Namespace, j.Name)
- }
- finishAt, err := jobFinishTime(j)
- if err != nil {
- return nil, nil, err
- }
- finishAtUTC := finishAt.UTC()
- expireAtUTC := finishAtUTC.Add(time.Duration(*j.Spec.TTLSecondsAfterFinished) * time.Second)
- return &finishAtUTC, &expireAtUTC, nil
-}
-
-func timeLeft(j *v1alpha1.Job, since *time.Time) (*time.Duration, error) {
- finishAt, expireAt, err := getFinishAndExpireTime(j)
- if err != nil {
- return nil, err
- }
- if finishAt.UTC().After(since.UTC()) {
- klog.Warningf("Warning: Found Job %s/%s finished in the future. This is likely due to time skew in the cluster. Job cleanup will be deferred.", j.Namespace, j.Name)
- }
- remaining := expireAt.UTC().Sub(since.UTC())
- klog.V(4).Infof("Found Job %s/%s finished at %v, remaining TTL %v since %v, TTL will expire at %v", j.Namespace, j.Name, finishAt.UTC(), remaining, since.UTC(), expireAt.UTC())
- return &remaining, nil
-}
-
-// jobFinishTime takes an already finished Job and returns the time it finishes.
-func jobFinishTime(finishedJob *v1alpha1.Job) (metav1.Time, error) {
- if finishedJob.Status.State.LastTransitionTime.IsZero() {
- return metav1.Time{}, fmt.Errorf("unable to find the time when the Job %s/%s finished", finishedJob.Namespace, finishedJob.Name)
- }
- return finishedJob.Status.State.LastTransitionTime, nil
-}
-
-
-
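
The garbage collector above only acts on Jobs that are finished and carry `spec.ttlSecondsAfterFinished`; the expiry time is the last state transition plus the TTL. A minimal sketch of that arithmetic on a hypothetical finished Job (the controller's own `timeLeft` is unexported, so the computation is reproduced inline):

```go
package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
)

func main() {
	ttl := int32(300) // delete the Job five minutes after it finishes

	job := &batch.Job{
		ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "demo"},
		Spec:       batch.JobSpec{TTLSecondsAfterFinished: &ttl},
		Status: batch.JobStatus{State: batch.JobState{
			Phase:              batch.Completed,
			LastTransitionTime: metav1.NewTime(time.Now().Add(-2 * time.Minute)),
		}},
	}

	// Mirrors getFinishAndExpireTime: expireAt = finishAt + TTL.
	finishAt := job.Status.State.LastTransitionTime.Time
	expireAt := finishAt.Add(time.Duration(*job.Spec.TTLSecondsAfterFinished) * time.Second)
	fmt.Println("time left before cleanup:", time.Until(expireAt).Round(time.Second)) // ~3m0s
}
```
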
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package helpers
-
-import (
- "fmt"
- "math/rand"
- "strconv"
- "strings"
- "time"
-
- v1 "k8s.io/api/core/v1"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/apis"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-const (
- // PodNameFmt pod name format
- PodNameFmt = "%s-%s-%d"
- // persistentVolumeClaimFmt represents persistent volume claim name format
- persistentVolumeClaimFmt = "%s-pvc-%s"
-)
-
-// GetPodIndexUnderTask returns task Index.
-func GetPodIndexUnderTask(pod *v1.Pod) string {
- num := strings.Split(pod.Name, "-")
- if len(num) >= 3 {
- return num[len(num)-1]
- }
-
- return ""
-}
-
-// CompareTask compares two tasks by the pod index encoded in their names, falling back to pod creation time.
-func CompareTask(lv, rv *api.TaskInfo) bool {
- lStr := GetPodIndexUnderTask(lv.Pod)
- rStr := GetPodIndexUnderTask(rv.Pod)
- lIndex, lErr := strconv.Atoi(lStr)
- rIndex, rErr := strconv.Atoi(rStr)
- if lErr != nil || rErr != nil || lIndex == rIndex {
- return lv.Pod.CreationTimestamp.Before(&rv.Pod.CreationTimestamp)
- }
- if lIndex > rIndex {
- return false
- }
- return true
-}
-
-// GetTaskKey returns task key/name
-func GetTaskKey(pod *v1.Pod) string {
- if pod.Annotations == nil || pod.Annotations[batch.TaskSpecKey] == "" {
- return batch.DefaultTaskSpec
- }
- return pod.Annotations[batch.TaskSpecKey]
-}
-
-// GetTaskSpec returns task spec
-func GetTaskSpec(job *batch.Job, taskName string) (batch.TaskSpec, bool) {
- for _, ts := range job.Spec.Tasks {
- if ts.Name == taskName {
- return ts, true
- }
- }
- return batch.TaskSpec{}, false
-}
-
-// MakeDomainName creates task domain name
-func MakeDomainName(ts batch.TaskSpec, job *batch.Job, index int) string {
- hostName := ts.Template.Spec.Hostname
- subdomain := ts.Template.Spec.Subdomain
- if len(hostName) == 0 {
- hostName = MakePodName(job.Name, ts.Name, index)
- }
- if len(subdomain) == 0 {
- subdomain = job.Name
- }
- return hostName + "." + subdomain
-}
-
-// MakePodName creates pod name.
-func MakePodName(jobName string, taskName string, index int) string {
- return fmt.Sprintf(PodNameFmt, jobName, taskName, index)
-}
-
-// GenRandomStr generate random str with specified length l.
-func GenRandomStr(l int) string {
- str := "0123456789abcdefghijklmnopqrstuvwxyz"
- bytes := []byte(str)
- var result []byte
- r := rand.New(rand.NewSource(time.Now().UnixNano()))
- for i := 0; i < l; i++ {
- result = append(result, bytes[r.Intn(len(bytes))])
- }
- return string(result)
-}
-
-// GenPVCName generates pvc name with job name.
-func GenPVCName(jobName string) string {
- return fmt.Sprintf(persistentVolumeClaimFmt, jobName, GenRandomStr(12))
-}
-
-// GetJobKeyByReq gets the key for the job request.
-func GetJobKeyByReq(req *apis.Request) string {
- return fmt.Sprintf("%s/%s", req.Namespace, req.JobName)
-}
-
-// GetTasklndexUnderJob returns the index of the task in the job, or -1 if it is not found.
-func GetTasklndexUnderJob(taskName string, job *batch.Job) int {
- for index, task := range job.Spec.Tasks {
- if task.Name == taskName {
- return index
- }
- }
- return -1
-}
-
-// GetPodsNameUnderTask returns the names of all pods in the given task.
-func GetPodsNameUnderTask(taskName string, job *batch.Job) []string {
- var res []string
- for _, task := range job.Spec.Tasks {
- if task.Name == taskName {
- for index := 0; index < int(task.Replicas); index++ {
- res = append(res, MakePodName(job.Name, taskName, index))
- }
- break
- }
- }
- return res
-}
-
-
-
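
The helpers above encode the task index into the pod name (`<job>-<task>-<index>`) and recover it for ordering. A minimal sketch, assuming the `jobhelpers` alias the job controller uses for this package; names are hypothetical:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
)

func main() {
	name := jobhelpers.MakePodName("demo", "worker", 2)
	fmt.Println(name) // demo-worker-2

	// GetPodIndexUnderTask takes the last "-" separated segment of the pod name.
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: name}}
	fmt.Println(jobhelpers.GetPodIndexUnderTask(pod)) // 2
}
```
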
-/*
-Copyright 2017 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
- "hash"
- "hash/fnv"
- "time"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- coreinformers "k8s.io/client-go/informers/core/v1"
- kubeschedulinginformers "k8s.io/client-go/informers/scheduling/v1"
- "k8s.io/client-go/kubernetes"
- corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
- corelisters "k8s.io/client-go/listers/core/v1"
- kubeschedulinglisters "k8s.io/client-go/listers/scheduling/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/tools/record"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- vcscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- batchinformer "volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
- businformer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
- schedulinginformers "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- batchlister "volcano.sh/apis/pkg/client/listers/batch/v1alpha1"
- buslister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
- schedulinglisters "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobcache "volcano.sh/volcano/pkg/controllers/cache"
- "volcano.sh/volcano/pkg/controllers/framework"
- "volcano.sh/volcano/pkg/controllers/job/state"
-)
-
-func init() {
- framework.RegisterController(&jobcontroller{})
-}
-
-// jobcontroller is the controller that manages Volcano Jobs.
-type jobcontroller struct {
- kubeClient kubernetes.Interface
- vcClient vcclientset.Interface
-
- jobInformer batchinformer.JobInformer
- podInformer coreinformers.PodInformer
- pvcInformer coreinformers.PersistentVolumeClaimInformer
- pgInformer schedulinginformers.PodGroupInformer
- svcInformer coreinformers.ServiceInformer
- cmdInformer businformer.CommandInformer
- pcInformer kubeschedulinginformers.PriorityClassInformer
- queueInformer schedulinginformers.QueueInformer
-
- // A store of jobs
- jobLister batchlister.JobLister
- jobSynced func() bool
-
- // A store of pods
- podLister corelisters.PodLister
- podSynced func() bool
-
- pvcLister corelisters.PersistentVolumeClaimLister
- pvcSynced func() bool
-
- // A store of podgroups
- pgLister schedulinglisters.PodGroupLister
- pgSynced func() bool
-
- // A store of service
- svcLister corelisters.ServiceLister
- svcSynced func() bool
-
- cmdLister buslister.CommandLister
- cmdSynced func() bool
-
- pcLister kubeschedulinglisters.PriorityClassLister
- pcSynced func() bool
-
- queueLister schedulinglisters.QueueLister
- queueSynced func() bool
-
- // queue that need to sync up
- queueList []workqueue.RateLimitingInterface
- commandQueue workqueue.RateLimitingInterface
- cache jobcache.Cache
- // Job Event recorder
- recorder record.EventRecorder
-
- errTasks workqueue.RateLimitingInterface
- workers uint32
- maxRequeueNum int
-}
-
-func (cc *jobcontroller) Name() string {
- return "job-controller"
-}
-
-// Initialize creates the new Job controller.
-func (cc *jobcontroller) Initialize(opt *framework.ControllerOption) error {
- cc.kubeClient = opt.KubeClient
- cc.vcClient = opt.VolcanoClient
-
- sharedInformers := opt.SharedInformerFactory
- workers := opt.WorkerNum
- // Initialize event client
- eventBroadcaster := record.NewBroadcaster()
- eventBroadcaster.StartLogging(klog.Infof)
- eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: cc.kubeClient.CoreV1().Events("")})
- recorder := eventBroadcaster.NewRecorder(vcscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
-
- cc.queueList = make([]workqueue.RateLimitingInterface, workers)
- cc.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- cc.cache = jobcache.New()
- cc.errTasks = newRateLimitingQueue()
- cc.recorder = recorder
- cc.workers = workers
- cc.maxRequeueNum = opt.MaxRequeueNum
- if cc.maxRequeueNum < 0 {
- cc.maxRequeueNum = -1
- }
-
- var i uint32
- for i = 0; i < workers; i++ {
- cc.queueList[i] = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- }
-
- cc.jobInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Batch().V1alpha1().Jobs()
- cc.jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: cc.addJob,
- UpdateFunc: cc.updateJob,
- DeleteFunc: cc.deleteJob,
- })
- cc.jobLister = cc.jobInformer.Lister()
- cc.jobSynced = cc.jobInformer.Informer().HasSynced
-
- cc.cmdInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Bus().V1alpha1().Commands()
- cc.cmdInformer.Informer().AddEventHandler(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *busv1alpha1.Command:
- if v.TargetObject != nil &&
- v.TargetObject.APIVersion == batchv1alpha1.SchemeGroupVersion.String() &&
- v.TargetObject.Kind == "Job" {
- return true
- }
-
- return false
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: cc.addCommand,
- },
- },
- )
- cc.cmdLister = cc.cmdInformer.Lister()
- cc.cmdSynced = cc.cmdInformer.Informer().HasSynced
-
- cc.podInformer = sharedInformers.Core().V1().Pods()
- cc.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: cc.addPod,
- UpdateFunc: cc.updatePod,
- DeleteFunc: cc.deletePod,
- })
-
- cc.podLister = cc.podInformer.Lister()
- cc.podSynced = cc.podInformer.Informer().HasSynced
-
- cc.pvcInformer = sharedInformers.Core().V1().PersistentVolumeClaims()
- cc.pvcLister = cc.pvcInformer.Lister()
- cc.pvcSynced = cc.pvcInformer.Informer().HasSynced
-
- cc.svcInformer = sharedInformers.Core().V1().Services()
- cc.svcLister = cc.svcInformer.Lister()
- cc.svcSynced = cc.svcInformer.Informer().HasSynced
-
- cc.pgInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Scheduling().V1beta1().PodGroups()
- cc.pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- UpdateFunc: cc.updatePodGroup,
- })
- cc.pgLister = cc.pgInformer.Lister()
- cc.pgSynced = cc.pgInformer.Informer().HasSynced
-
- cc.pcInformer = sharedInformers.Scheduling().V1().PriorityClasses()
- cc.pcLister = cc.pcInformer.Lister()
- cc.pcSynced = cc.pcInformer.Informer().HasSynced
-
- cc.queueInformer = informerfactory.NewSharedInformerFactory(cc.vcClient, 0).Scheduling().V1beta1().Queues()
- cc.queueLister = cc.queueInformer.Lister()
- cc.queueSynced = cc.queueInformer.Informer().HasSynced
-
- // Register actions
- state.SyncJob = cc.syncJob
- state.KillJob = cc.killJob
-
- return nil
-}
-
-// Run start JobController.
-func (cc *jobcontroller) Run(stopCh <-chan struct{}) {
- go cc.jobInformer.Informer().Run(stopCh)
- go cc.podInformer.Informer().Run(stopCh)
- go cc.pvcInformer.Informer().Run(stopCh)
- go cc.pgInformer.Informer().Run(stopCh)
- go cc.svcInformer.Informer().Run(stopCh)
- go cc.cmdInformer.Informer().Run(stopCh)
- go cc.pcInformer.Informer().Run(stopCh)
- go cc.queueInformer.Informer().Run(stopCh)
-
- cache.WaitForCacheSync(stopCh, cc.jobSynced, cc.podSynced, cc.pgSynced,
- cc.svcSynced, cc.cmdSynced, cc.pvcSynced, cc.pcSynced, cc.queueSynced)
-
- go wait.Until(cc.handleCommands, 0, stopCh)
- var i uint32
- for i = 0; i < cc.workers; i++ {
- go func(num uint32) {
- wait.Until(
- func() {
- cc.worker(num)
- },
- time.Second,
- stopCh)
- }(i)
- }
-
- go cc.cache.Run(stopCh)
-
- // Re-sync error tasks.
- go wait.Until(cc.processResyncTask, 0, stopCh)
-
- klog.Infof("JobController is running ...... ")
-}
-
-func (cc *jobcontroller) worker(i uint32) {
- klog.Infof("worker %d start ...... ", i)
-
- for cc.processNextReq(i) {
- }
-}
-
-func (cc *jobcontroller) belongsToThisRoutine(key string, count uint32) bool {
- var hashVal hash.Hash32
- var val uint32
-
- hashVal = fnv.New32()
- hashVal.Write([]byte(key))
-
- val = hashVal.Sum32()
-
- return val%cc.workers == count
-}
-
-func (cc *jobcontroller) getWorkerQueue(key string) workqueue.RateLimitingInterface {
- var hashVal hash.Hash32
- var val uint32
-
- hashVal = fnv.New32()
- hashVal.Write([]byte(key))
-
- val = hashVal.Sum32()
-
- queue := cc.queueList[val%cc.workers]
-
- return queue
-}
-
-func (cc *jobcontroller) processNextReq(count uint32) bool {
- queue := cc.queueList[count]
- obj, shutdown := queue.Get()
- if shutdown {
-		klog.Errorf("Failed to pop item from queue")
- return false
- }
-
- req := obj.(apis.Request)
- defer queue.Done(req)
-
- key := jobcache.JobKeyByReq(&req)
- if !cc.belongsToThisRoutine(key, count) {
-		klog.Errorf("should not occur: job with key %s does not belong to this worker routine %d, requeueing it to the correct one", key, count)
- queueLocal := cc.getWorkerQueue(key)
- queueLocal.Add(req)
- return true
- }
-
- klog.V(3).Infof("Try to handle request <%v>", req)
-
- jobInfo, err := cc.cache.Get(key)
- if err != nil {
- // TODO(k82cn): ignore not-ready error.
- klog.Errorf("Failed to get job by <%v> from cache: %v", req, err)
- return true
- }
-
- st := state.NewState(jobInfo)
- if st == nil {
- klog.Errorf("Invalid state <%s> of Job <%v/%v>",
- jobInfo.Job.Status.State, jobInfo.Job.Namespace, jobInfo.Job.Name)
- return true
- }
-
- action := applyPolicies(jobInfo.Job, &req)
- klog.V(3).Infof("Execute <%v> on Job <%s/%s> in <%s> by <%T>.",
- action, req.Namespace, req.JobName, jobInfo.Job.Status.State.Phase, st)
-
- if action != busv1alpha1.SyncJobAction {
- cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
- "Start to execute action %s ", action))
- }
-
- if err := st.Execute(action); err != nil {
- if cc.maxRequeueNum == -1 || queue.NumRequeues(req) < cc.maxRequeueNum {
- klog.V(2).Infof("Failed to handle Job <%s/%s>: %v",
- jobInfo.Job.Namespace, jobInfo.Job.Name, err)
- // If any error, requeue it.
- queue.AddRateLimited(req)
- return true
- }
- cc.recordJobEvent(jobInfo.Job.Namespace, jobInfo.Job.Name, batchv1alpha1.ExecuteAction, fmt.Sprintf(
- "Job failed on action %s for retry limit reached", action))
- klog.Warningf("Terminating Job <%s/%s> and releasing resources", jobInfo.Job.Namespace, jobInfo.Job.Name)
- if err = st.Execute(busv1alpha1.TerminateJobAction); err != nil {
- klog.Errorf("Failed to terminate Job<%s/%s>: %v", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
- }
- klog.Warningf("Dropping job<%s/%s> out of the queue: %v because max retries has reached", jobInfo.Job.Namespace, jobInfo.Job.Name, err)
- }
-
- // If no error, forget it.
- queue.Forget(req)
-
- return true
-}
-
-
-
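
`processNextReq` above keeps all events for a given job on the same worker by hashing the job key with FNV-32 and taking it modulo the worker count (`belongsToThisRoutine`/`getWorkerQueue`). A minimal standalone sketch of that routing, with a hypothetical worker count:

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// shardFor mirrors getWorkerQueue: hash the "namespace/jobname" key and take it
// modulo the number of workers, so one job is always handled by one routine.
func shardFor(key string, workers uint32) uint32 {
	h := fnv.New32()
	h.Write([]byte(key))
	return h.Sum32() % workers
}

func main() {
	const workers = 4
	for _, key := range []string{"default/job-a", "default/job-b", "team-x/job-a"} {
		fmt.Printf("%s -> worker %d\n", key, shardFor(key, workers))
	}
}
```
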
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "reflect"
- "sort"
- "sync"
- "sync/atomic"
- "time"
-
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- "volcano.sh/volcano/pkg/controllers/job/state"
-)
-
-var calMutex sync.Mutex
-
-func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.PhaseMap, updateStatus state.UpdateStatusFn) error {
- job := jobInfo.Job
- klog.V(3).Infof("Killing Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version)
- defer klog.V(3).Infof("Finished Job <%s/%s> killing, current version %d", job.Namespace, job.Name, job.Status.Version)
-
- if job.DeletionTimestamp != nil {
- klog.Infof("Job <%s/%s> is terminating, skip management process.",
- job.Namespace, job.Name)
- return nil
- }
-
- var pending, running, terminating, succeeded, failed, unknown int32
- taskStatusCount := make(map[string]batch.TaskState)
-
- var errs []error
- var total int
-
- for _, pods := range jobInfo.Pods {
- for _, pod := range pods {
- total++
-
- if pod.DeletionTimestamp != nil {
- klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
- terminating++
- continue
- }
-
- maxRetry := job.Spec.MaxRetry
- lastRetry := false
- if job.Status.RetryCount >= maxRetry-1 {
- lastRetry = true
- }
-
- // Only retain the Failed and Succeeded pods at the last retry.
- // If it is not the last retry, kill pod as defined in `podRetainPhase`.
- retainPhase := podRetainPhase
- if lastRetry {
- retainPhase = state.PodRetainPhaseSoft
- }
- _, retain := retainPhase[pod.Status.Phase]
-
- if !retain {
- err := cc.deleteJobPod(job.Name, pod)
- if err == nil {
- terminating++
- continue
- }
- // record the err, and then collect the pod info like retained pod
- errs = append(errs, err)
- cc.resyncTask(pod)
- }
-
- classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
- calcPodStatus(pod, taskStatusCount)
- }
- }
-
- if len(errs) != 0 {
- klog.Errorf("failed to kill pods for job %s/%s, with err %+v", job.Namespace, job.Name, errs)
- cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
- fmt.Sprintf("Error deleting pods: %+v", errs))
- return fmt.Errorf("failed to kill %d pods of %d", len(errs), total)
- }
-
- job = job.DeepCopy()
- // Job version is bumped only when job is killed
- job.Status.Version++
- job.Status.Pending = pending
- job.Status.Running = running
- job.Status.Succeeded = succeeded
- job.Status.Failed = failed
- job.Status.Terminating = terminating
- job.Status.Unknown = unknown
- job.Status.TaskStatusCount = taskStatusCount
-
- // Update running duration
- klog.V(3).Infof("Running duration is %s", metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}.ToUnstructured())
- job.Status.RunningDuration = &metav1.Duration{Duration: time.Since(jobInfo.Job.CreationTimestamp.Time)}
-
- if updateStatus != nil {
- if updateStatus(&job.Status) {
- job.Status.State.LastTransitionTime = metav1.Now()
- jobCondition := newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
- job.Status.Conditions = append(job.Status.Conditions, jobCondition)
- }
- }
-
- // must be called before update job status
- if err := cc.pluginOnJobDelete(job); err != nil {
- return err
- }
-
- // Update Job status
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- if e := cc.cache.Update(newJob); e != nil {
- klog.Errorf("KillJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, e)
- return e
- }
-
- // Delete PodGroup
- if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(context.TODO(), job.Name, metav1.DeleteOptions{}); err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- }
-
- // NOTE(k82cn): DO NOT delete input/output until job is deleted.
-
- return nil
-}
-
-func (cc *jobcontroller) initiateJob(job *batch.Job) (*batch.Job, error) {
- klog.V(3).Infof("Starting to initiate Job <%s/%s>", job.Namespace, job.Name)
- jobInstance, err := cc.initJobStatus(job)
- if err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.JobStatusError),
- fmt.Sprintf("Failed to initialize job status, err: %v", err))
- return nil, err
- }
-
- if err := cc.pluginOnJobAdd(jobInstance); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
- fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
- return nil, err
- }
-
- newJob, err := cc.createJobIOIfNotExist(jobInstance)
- if err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PVCError),
- fmt.Sprintf("Failed to create PVC, err: %v", err))
- return nil, err
- }
-
- if err := cc.createOrUpdatePodGroup(newJob); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
- fmt.Sprintf("Failed to create PodGroup, err: %v", err))
- return nil, err
- }
-
- return newJob, nil
-}
-
-func (cc *jobcontroller) initOnJobUpdate(job *batch.Job) error {
- klog.V(3).Infof("Starting to initiate Job <%s/%s> on update", job.Namespace, job.Name)
-
- if err := cc.pluginOnJobUpdate(job); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PluginError),
- fmt.Sprintf("Execute plugin when job add failed, err: %v", err))
- return err
- }
-
- if err := cc.createOrUpdatePodGroup(job); err != nil {
- cc.recorder.Event(job, v1.EventTypeWarning, string(batch.PodGroupError),
- fmt.Sprintf("Failed to create PodGroup, err: %v", err))
- return err
- }
-
- return nil
-}
-
-func (cc *jobcontroller) GetQueueInfo(queue string) (*scheduling.Queue, error) {
- queueInfo, err := cc.queueLister.Get(queue)
- if err != nil {
- klog.Errorf("Failed to get queue from queueLister, error: %s", err.Error())
- }
-
- return queueInfo, err
-}
-
-func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.UpdateStatusFn) error {
- job := jobInfo.Job
- klog.V(3).Infof("Starting to sync up Job <%s/%s>, current version %d", job.Namespace, job.Name, job.Status.Version)
- defer klog.V(3).Infof("Finished Job <%s/%s> sync up, current version %d", job.Namespace, job.Name, job.Status.Version)
-
- if jobInfo.Job.DeletionTimestamp != nil {
- klog.Infof("Job <%s/%s> is terminating, skip management process.",
- jobInfo.Job.Namespace, jobInfo.Job.Name)
- return nil
- }
-
-	// deep copy the job to avoid mutating the cached object
- job = job.DeepCopy()
-
- // Find queue that job belongs to, and check if the queue has forwarding metadata
- queueInfo, err := cc.GetQueueInfo(job.Spec.Queue)
- if err != nil {
- return err
- }
-
- var jobForwarding bool
- if len(queueInfo.Spec.ExtendClusters) != 0 {
- jobForwarding = true
- if len(job.Annotations) == 0 {
- job.Annotations = make(map[string]string)
- }
- job.Annotations[batch.JobForwardingKey] = "true"
- job, err = cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error())
- return err
- }
- }
-
- // Skip job initiation if job is already initiated
- if !isInitiated(job) {
- if job, err = cc.initiateJob(job); err != nil {
- return err
- }
- } else {
- // TODO: optimize this call it only when scale up/down
- if err = cc.initOnJobUpdate(job); err != nil {
- return err
- }
- }
-
- if len(queueInfo.Spec.ExtendClusters) != 0 {
- jobForwarding = true
- job.Annotations[batch.JobForwardingKey] = "true"
- _, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("failed to update job: %s/%s, error: %s", job.Namespace, job.Name, err.Error())
- return err
- }
- }
-
- var syncTask bool
- if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(job.Name); pg != nil {
- if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
- syncTask = true
- }
-
- for _, condition := range pg.Status.Conditions {
- if condition.Type == scheduling.PodGroupUnschedulableType {
- cc.recorder.Eventf(job, v1.EventTypeWarning, string(batch.PodGroupPending),
-				fmt.Sprintf("PodGroup %s:%s is unschedulable, reason: %s", job.Namespace, job.Name, condition.Message))
- }
- }
- }
-
- var jobCondition batch.JobCondition
- if !syncTask {
- if updateStatus != nil {
- if updateStatus(&job.Status) {
- job.Status.State.LastTransitionTime = metav1.Now()
- jobCondition = newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
- job.Status.Conditions = append(job.Status.Conditions, jobCondition)
- }
- }
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- if e := cc.cache.Update(newJob); e != nil {
- klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, e)
- return e
- }
- return nil
- }
-
- var running, pending, terminating, succeeded, failed, unknown int32
- taskStatusCount := make(map[string]batch.TaskState)
-
- podToCreate := make(map[string][]*v1.Pod)
- var podToDelete []*v1.Pod
- var creationErrs []error
- var deletionErrs []error
- appendMutex := sync.Mutex{}
-
- appendError := func(container *[]error, err error) {
- appendMutex.Lock()
- defer appendMutex.Unlock()
- *container = append(*container, err)
- }
-
- waitCreationGroup := sync.WaitGroup{}
-
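-	// For each task, collect the pods that still need to be created and classify the existing ones by phase; leftover pods are marked for deletion.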
- for _, ts := range job.Spec.Tasks {
- ts.Template.Name = ts.Name
- tc := ts.Template.DeepCopy()
- name := ts.Template.Name
-
- pods, found := jobInfo.Pods[name]
- if !found {
- pods = map[string]*v1.Pod{}
- }
-
- var podToCreateEachTask []*v1.Pod
- for i := 0; i < int(ts.Replicas); i++ {
- podName := fmt.Sprintf(jobhelpers.PodNameFmt, job.Name, name, i)
- if pod, found := pods[podName]; !found {
- newPod := createJobPod(job, tc, ts.TopologyPolicy, i, jobForwarding)
- if err := cc.pluginOnPodCreate(job, newPod); err != nil {
- return err
- }
- podToCreateEachTask = append(podToCreateEachTask, newPod)
- waitCreationGroup.Add(1)
- } else {
- delete(pods, podName)
- if pod.DeletionTimestamp != nil {
- klog.Infof("Pod <%s/%s> is terminating", pod.Namespace, pod.Name)
- atomic.AddInt32(&terminating, 1)
- continue
- }
-
- classifyAndAddUpPodBaseOnPhase(pod, &pending, &running, &succeeded, &failed, &unknown)
- calcPodStatus(pod, taskStatusCount)
- }
- }
- podToCreate[ts.Name] = podToCreateEachTask
- for _, pod := range pods {
- podToDelete = append(podToDelete, pod)
- }
- }
-
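-	// Create the missing pods of each task concurrently; a task with DependsOn first waits for its dependencies to become ready.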
- for taskName, podToCreateEachTask := range podToCreate {
- if len(podToCreateEachTask) == 0 {
- continue
- }
- go func(taskName string, podToCreateEachTask []*v1.Pod) {
- taskIndex := jobhelpers.GetTasklndexUnderJob(taskName, job)
- if job.Spec.Tasks[taskIndex].DependsOn != nil {
- cc.waitDependsOnTaskMeetCondition(taskName, taskIndex, podToCreateEachTask, job)
- }
-
- for _, pod := range podToCreateEachTask {
- go func(pod *v1.Pod) {
- defer waitCreationGroup.Done()
- newPod, err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{})
- if err != nil && !apierrors.IsAlreadyExists(err) {
-						// Failed to create the Pod; wait a moment and then create it again.
-						// This is to ensure all pods under the same Job are created,
-						// so gang-scheduling can schedule the Job successfully.
- klog.Errorf("Failed to create pod %s for Job %s, err %#v",
- pod.Name, job.Name, err)
- appendError(&creationErrs, fmt.Errorf("failed to create pod %s, err: %#v", pod.Name, err))
- } else {
- classifyAndAddUpPodBaseOnPhase(newPod, &pending, &running, &succeeded, &failed, &unknown)
- calcPodStatus(pod, taskStatusCount)
- klog.V(5).Infof("Created Task <%s> of Job <%s/%s>",
- pod.Name, job.Namespace, job.Name)
- }
- }(pod)
- }
- }(taskName, podToCreateEachTask)
- }
-
- waitCreationGroup.Wait()
-
- if len(creationErrs) != 0 {
- cc.recorder.Event(job, v1.EventTypeWarning, FailedCreatePodReason,
- fmt.Sprintf("Error creating pods: %+v", creationErrs))
- return fmt.Errorf("failed to create %d pods of %d", len(creationErrs), len(podToCreate))
- }
-
-	// Delete pods when scaling down.
- waitDeletionGroup := sync.WaitGroup{}
- waitDeletionGroup.Add(len(podToDelete))
- for _, pod := range podToDelete {
- go func(pod *v1.Pod) {
- defer waitDeletionGroup.Done()
- err := cc.deleteJobPod(job.Name, pod)
- if err != nil {
-				// Failed to delete the Pod; record the error and resync the task
-				// so that the deletion can be retried later.
- klog.Errorf("Failed to delete pod %s for Job %s, err %#v",
- pod.Name, job.Name, err)
- appendError(&deletionErrs, err)
- cc.resyncTask(pod)
- } else {
- klog.V(3).Infof("Deleted Task <%s> of Job <%s/%s>",
- pod.Name, job.Namespace, job.Name)
- atomic.AddInt32(&terminating, 1)
- }
- }(pod)
- }
- waitDeletionGroup.Wait()
-
- if len(deletionErrs) != 0 {
- cc.recorder.Event(job, v1.EventTypeWarning, FailedDeletePodReason,
- fmt.Sprintf("Error deleting pods: %+v", deletionErrs))
- return fmt.Errorf("failed to delete %d pods of %d", len(deletionErrs), len(podToDelete))
- }
- job.Status = batch.JobStatus{
- State: job.Status.State,
-
- Pending: pending,
- Running: running,
- Succeeded: succeeded,
- Failed: failed,
- Terminating: terminating,
- Unknown: unknown,
- Version: job.Status.Version,
- MinAvailable: job.Spec.MinAvailable,
- TaskStatusCount: taskStatusCount,
- ControlledResources: job.Status.ControlledResources,
- Conditions: job.Status.Conditions,
- RetryCount: job.Status.RetryCount,
- }
-
- if updateStatus != nil {
- if updateStatus(&job.Status) {
- job.Status.State.LastTransitionTime = metav1.Now()
- jobCondition = newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
- job.Status.Conditions = append(job.Status.Conditions, jobCondition)
- }
- }
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return err
- }
- if e := cc.cache.Update(newJob); e != nil {
- klog.Errorf("SyncJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, e)
- return e
- }
-
- return nil
-}
-
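-// waitDependsOnTaskMeetCondition blocks until the depends-on tasks are ready: when the iteration policy is "any", a single ready dependency is enough; otherwise every listed dependency must be ready.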
-func (cc *jobcontroller) waitDependsOnTaskMeetCondition(taskName string, taskIndex int, podToCreateEachTask []*v1.Pod, job *batch.Job) {
- if job.Spec.Tasks[taskIndex].DependsOn != nil {
- dependsOn := *job.Spec.Tasks[taskIndex].DependsOn
- if len(dependsOn.Name) > 1 && dependsOn.Iteration == batch.IterationAny {
- wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) {
- for _, task := range dependsOn.Name {
- if cc.isDependsOnPodsReady(task, job) {
- return true, nil
- }
- }
- return false, nil
- })
- } else {
- for _, dependsOnTask := range dependsOn.Name {
- wait.PollInfinite(detectionPeriodOfDependsOntask, func() (bool, error) {
- if cc.isDependsOnPodsReady(dependsOnTask, job) {
- return true, nil
- }
- return false, nil
- })
- }
- }
- }
-}
-
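-// isDependsOnPodsReady returns true once the number of ready pods of the given task reaches the task's MinAvailable; if MinAvailable is not set, it always returns true.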
-func (cc *jobcontroller) isDependsOnPodsReady(task string, job *batch.Job) bool {
- dependsOnPods := jobhelpers.GetPodsNameUnderTask(task, job)
- dependsOnTaskIndex := jobhelpers.GetTasklndexUnderJob(task, job)
- runningPodCount := 0
- for _, podName := range dependsOnPods {
- pod, err := cc.podLister.Pods(job.Namespace).Get(podName)
- if err != nil {
- klog.Errorf("Failed to get pod %v/%v %v", job.Namespace, podName, err)
- continue
- }
-
- if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodSucceeded {
-			klog.V(5).Infof("Pod %v/%v of the depends-on task is not running", pod.Namespace, pod.Name)
- continue
- }
-
- allContainerReady := true
- for _, containerStatus := range pod.Status.ContainerStatuses {
- if !containerStatus.Ready {
- allContainerReady = false
- break
- }
- }
- if allContainerReady {
- runningPodCount++
- }
- }
- dependsOnTaskMinReplicas := job.Spec.Tasks[dependsOnTaskIndex].MinAvailable
- if dependsOnTaskMinReplicas != nil {
- if runningPodCount < int(*dependsOnTaskMinReplicas) {
-			klog.V(5).Infof("Only %d pods of the depends-on task are running, which is less than its minimum available count", runningPodCount)
- return false
- }
- }
- return true
-}
-
-func (cc *jobcontroller) createJobIOIfNotExist(job *batch.Job) (*batch.Job, error) {
-	// If the PVCs do not exist, create them for the Job.
- var needUpdate bool
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- for index, volume := range job.Spec.Volumes {
- vcName := volume.VolumeClaimName
- if len(vcName) == 0 {
-			// NOTE(k82cn): Ensure generated names are never duplicated.
- for {
- vcName = jobhelpers.GenPVCName(job.Name)
- exist, err := cc.checkPVCExist(job, vcName)
- if err != nil {
- return job, err
- }
- if exist {
- continue
- }
- job.Spec.Volumes[index].VolumeClaimName = vcName
- needUpdate = true
- break
- }
-			// TODO: check that VolumeClaim is set if VolumeClaimName is empty
- if volume.VolumeClaim != nil {
- if err := cc.createPVC(job, vcName, volume.VolumeClaim); err != nil {
- return job, err
- }
- }
- } else {
- exist, err := cc.checkPVCExist(job, vcName)
- if err != nil {
- return job, err
- }
- if !exist {
- return job, fmt.Errorf("pvc %s is not found, the job will be in the Pending state until the PVC is created", vcName)
- }
- }
- job.Status.ControlledResources["volume-pvc-"+vcName] = vcName
- }
- if needUpdate {
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).Update(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update Job %v/%v for volume claim name: %v ",
- job.Namespace, job.Name, err)
- return job, err
- }
-
- newJob.Status = job.Status
- return newJob, err
- }
- return job, nil
-}
-
-func (cc *jobcontroller) checkPVCExist(job *batch.Job, pvc string) (bool, error) {
- if _, err := cc.pvcLister.PersistentVolumeClaims(job.Namespace).Get(pvc); err != nil {
- if apierrors.IsNotFound(err) {
- return false, nil
- }
- klog.V(3).Infof("Failed to get PVC %s for job <%s/%s>: %v",
- pvc, job.Namespace, job.Name, err)
- return false, err
- }
- return true, nil
-}
-
-func (cc *jobcontroller) createPVC(job *batch.Job, vcName string, volumeClaim *v1.PersistentVolumeClaimSpec) error {
- pvc := &v1.PersistentVolumeClaim{
- ObjectMeta: metav1.ObjectMeta{
- Namespace: job.Namespace,
- Name: vcName,
- OwnerReferences: []metav1.OwnerReference{
- *metav1.NewControllerRef(job, helpers.JobKind),
- },
- },
- Spec: *volumeClaim,
- }
-
- klog.V(3).Infof("Try to create PVC: %v", pvc)
-
- if _, e := cc.kubeClient.CoreV1().PersistentVolumeClaims(job.Namespace).Create(context.TODO(), pvc, metav1.CreateOptions{}); e != nil {
- klog.V(3).Infof("Failed to create PVC for Job <%s/%s>: %v",
- job.Namespace, job.Name, e)
- return e
- }
- return nil
-}
-
-func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error {
- // If PodGroup does not exist, create one for Job.
- pg, err := cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
- if err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- return err
- }
-
- minTaskMember := map[string]int32{}
- for _, task := range job.Spec.Tasks {
- if task.MinAvailable != nil {
- minTaskMember[task.Name] = *task.MinAvailable
- } else {
- minTaskMember[task.Name] = task.Replicas
- }
- }
-
- pg := &scheduling.PodGroup{
- ObjectMeta: metav1.ObjectMeta{
- Namespace: job.Namespace,
- Name: job.Name,
- Annotations: job.Annotations,
- Labels: job.Labels,
- OwnerReferences: []metav1.OwnerReference{
- *metav1.NewControllerRef(job, helpers.JobKind),
- },
- },
- Spec: scheduling.PodGroupSpec{
- MinMember: job.Spec.MinAvailable,
- MinTaskMember: minTaskMember,
- Queue: job.Spec.Queue,
- MinResources: cc.calcPGMinResources(job),
- PriorityClassName: job.Spec.PriorityClassName,
- },
- }
-
- if _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Create(context.TODO(), pg, metav1.CreateOptions{}); err != nil {
- if !apierrors.IsAlreadyExists(err) {
- klog.Errorf("Failed to create PodGroup for Job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- return err
- }
- }
- return nil
- }
-
- pgShouldUpdate := false
- if pg.Spec.PriorityClassName != job.Spec.PriorityClassName {
- pg.Spec.PriorityClassName = job.Spec.PriorityClassName
- pgShouldUpdate = true
- }
-
- minResources := cc.calcPGMinResources(job)
- if pg.Spec.MinMember != job.Spec.MinAvailable || !equality.Semantic.DeepEqual(pg.Spec.MinResources, minResources) {
- pg.Spec.MinMember = job.Spec.MinAvailable
- pg.Spec.MinResources = minResources
- pgShouldUpdate = true
- }
-
- if pg.Spec.MinTaskMember == nil {
- pgShouldUpdate = true
- pg.Spec.MinTaskMember = make(map[string]int32)
- }
-
- for _, task := range job.Spec.Tasks {
- if task.MinAvailable == nil {
- continue
- }
-
- if taskMember, ok := pg.Spec.MinTaskMember[task.Name]; !ok {
- pgShouldUpdate = true
- pg.Spec.MinTaskMember[task.Name] = *task.MinAvailable
- } else {
- if taskMember == *task.MinAvailable {
- continue
- }
-
- pgShouldUpdate = true
- pg.Spec.MinTaskMember[task.Name] = *task.MinAvailable
- }
- }
-
- if !pgShouldUpdate {
- return nil
- }
-
- _, err = cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Update(context.TODO(), pg, metav1.UpdateOptions{})
- if err != nil {
- klog.V(3).Infof("Failed to update PodGroup for Job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- }
- return err
-}
-
-func (cc *jobcontroller) deleteJobPod(jobName string, pod *v1.Pod) error {
- err := cc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{})
- if err != nil && !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to delete pod %s/%s for Job %s, err %#v",
- pod.Namespace, pod.Name, jobName, err)
-
- return fmt.Errorf("failed to delete pod %s, err %#v", pod.Name, err)
- }
-
- return nil
-}
-
-func (cc *jobcontroller) calcPGMinResources(job *batch.Job) *v1.ResourceList {
-	// sort tasks by their priorityClass values
- var tasksPriority TasksPriority
- for _, task := range job.Spec.Tasks {
- tp := TaskPriority{0, task}
- pc := task.Template.Spec.PriorityClassName
-
- priorityClass, err := cc.pcLister.Get(pc)
- if err != nil || priorityClass == nil {
- klog.Warningf("Ignore task %s priority class %s: %v", task.Name, pc, err)
- } else {
- tp.priority = priorityClass.Value
- }
-
- tasksPriority = append(tasksPriority, tp)
- }
-
- sort.Sort(tasksPriority)
-
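-	// Accumulate the resource requests (falling back to limits when requests are omitted) of the highest-priority tasks until MinAvailable pods have been counted.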
- minAvailableTasksRes := v1.ResourceList{}
- podCnt := int32(0)
- for _, task := range tasksPriority {
- for i := int32(0); i < task.Replicas; i++ {
- if podCnt >= job.Spec.MinAvailable {
- break
- }
- podCnt++
- for _, c := range task.Template.Spec.Containers {
- addResourceList(minAvailableTasksRes, c.Resources.Requests, c.Resources.Limits)
- }
- }
- }
-
- return &minAvailableTasksRes
-}
-
-func (cc *jobcontroller) initJobStatus(job *batch.Job) (*batch.Job, error) {
- if job.Status.State.Phase != "" {
- return job, nil
- }
-
- job.Status.State.Phase = batch.Pending
- job.Status.State.LastTransitionTime = metav1.Now()
- job.Status.MinAvailable = job.Spec.MinAvailable
- jobCondition := newCondition(job.Status.State.Phase, &job.Status.State.LastTransitionTime)
- job.Status.Conditions = append(job.Status.Conditions, jobCondition)
- newJob, err := cc.vcClient.BatchV1alpha1().Jobs(job.Namespace).UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Job %v/%v: %v",
- job.Namespace, job.Name, err)
- return nil, err
- }
- if err := cc.cache.Update(newJob); err != nil {
- klog.Errorf("CreateJob - Failed to update Job %v/%v in cache: %v",
- newJob.Namespace, newJob.Name, err)
- return nil, err
- }
-
- return newJob, nil
-}
-
-func classifyAndAddUpPodBaseOnPhase(pod *v1.Pod, pending, running, succeeded, failed, unknown *int32) {
- switch pod.Status.Phase {
- case v1.PodPending:
- atomic.AddInt32(pending, 1)
- case v1.PodRunning:
- atomic.AddInt32(running, 1)
- case v1.PodSucceeded:
- atomic.AddInt32(succeeded, 1)
- case v1.PodFailed:
- atomic.AddInt32(failed, 1)
- default:
- atomic.AddInt32(unknown, 1)
- }
-}
-
-func calcPodStatus(pod *v1.Pod, taskStatusCount map[string]batch.TaskState) {
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- return
- }
-
- calMutex.Lock()
- defer calMutex.Unlock()
- if _, ok := taskStatusCount[taskName]; !ok {
- taskStatusCount[taskName] = batch.TaskState{
- Phase: make(map[v1.PodPhase]int32),
- }
- }
-
- switch pod.Status.Phase {
- case v1.PodPending:
- taskStatusCount[taskName].Phase[v1.PodPending]++
- case v1.PodRunning:
- taskStatusCount[taskName].Phase[v1.PodRunning]++
- case v1.PodSucceeded:
- taskStatusCount[taskName].Phase[v1.PodSucceeded]++
- case v1.PodFailed:
- taskStatusCount[taskName].Phase[v1.PodFailed]++
- default:
- taskStatusCount[taskName].Phase[v1.PodUnknown]++
- }
-}
-
-func isInitiated(job *batch.Job) bool {
- if job.Status.State.Phase == "" || job.Status.State.Phase == batch.Pending {
- return false
- }
-
- return true
-}
-
-func newCondition(status batch.JobPhase, lastTransitionTime *metav1.Time) batch.JobCondition {
- return batch.JobCondition{
- Status: status,
- LastTransitionTime: lastTransitionTime,
- }
-}
-
-
-
/*
-Copyright 2017 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "reflect"
- "strconv"
-
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- bus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobcache "volcano.sh/volcano/pkg/controllers/cache"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
-)
-
-func (cc *jobcontroller) addCommand(obj interface{}) {
- cmd, ok := obj.(*bus.Command)
- if !ok {
- klog.Errorf("obj is not Command")
- return
- }
-
- cc.commandQueue.Add(cmd)
-}
-
-func (cc *jobcontroller) addJob(obj interface{}) {
- job, ok := obj.(*batch.Job)
- if !ok {
- klog.Errorf("obj is not Job")
- return
- }
-
- req := apis.Request{
- Namespace: job.Namespace,
- JobName: job.Name,
-
- Event: bus.OutOfSyncEvent,
- }
-
-	// TODO(k82cn): if adding the job fails, the cache should be refreshed
- if err := cc.cache.Add(job); err != nil {
- klog.Errorf("Failed to add job <%s/%s>: %v in cache",
- job.Namespace, job.Name, err)
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) updateJob(oldObj, newObj interface{}) {
- newJob, ok := newObj.(*batch.Job)
- if !ok {
- klog.Errorf("newObj is not Job")
- return
- }
-
- oldJob, ok := oldObj.(*batch.Job)
- if !ok {
-		klog.Errorf("oldObj is not Job")
- return
- }
-
- // No need to update if ResourceVersion is not changed
- if newJob.ResourceVersion == oldJob.ResourceVersion {
- klog.V(6).Infof("No need to update because job is not modified.")
- return
- }
-
- if err := cc.cache.Update(newJob); err != nil {
- klog.Errorf("UpdateJob - Failed to update job <%s/%s>: %v in cache",
- newJob.Namespace, newJob.Name, err)
- }
-
-	// NOTE: Since we only reconcile the job based on its Spec, other attributes are ignored.
-	// The Job status is used internally and is always updated via our controller.
- if equality.Semantic.DeepEqual(newJob.Spec, oldJob.Spec) && newJob.Status.State.Phase == oldJob.Status.State.Phase {
- klog.V(6).Infof("Job update event is ignored since no update in 'Spec'.")
- return
- }
-
- req := apis.Request{
- Namespace: newJob.Namespace,
- JobName: newJob.Name,
- Event: bus.OutOfSyncEvent,
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) deleteJob(obj interface{}) {
- job, ok := obj.(*batch.Job)
- if !ok {
- // If we reached here it means the Job was deleted but its final state is unrecorded.
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v", obj)
- return
- }
- job, ok = tombstone.Obj.(*batch.Job)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a volcano Job: %#v", obj)
- return
- }
- }
-
- if err := cc.cache.Delete(job); err != nil {
- klog.Errorf("Failed to delete job <%s/%s>: %v in cache",
- job.Namespace, job.Name, err)
- }
-}
-
-func (cc *jobcontroller) addPod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", obj)
- return
- }
- // Filter out pods that are not created from volcano job
- if !isControlledBy(pod, helpers.JobKind) {
- return
- }
-
- jobName, found := pod.Annotations[batch.JobNameKey]
- if !found {
- klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- version, found := pod.Annotations[batch.JobVersion]
- if !found {
- klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- dVersion, err := strconv.Atoi(version)
- if err != nil {
- klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- if pod.DeletionTimestamp != nil {
- cc.deletePod(pod)
- return
- }
-
- req := apis.Request{
- Namespace: pod.Namespace,
- JobName: jobName,
-
- Event: bus.OutOfSyncEvent,
- JobVersion: int32(dVersion),
- }
-
- if err := cc.cache.AddPod(pod); err != nil {
- klog.Errorf("Failed to add Pod <%s/%s>: %v to cache",
- pod.Namespace, pod.Name, err)
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) updatePod(oldObj, newObj interface{}) {
- oldPod, ok := oldObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", oldObj)
- return
- }
-
- newPod, ok := newObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", newObj)
- return
- }
-
- // Filter out pods that are not created from volcano job
- if !isControlledBy(newPod, helpers.JobKind) {
- return
- }
-
- if newPod.ResourceVersion == oldPod.ResourceVersion {
- return
- }
-
- if newPod.DeletionTimestamp != nil {
- cc.deletePod(newObj)
- return
- }
-
- taskName, found := newPod.Annotations[batch.TaskSpecKey]
- if !found {
- klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- jobName, found := newPod.Annotations[batch.JobNameKey]
- if !found {
- klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- version, found := newPod.Annotations[batch.JobVersion]
- if !found {
- klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- dVersion, err := strconv.Atoi(version)
- if err != nil {
-		klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
- newPod.Namespace, newPod.Name)
- return
- }
-
- if err := cc.cache.UpdatePod(newPod); err != nil {
- klog.Errorf("Failed to update Pod <%s/%s>: %v in cache",
- newPod.Namespace, newPod.Name, err)
- }
-
- event := bus.OutOfSyncEvent
- var exitCode int32
-
- switch newPod.Status.Phase {
- case v1.PodFailed:
- if oldPod.Status.Phase != v1.PodFailed {
- event = bus.PodFailedEvent
- // TODO: currently only one container pod is supported by volcano
- // Once multi containers pod is supported, update accordingly.
- if len(newPod.Status.ContainerStatuses) > 0 && newPod.Status.ContainerStatuses[0].State.Terminated != nil {
- exitCode = newPod.Status.ContainerStatuses[0].State.Terminated.ExitCode
- }
- }
- case v1.PodSucceeded:
- if oldPod.Status.Phase != v1.PodSucceeded &&
- cc.cache.TaskCompleted(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
- event = bus.TaskCompletedEvent
- }
- case v1.PodPending, v1.PodRunning:
- if cc.cache.TaskFailed(jobcache.JobKeyByName(newPod.Namespace, jobName), taskName) {
- event = bus.TaskFailedEvent
- }
- }
-
- req := apis.Request{
- Namespace: newPod.Namespace,
- JobName: jobName,
- TaskName: taskName,
-
- Event: event,
- ExitCode: exitCode,
- JobVersion: int32(dVersion),
- }
-
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) deletePod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- // If we reached here it means the pod was deleted but its final state is unrecorded.
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v", obj)
- return
- }
- pod, ok = tombstone.Obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a Pod: %#v", obj)
- return
- }
- }
-
- // Filter out pods that are not created from volcano job
- if !isControlledBy(pod, helpers.JobKind) {
- return
- }
-
- taskName, found := pod.Annotations[batch.TaskSpecKey]
- if !found {
- klog.Infof("Failed to find taskName of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- jobName, found := pod.Annotations[batch.JobNameKey]
- if !found {
- klog.Infof("Failed to find jobName of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- version, found := pod.Annotations[batch.JobVersion]
- if !found {
- klog.Infof("Failed to find jobVersion of Pod <%s/%s>, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- dVersion, err := strconv.Atoi(version)
- if err != nil {
- klog.Infof("Failed to convert jobVersion of Pod <%s/%s> into number, skipping",
- pod.Namespace, pod.Name)
- return
- }
-
- req := apis.Request{
- Namespace: pod.Namespace,
- JobName: jobName,
- TaskName: taskName,
-
- Event: bus.PodEvictedEvent,
- JobVersion: int32(dVersion),
- }
-
- if err := cc.cache.DeletePod(pod); err != nil {
- klog.Errorf("Failed to delete Pod <%s/%s>: %v in cache",
- pod.Namespace, pod.Name, err)
- }
-
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-}
-
-func (cc *jobcontroller) recordJobEvent(namespace, name string, event batch.JobEvent, message string) {
- job, err := cc.cache.Get(jobcache.JobKeyByName(namespace, name))
- if err != nil {
- klog.Warningf("Failed to find job in cache when reporting job event <%s/%s>: %v",
- namespace, name, err)
- return
- }
- cc.recorder.Event(job.Job, v1.EventTypeNormal, string(event), message)
-}
-
-func (cc *jobcontroller) handleCommands() {
- for cc.processNextCommand() {
- }
-}
-
-func (cc *jobcontroller) processNextCommand() bool {
- obj, shutdown := cc.commandQueue.Get()
- if shutdown {
- return false
- }
- cmd := obj.(*bus.Command)
- defer cc.commandQueue.Done(cmd)
-
- if err := cc.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{}); err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to delete Command <%s/%s>.", cmd.Namespace, cmd.Name)
- cc.commandQueue.AddRateLimited(cmd)
- }
- return true
- }
- cc.recordJobEvent(cmd.Namespace, cmd.TargetObject.Name,
- batch.CommandIssued,
- fmt.Sprintf(
-			"Start to execute command %s, and clean it up to make sure it is executed no more than once.", cmd.Action))
- req := apis.Request{
- Namespace: cmd.Namespace,
- JobName: cmd.TargetObject.Name,
- Event: bus.CommandIssuedEvent,
- Action: bus.Action(cmd.Action),
- }
-
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
-
- return true
-}
-
-func (cc *jobcontroller) updatePodGroup(oldObj, newObj interface{}) {
- oldPG, ok := oldObj.(*scheduling.PodGroup)
- if !ok {
-		klog.Errorf("Failed to convert %v to PodGroup", oldObj)
- return
- }
-
- newPG, ok := newObj.(*scheduling.PodGroup)
- if !ok {
- klog.Errorf("Failed to convert %v to PodGroup", newObj)
- return
- }
-
- _, err := cc.cache.Get(jobcache.JobKeyByName(newPG.Namespace, newPG.Name))
- if err != nil && newPG.Annotations != nil {
- klog.Warningf(
-			"Failed to find job in cache by PodGroup; this may not be a PodGroup for a volcano job.")
- }
-
- if newPG.Status.Phase != oldPG.Status.Phase {
- req := apis.Request{
- Namespace: newPG.Namespace,
- JobName: newPG.Name,
- }
- switch newPG.Status.Phase {
- case scheduling.PodGroupUnknown:
- req.Event = bus.JobUnknownEvent
- }
- key := jobhelpers.GetJobKeyByReq(&req)
- queue := cc.getWorkerQueue(key)
- queue.Add(req)
- }
-}
-
-// TODO(k82cn): add handler for PodGroup unschedulable event.
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/controllers/job/plugins"
- pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
-)
-
-func (cc *jobcontroller) pluginOnPodCreate(job *batch.Job, pod *v1.Pod) error {
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnPodCreate>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnPodCreate(pod, job); err != nil {
- klog.Errorf("Failed to process on pod create plugin %s, err %v.", name, err)
- return err
- }
- }
- return nil
-}
-
-func (cc *jobcontroller) pluginOnJobAdd(job *batch.Job) error {
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnJobAdd>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnJobAdd(job); err != nil {
- klog.Errorf("Failed to process on job add plugin %s, err %v.", name, err)
- return err
- }
- }
-
- return nil
-}
-
-func (cc *jobcontroller) pluginOnJobDelete(job *batch.Job) error {
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnJobDelete>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnJobDelete(job); err != nil {
- klog.Errorf("failed to process on job delete plugin %s, err %v.", name, err)
- return err
- }
- }
-
- return nil
-}
-
-func (cc *jobcontroller) pluginOnJobUpdate(job *batch.Job) error {
- client := pluginsinterface.PluginClientset{KubeClients: cc.kubeClient}
- if job.Status.ControlledResources == nil {
- job.Status.ControlledResources = make(map[string]string)
- }
- for name, args := range job.Spec.Plugins {
- pb, found := plugins.GetPluginBuilder(name)
- if !found {
- err := fmt.Errorf("failed to get plugin %s", name)
- klog.Error(err)
- return err
- }
- klog.Infof("Starting to execute plugin at <pluginOnJobUpdate>: %s on job: <%s/%s>", name, job.Namespace, job.Name)
- if err := pb(client, args).OnJobUpdate(job); err != nil {
- klog.Errorf("Failed to process on job update plugin %s, err %v.", name, err)
- return err
- }
- }
-
- return nil
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "context"
- "fmt"
- "time"
-
- "golang.org/x/time/rate"
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-)
-
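-// newRateLimitingQueue combines a per-item exponential backoff (5ms up to 180s) with an overall 10 qps / 100 burst limiter, taking the larger delay of the two.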
-func newRateLimitingQueue() workqueue.RateLimitingInterface {
- return workqueue.NewRateLimitingQueue(workqueue.NewMaxOfRateLimiter(
- workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 180*time.Second),
-		// 10 qps, 100 bucket size. This is only for retry speed and it's only the overall factor (not per item).
- &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
- ))
-}
-
-func (cc *jobcontroller) processResyncTask() {
- obj, shutdown := cc.errTasks.Get()
- if shutdown {
- return
- }
-
-	// a task is resynced at most 10 times
- if cc.errTasks.NumRequeues(obj) > 10 {
- cc.errTasks.Forget(obj)
- return
- }
-
- defer cc.errTasks.Done(obj)
-
- task, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("failed to convert %v to *v1.Pod", obj)
- return
- }
-
- if err := cc.syncTask(task); err != nil {
- klog.Errorf("Failed to sync pod <%v/%v>, retry it, err %v", task.Namespace, task.Name, err)
- cc.resyncTask(task)
- }
-}
-
-func (cc *jobcontroller) syncTask(oldTask *v1.Pod) error {
- newPod, err := cc.kubeClient.CoreV1().Pods(oldTask.Namespace).Get(context.TODO(), oldTask.Name, metav1.GetOptions{})
- if err != nil {
- if errors.IsNotFound(err) {
- if err := cc.cache.DeletePod(oldTask); err != nil {
- klog.Errorf("failed to delete cache pod <%v/%v>, err %v.", oldTask.Namespace, oldTask.Name, err)
- return err
- }
- klog.V(3).Infof("Pod <%v/%v> was deleted, removed from cache.", oldTask.Namespace, oldTask.Name)
-
- return nil
- }
- return fmt.Errorf("failed to get Pod <%v/%v>: err %v", oldTask.Namespace, oldTask.Name, err)
- }
-
- return cc.cache.UpdatePod(newPod)
-}
-
-func (cc *jobcontroller) resyncTask(task *v1.Pod) {
- cc.errTasks.AddRateLimited(task)
-}
-
-
-
/*
-Copyright 2017 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package job
-
-import (
- "fmt"
- "time"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime/schema"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
-)
-
-var detectionPeriodOfDependsOntask time.Duration
-
-// MakePodName builds the pod name from the job name, task name and index, and returns it.
-func MakePodName(jobName string, taskName string, index int) string {
- return fmt.Sprintf(jobhelpers.PodNameFmt, jobName, taskName, index)
-}
-
-func createJobPod(job *batch.Job, template *v1.PodTemplateSpec, topologyPolicy batch.NumaPolicy, ix int, jobForwarding bool) *v1.Pod {
- templateCopy := template.DeepCopy()
-
- pod := &v1.Pod{
- ObjectMeta: metav1.ObjectMeta{
- Name: jobhelpers.MakePodName(job.Name, template.Name, ix),
- Namespace: job.Namespace,
- OwnerReferences: []metav1.OwnerReference{
- *metav1.NewControllerRef(job, helpers.JobKind),
- },
- Labels: templateCopy.Labels,
- Annotations: templateCopy.Annotations,
- },
- Spec: templateCopy.Spec,
- }
-
- // If no scheduler name in Pod, use scheduler name from Job.
- if len(pod.Spec.SchedulerName) == 0 {
- pod.Spec.SchedulerName = job.Spec.SchedulerName
- }
-
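-	// Mount each job-level volume claim into the pod and into every container, skipping duplicate claim names.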
- volumeMap := make(map[string]string)
- for _, volume := range job.Spec.Volumes {
- vcName := volume.VolumeClaimName
- name := fmt.Sprintf("%s-%s", job.Name, jobhelpers.GenRandomStr(12))
- if _, ok := volumeMap[vcName]; !ok {
- volume := v1.Volume{
- Name: name,
- VolumeSource: v1.VolumeSource{
- PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
- ClaimName: vcName,
- },
- },
- }
- pod.Spec.Volumes = append(pod.Spec.Volumes, volume)
- volumeMap[vcName] = name
- } else {
-			// duplicate volume claim names are skipped
- continue
- }
-
- for i, c := range pod.Spec.Containers {
- vm := v1.VolumeMount{
- MountPath: volume.MountPath,
- Name: name,
- }
- pod.Spec.Containers[i].VolumeMounts = append(c.VolumeMounts, vm)
- }
- }
-
- tsKey := templateCopy.Name
- if len(tsKey) == 0 {
- tsKey = batch.DefaultTaskSpec
- }
-
- if len(pod.Annotations) == 0 {
- pod.Annotations = make(map[string]string)
- }
-
- pod.Annotations[batch.TaskSpecKey] = tsKey
- pod.Annotations[schedulingv2.KubeGroupNameAnnotationKey] = job.Name
- pod.Annotations[batch.JobNameKey] = job.Name
- pod.Annotations[batch.QueueNameKey] = job.Spec.Queue
- pod.Annotations[batch.JobVersion] = fmt.Sprintf("%d", job.Status.Version)
- pod.Annotations[batch.PodTemplateKey] = fmt.Sprintf("%s-%s", job.Name, template.Name)
-
- if topologyPolicy != "" {
- pod.Annotations[schedulingv2.NumaPolicyKey] = string(topologyPolicy)
- }
-
- if len(job.Annotations) > 0 {
- if value, found := job.Annotations[schedulingv2.PodPreemptable]; found {
- pod.Annotations[schedulingv2.PodPreemptable] = value
- }
- if value, found := job.Annotations[schedulingv2.RevocableZone]; found {
- pod.Annotations[schedulingv2.RevocableZone] = value
- }
-
- if value, found := job.Annotations[schedulingv2.JDBMinAvailable]; found {
- pod.Annotations[schedulingv2.JDBMinAvailable] = value
- } else if value, found := job.Annotations[schedulingv2.JDBMaxUnavailable]; found {
- pod.Annotations[schedulingv2.JDBMaxUnavailable] = value
- }
- }
-
- if len(pod.Labels) == 0 {
- pod.Labels = make(map[string]string)
- }
-
- // Set pod labels for Service.
- pod.Labels[batch.JobNameKey] = job.Name
- pod.Labels[batch.TaskSpecKey] = tsKey
- pod.Labels[batch.JobNamespaceKey] = job.Namespace
- pod.Labels[batch.QueueNameKey] = job.Spec.Queue
- if len(job.Labels) > 0 {
- if value, found := job.Labels[schedulingv2.PodPreemptable]; found {
- pod.Labels[schedulingv2.PodPreemptable] = value
- }
- }
-
- if jobForwarding {
- pod.Annotations[batch.JobForwardingKey] = "true"
- pod.Labels[batch.JobForwardingKey] = "true"
- }
-
- return pod
-}
-
-func applyPolicies(job *batch.Job, req *apis.Request) v1alpha1.Action {
- if len(req.Action) != 0 {
- return req.Action
- }
-
- if req.Event == v1alpha1.OutOfSyncEvent {
- return v1alpha1.SyncJobAction
- }
-
-	// All requests triggered from discarded job resources will perform the sync action instead.
- if req.JobVersion < job.Status.Version {
- klog.Infof("Request %s is outdated, will perform sync instead.", req)
- return v1alpha1.SyncJobAction
- }
-
- // Overwrite Job level policies
- if len(req.TaskName) != 0 {
- // Parse task level policies
- for _, task := range job.Spec.Tasks {
- if task.Name == req.TaskName {
- for _, policy := range task.Policies {
- policyEvents := getEventlist(policy)
-
- if len(policyEvents) > 0 && len(req.Event) > 0 {
- if checkEventExist(policyEvents, req.Event) || checkEventExist(policyEvents, v1alpha1.AnyEvent) {
- return policy.Action
- }
- }
-
-					// 0 is not an error code; it is rejected by the validating admission controller
- if policy.ExitCode != nil && *policy.ExitCode == req.ExitCode {
- return policy.Action
- }
- }
- break
- }
- }
- }
-
- // Parse Job level policies
- for _, policy := range job.Spec.Policies {
- policyEvents := getEventlist(policy)
-
- if len(policyEvents) > 0 && len(req.Event) > 0 {
- if checkEventExist(policyEvents, req.Event) || checkEventExist(policyEvents, v1alpha1.AnyEvent) {
- return policy.Action
- }
- }
-
-		// 0 is not an error code; it is rejected by the validating admission controller
- if policy.ExitCode != nil && *policy.ExitCode == req.ExitCode {
- return policy.Action
- }
- }
-
- return v1alpha1.SyncJobAction
-}
-
-func getEventlist(policy batch.LifecyclePolicy) []v1alpha1.Event {
- policyEventsList := policy.Events
- if len(policy.Event) > 0 {
- policyEventsList = append(policyEventsList, policy.Event)
- }
- return policyEventsList
-}
-
-func checkEventExist(policyEvents []v1alpha1.Event, reqEvent v1alpha1.Event) bool {
- for _, event := range policyEvents {
- if event == reqEvent {
- return true
- }
- }
- return false
-}
-
-func addResourceList(list, req, limit v1.ResourceList) {
- for name, quantity := range req {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-
- if req != nil {
- return
- }
-
- // If Requests is omitted for a container,
- // it defaults to Limits if that is explicitly specified.
- for name, quantity := range limit {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-}
-
-// TaskPriority structure.
-type TaskPriority struct {
- priority int32
-
- batch.TaskSpec
-}
-
-// TasksPriority is a slice of TaskPriority.
-type TasksPriority []TaskPriority
-
-func (p TasksPriority) Len() int { return len(p) }
-
-func (p TasksPriority) Less(i, j int) bool {
- return p[i].priority > p[j].priority
-}
-
-func (p TasksPriority) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
-
-func isControlledBy(obj metav1.Object, gvk schema.GroupVersionKind) bool {
- controllerRef := metav1.GetControllerOf(obj)
- if controllerRef == nil {
- return false
- }
- if controllerRef.APIVersion == gvk.GroupVersion().String() && controllerRef.Kind == gvk.Kind {
- return true
- }
- return false
-}
-
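-// SetDetectionPeriodOfDependsOntask sets the polling period used while waiting for depends-on tasks to become ready.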
-func SetDetectionPeriodOfDependsOntask(period time.Duration) {
- detectionPeriodOfDependsOntask = period
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tensorflow
-
-import (
- "encoding/json"
- "flag"
- "fmt"
- "strconv"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
-)
-
-const (
- DefaultPort = 2222
- TFConfig = "TF_CONFIG"
-)
-
-type tensorflowPlugin struct {
- tfArguments []string
- Clientset pluginsinterface.PluginClientset
- psName string
- workerName string
- chiefName string
- evaluatorName string
- port int
-}
-
-// New creates tensorflow plugin.
-func New(client pluginsinterface.PluginClientset, arguments []string) pluginsinterface.PluginInterface {
- tp := tensorflowPlugin{tfArguments: arguments, Clientset: client}
- tp.addFlags()
- return &tp
-}
-
-func (tp *tensorflowPlugin) addFlags() {
- flagSet := flag.NewFlagSet(tp.Name(), flag.ContinueOnError)
- flagSet.StringVar(&tp.psName, "ps", "ps", "name of ps role task")
-	flagSet.StringVar(&tp.workerName, "worker", "worker", "name of worker role task")
- flagSet.StringVar(&tp.chiefName, "chief", "chief", "name of chief role task")
- flagSet.StringVar(&tp.evaluatorName, "evaluator", "evaluator", "name of evaluator role task")
- flagSet.IntVar(&tp.port, "port", DefaultPort, "service port")
- if err := flagSet.Parse(tp.tfArguments); err != nil {
- klog.Errorf("plugin %s flagset parse failed, err: %v", tp.Name(), err)
- }
-}
-
-func (tp *tensorflowPlugin) Name() string {
- return "tensorflow"
-}
-
-func (tp *tensorflowPlugin) OnPodCreate(pod *v1.Pod, job *batch.Job) error {
- // No need to generate TF_CONFIG for stand-alone tensorflow job
- if len(job.Spec.Tasks) == 1 && job.Spec.Tasks[0].Replicas == 1 {
- return nil
- }
- // Generate TF_CONFIG value
- spec, err := tp.generateTFClusterSpec(pod, job)
- if err != nil {
- return err
- }
- raw, err := json.Marshal(spec)
- if err != nil {
- return err
- }
-
-	// Add TF_CONFIG environment variables
- for i := range pod.Spec.Containers {
- pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, v1.EnvVar{
- Name: TFConfig,
- Value: string(raw),
- })
- }
- return nil
-}
-
-func (tp *tensorflowPlugin) OnJobAdd(job *batch.Job) error {
- if job.Status.ControlledResources["plugin-"+tp.Name()] == tp.Name() {
- return nil
- }
-
- job.Status.ControlledResources["plugin-"+tp.Name()] = tp.Name()
-
- return nil
-}
-
-func (tp *tensorflowPlugin) OnJobDelete(job *batch.Job) error {
- if job.Status.ControlledResources["plugin-"+tp.Name()] != tp.Name() {
- return nil
- }
- delete(job.Status.ControlledResources, "plugin-"+tp.Name())
- return nil
-}
-
-func (tp *tensorflowPlugin) OnJobUpdate(job *batch.Job) error {
- return nil
-}
-
-func (tp *tensorflowPlugin) generateTFClusterSpec(pod *v1.Pod, job *batch.Job) (tfClusterSpec, error) {
- index, err := strconv.Atoi(jobhelpers.GetPodIndexUnderTask(pod))
- if err != nil {
- return tfClusterSpec{}, err
- }
-
- // Generate tensorflow task info
- c := tfClusterSpec{
- Task: taskInfo{
- Type: tp.getTaskType(jobhelpers.GetTaskKey(pod)),
- Index: index,
- },
- }
-
- // Generate tensorflow cluster info
- for _, ts := range job.Spec.Tasks {
- hosts := []string{}
- for i := 0; i < int(ts.Replicas); i++ {
- hosts = append(hosts, fmt.Sprintf("%s:%d", jobhelpers.MakeDomainName(ts, job, i), tp.port))
- }
- switch ts.Name {
- case tp.psName:
- c.Cluster.PS = hosts
- case tp.workerName:
- c.Cluster.Worker = hosts
- case tp.chiefName:
- c.Cluster.Chief = hosts
- case tp.evaluatorName:
- c.Cluster.Evaluator = hosts
- }
- }
- return c, nil
-}
-
-func (tp *tensorflowPlugin) getTaskType(taskKey string) tfTaskType {
- switch taskKey {
- case tp.chiefName:
- return tfChief
- case tp.workerName:
- return tfWorker
- case tp.psName:
- return tfPS
- case tp.evaluatorName:
- return tfEvaluator
- }
- return tfTaskType(taskKey)
-}
-
-// tfClusterSpec is the spec of a tensorflow cluster.
-// It will be injected into the containers' environment variables and used by the tensorflow framework.
-// e.g.
-// {
-// "cluster": {
-// "worker": ["worker-0:2222", "worker-1:2222"],
-// "ps": ["ps-0:2222"]
-// },
-// "task": {
-// "type": "worker",
-// "index": 0
-// }
-// }
-type tfClusterSpec struct {
- Cluster clusterInfo `json:"cluster"`
- Task taskInfo `json:"task"`
-}
-
-type clusterInfo struct {
- PS []string `json:"ps,omitempty"`
- Worker []string `json:"worker,omitempty"`
- Chief []string `json:"chief,omitempty"`
- Evaluator []string `json:"evaluator,omitempty"`
-}
-
-type tfTaskType string
-
-const (
- tfWorker tfTaskType = "worker"
- tfChief tfTaskType = "chief"
- tfPS tfTaskType = "ps"
- tfEvaluator tfTaskType = "evaluator"
-)
-
-type taskInfo struct {
- Type tfTaskType `json:"type"`
- Index int `json:"index"`
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package ssh
-
-import (
- "crypto/rand"
- "crypto/rsa"
- "crypto/x509"
- "encoding/pem"
- "flag"
- "fmt"
-
- "golang.org/x/crypto/ssh"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/helpers"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- pluginsinterface "volcano.sh/volcano/pkg/controllers/job/plugins/interface"
-)
-
-type sshPlugin struct {
- // Arguments given for the plugin
- pluginArguments []string
-
- client pluginsinterface.PluginClientset
-
- // flag parse args
- sshKeyFilePath string
-
- // private key string
- sshPrivateKey string
-
- // public key string
- sshPublicKey string
-}
-
-// New creates ssh plugin
-func New(client pluginsinterface.PluginClientset, arguments []string) pluginsinterface.PluginInterface {
- p := sshPlugin{
- pluginArguments: arguments,
- client: client,
- sshKeyFilePath: SSHAbsolutePath,
- }
-
- p.addFlags()
-
- return &p
-}
-
-func (sp *sshPlugin) Name() string {
- return "ssh"
-}
-
-func (sp *sshPlugin) OnPodCreate(pod *v1.Pod, job *batch.Job) error {
- sp.mountRsaKey(pod, job)
-
- return nil
-}
-
-func (sp *sshPlugin) OnJobAdd(job *batch.Job) error {
- if job.Status.ControlledResources["plugin-"+sp.Name()] == sp.Name() {
- return nil
- }
-
- var data map[string][]byte
- var err error
- if len(sp.sshPrivateKey) > 0 {
- data, err = withUserProvidedRsaKey(job, sp.sshPrivateKey, sp.sshPublicKey)
- } else {
- data, err = generateRsaKey(job)
- }
- if err != nil {
- return err
- }
-
- if err := helpers.CreateOrUpdateSecret(job, sp.client.KubeClients, data, sp.secretName(job)); err != nil {
- return fmt.Errorf("create secret for job <%s/%s> with ssh plugin failed for %v",
- job.Namespace, job.Name, err)
- }
-
- job.Status.ControlledResources["plugin-"+sp.Name()] = sp.Name()
-
- return nil
-}
-
-func (sp *sshPlugin) OnJobDelete(job *batch.Job) error {
- if job.Status.ControlledResources["plugin-"+sp.Name()] != sp.Name() {
- return nil
- }
- if err := helpers.DeleteSecret(job, sp.client.KubeClients, sp.secretName(job)); err != nil {
- return err
- }
- delete(job.Status.ControlledResources, "plugin-"+sp.Name())
-
- return nil
-}
-
-// TODO: currently a container using a Secret as a subPath volume mount will not receive Secret updates.
-// we may not update the job secret due to the above reason now.
-// related issue: https://github.com/volcano-sh/volcano/issues/1420
-func (sp *sshPlugin) OnJobUpdate(job *batch.Job) error {
- //data, err := generateRsaKey(job)
- //if err != nil {
- // return err
- //}
- //
- //if err := helpers.CreateOrUpdateSecret(job, sp.client.KubeClients, data, sp.secretName(job)); err != nil {
- // return fmt.Errorf("update secret for job <%s/%s> with ssh plugin failed for %v",
- // job.Namespace, job.Name, err)
- //}
-
- return nil
-}
-
-func (sp *sshPlugin) mountRsaKey(pod *v1.Pod, job *batch.Job) {
- secretName := sp.secretName(job)
-
- sshVolume := v1.Volume{
- Name: secretName,
- }
-
- var mode int32 = 0600
- sshVolume.Secret = &v1.SecretVolumeSource{
- SecretName: secretName,
- Items: []v1.KeyToPath{
- {
- Key: SSHPrivateKey,
- Path: SSHRelativePath + "/" + SSHPrivateKey,
- },
- {
- Key: SSHPublicKey,
- Path: SSHRelativePath + "/" + SSHPublicKey,
- },
- {
- Key: SSHAuthorizedKeys,
- Path: SSHRelativePath + "/" + SSHAuthorizedKeys,
- },
- {
- Key: SSHConfig,
- Path: SSHRelativePath + "/" + SSHConfig,
- },
- },
- DefaultMode: &mode,
- }
-
- if sp.sshKeyFilePath != SSHAbsolutePath {
- var noRootMode int32 = 0600
- sshVolume.Secret.DefaultMode = &noRootMode
- }
-
- pod.Spec.Volumes = append(pod.Spec.Volumes, sshVolume)
-
- for i, c := range pod.Spec.Containers {
- vm := v1.VolumeMount{
- MountPath: sp.sshKeyFilePath,
- SubPath: SSHRelativePath,
- Name: secretName,
- }
-
- pod.Spec.Containers[i].VolumeMounts = append(c.VolumeMounts, vm)
- }
- for i, c := range pod.Spec.InitContainers {
- vm := v1.VolumeMount{
- MountPath: sp.sshKeyFilePath,
- SubPath: SSHRelativePath,
- Name: secretName,
- }
-
- pod.Spec.InitContainers[i].VolumeMounts = append(c.VolumeMounts, vm)
- }
-}
-
-func generateRsaKey(job *batch.Job) (map[string][]byte, error) {
- bitSize := 2048
-
- privateKey, err := rsa.GenerateKey(rand.Reader, bitSize)
- if err != nil {
- klog.Errorf("rsa generateKey err: %v", err)
- return nil, err
- }
-
- // id_rsa
- privBlock := pem.Block{
- Type: "RSA PRIVATE KEY",
- Bytes: x509.MarshalPKCS1PrivateKey(privateKey),
- }
- privateKeyBytes := pem.EncodeToMemory(&privBlock)
-
- // id_rsa.pub
- publicRsaKey, err := ssh.NewPublicKey(&privateKey.PublicKey)
- if err != nil {
- klog.Errorf("ssh newPublicKey err: %v", err)
- return nil, err
- }
- publicKeyBytes := ssh.MarshalAuthorizedKey(publicRsaKey)
-
- data := make(map[string][]byte)
- data[SSHPrivateKey] = privateKeyBytes
- data[SSHPublicKey] = publicKeyBytes
- data[SSHAuthorizedKeys] = publicKeyBytes
- data[SSHConfig] = []byte(generateSSHConfig(job))
-
- return data, nil
-}
-
-func withUserProvidedRsaKey(job *batch.Job, sshPrivateKey string, sshPublicKey string) (map[string][]byte, error) {
- data := make(map[string][]byte)
- data[SSHPrivateKey] = []byte(sshPrivateKey)
- data[SSHPublicKey] = []byte(sshPublicKey)
- data[SSHAuthorizedKeys] = []byte(sshPublicKey)
- data[SSHConfig] = []byte(generateSSHConfig(job))
-
- return data, nil
-}
-
-func (sp *sshPlugin) secretName(job *batch.Job) string {
- return fmt.Sprintf("%s-%s", job.Name, sp.Name())
-}
-
-func (sp *sshPlugin) addFlags() {
- flagSet := flag.NewFlagSet(sp.Name(), flag.ContinueOnError)
- flagSet.StringVar(&sp.sshKeyFilePath, "ssh-key-file-path", sp.sshKeyFilePath, "The path used to store "+
- "ssh private and public keys, it is `/root/.ssh` by default.")
- flagSet.StringVar(&sp.sshPrivateKey, "ssh-private-key", sp.sshPrivateKey, "The input string of the private key")
- flagSet.StringVar(&sp.sshPublicKey, "ssh-public-key", sp.sshPublicKey, "The input string of the public key")
-
- if err := flagSet.Parse(sp.pluginArguments); err != nil {
- klog.Errorf("plugin %s flagset parse failed, err: %v", sp.Name(), err)
- }
-}
-
-func generateSSHConfig(job *batch.Job) string {
- config := "StrictHostKeyChecking no\nUserKnownHostsFile /dev/null\n"
-
- for _, ts := range job.Spec.Tasks {
- for i := 0; i < int(ts.Replicas); i++ {
- hostName := ts.Template.Spec.Hostname
- subdomain := ts.Template.Spec.Subdomain
- if len(hostName) == 0 {
- hostName = jobhelpers.MakePodName(job.Name, ts.Name, i)
- }
- if len(subdomain) == 0 {
- subdomain = job.Name
- }
-
- config += "Host " + hostName + "\n"
- config += " HostName " + hostName + "." + subdomain + "\n"
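-			// When the hostname is fixed in the pod template, all replicas of the task share it, so one entry is enough.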
- if len(ts.Template.Spec.Hostname) != 0 {
- break
- }
- }
- }
-
- return config
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package podgroup
-
-import (
- "k8s.io/apimachinery/pkg/util/wait"
- coreinformers "k8s.io/client-go/informers/core/v1"
- "k8s.io/client-go/kubernetes"
- corelisters "k8s.io/client-go/listers/core/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/framework"
-)
-
-func init() {
- framework.RegisterController(&pgcontroller{})
-}
-
-// pgcontroller is the podgroup controller type.
-type pgcontroller struct {
- kubeClient kubernetes.Interface
- vcClient vcclientset.Interface
-
- podInformer coreinformers.PodInformer
- pgInformer schedulinginformer.PodGroupInformer
-
- // A store of pods
- podLister corelisters.PodLister
- podSynced func() bool
-
- // A store of podgroups
- pgLister schedulinglister.PodGroupLister
- pgSynced func() bool
-
- queue workqueue.RateLimitingInterface
-
- schedulerNames []string
-}
-
-func (pg *pgcontroller) Name() string {
- return "pg-controller"
-}
-
-// Initialize creates a new Podgroup Controller.
-func (pg *pgcontroller) Initialize(opt *framework.ControllerOption) error {
- pg.kubeClient = opt.KubeClient
- pg.vcClient = opt.VolcanoClient
-
- pg.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
-
- pg.schedulerNames = make([]string, len(opt.SchedulerNames))
- copy(pg.schedulerNames, opt.SchedulerNames)
-
- pg.podInformer = opt.SharedInformerFactory.Core().V1().Pods()
- pg.podLister = pg.podInformer.Lister()
- pg.podSynced = pg.podInformer.Informer().HasSynced
- pg.podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: pg.addPod,
- })
-
- pg.pgInformer = informerfactory.NewSharedInformerFactory(pg.vcClient, 0).Scheduling().V1beta1().PodGroups()
- pg.pgLister = pg.pgInformer.Lister()
- pg.pgSynced = pg.pgInformer.Informer().HasSynced
-
- return nil
-}
-
-// Run starts the PodGroup controller.
-func (pg *pgcontroller) Run(stopCh <-chan struct{}) {
- go pg.podInformer.Informer().Run(stopCh)
- go pg.pgInformer.Informer().Run(stopCh)
-
- cache.WaitForCacheSync(stopCh, pg.podSynced, pg.pgSynced)
-
- go wait.Until(pg.worker, 0, stopCh)
-
- klog.Infof("PodgroupController is running ...... ")
-}
-
-func (pg *pgcontroller) worker() {
- for pg.processNextReq() {
- }
-}
-
-func (pg *pgcontroller) processNextReq() bool {
- obj, shutdown := pg.queue.Get()
- if shutdown {
-		klog.Errorf("Failed to pop item from queue")
- return false
- }
-
- req := obj.(podRequest)
- defer pg.queue.Done(req)
-
- pod, err := pg.podLister.Pods(req.podNamespace).Get(req.podName)
- if err != nil {
- klog.Errorf("Failed to get pod by <%v> from cache: %v", req, err)
- return true
- }
-
- if !contains(pg.schedulerNames, pod.Spec.SchedulerName) {
-		klog.V(5).Infof("pod %v/%v schedulerName does not match, skip it", pod.Namespace, pod.Name)
- return true
- }
-
- if pod.Annotations != nil && pod.Annotations[scheduling.KubeGroupNameAnnotationKey] != "" {
-		klog.V(5).Infof("pod %v/%v already has a podgroup", pod.Namespace, pod.Name)
- return true
- }
-
-	// A normal pod scheduled by volcano: make sure it has a podgroup.
- if err := pg.createNormalPodPGIfNotExist(pod); err != nil {
- klog.Errorf("Failed to handle Pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
- pg.queue.AddRateLimited(req)
- return true
- }
-
- // If no error, forget it.
- pg.queue.Forget(req)
-
- return true
-}
-
-func contains(slice []string, element string) bool {
- for _, item := range slice {
- if item == element {
- return true
- }
- }
- return false
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package podgroup
-
-import (
- "context"
-
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime/schema"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/helpers"
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-type podRequest struct {
- podName string
- podNamespace string
-}
-
-func (pg *pgcontroller) addPod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Failed to convert %v to v1.Pod", obj)
- return
- }
-
- req := podRequest{
- podName: pod.Name,
- podNamespace: pod.Namespace,
- }
-
- pg.queue.Add(req)
-}
-
-func (pg *pgcontroller) updatePodAnnotations(pod *v1.Pod, pgName string) error {
- if pod.Annotations == nil {
- pod.Annotations = make(map[string]string)
- }
- if pod.Annotations[scheduling.KubeGroupNameAnnotationKey] == "" {
- pod.Annotations[scheduling.KubeGroupNameAnnotationKey] = pgName
- } else {
- if pod.Annotations[scheduling.KubeGroupNameAnnotationKey] != pgName {
- klog.Errorf("normal pod %s/%s annotations %s value is not %s, but %s", pod.Namespace, pod.Name,
- scheduling.KubeGroupNameAnnotationKey, pgName, pod.Annotations[scheduling.KubeGroupNameAnnotationKey])
- }
- return nil
- }
-
- if _, err := pg.kubeClient.CoreV1().Pods(pod.Namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
- klog.Errorf("Failed to update pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
- return err
- }
-
- return nil
-}
-
-func (pg *pgcontroller) createNormalPodPGIfNotExist(pod *v1.Pod) error {
- pgName := helpers.GeneratePodgroupName(pod)
-
- if _, err := pg.pgLister.PodGroups(pod.Namespace).Get(pgName); err != nil {
- if !apierrors.IsNotFound(err) {
- klog.Errorf("Failed to get normal PodGroup for Pod <%s/%s>: %v",
- pod.Namespace, pod.Name, err)
- return err
- }
-
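-		// The PodGroup does not exist yet: create a single-member PodGroup that
-		// inherits the pod's priority class, queue and preemption-related annotations.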
- obj := &scheduling.PodGroup{
- ObjectMeta: metav1.ObjectMeta{
- Namespace: pod.Namespace,
- Name: pgName,
- OwnerReferences: newPGOwnerReferences(pod),
- Annotations: map[string]string{},
- Labels: map[string]string{},
- },
- Spec: scheduling.PodGroupSpec{
- MinMember: 1,
- PriorityClassName: pod.Spec.PriorityClassName,
- MinResources: calcPGMinResources(pod),
- },
- }
- if queueName, ok := pod.Annotations[scheduling.QueueNameAnnotationKey]; ok {
- obj.Spec.Queue = queueName
- }
-
- if value, ok := pod.Annotations[scheduling.PodPreemptable]; ok {
- obj.Annotations[scheduling.PodPreemptable] = value
- }
- if value, ok := pod.Annotations[scheduling.RevocableZone]; ok {
- obj.Annotations[scheduling.RevocableZone] = value
- }
- if value, ok := pod.Labels[scheduling.PodPreemptable]; ok {
- obj.Labels[scheduling.PodPreemptable] = value
- }
-
- if value, found := pod.Annotations[scheduling.JDBMinAvailable]; found {
- obj.Annotations[scheduling.JDBMinAvailable] = value
- } else if value, found := pod.Annotations[scheduling.JDBMaxUnavailable]; found {
- obj.Annotations[scheduling.JDBMaxUnavailable] = value
- }
-
- if _, err := pg.vcClient.SchedulingV1beta1().PodGroups(pod.Namespace).Create(context.TODO(), obj, metav1.CreateOptions{}); err != nil {
- klog.Errorf("Failed to create normal PodGroup for Pod <%s/%s>: %v",
- pod.Namespace, pod.Name, err)
- return err
- }
- }
-
- return pg.updatePodAnnotations(pod, pgName)
-}
-
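-// newPGOwnerReferences reuses the pod's owner references when one of them is a
-// controller; otherwise it makes the pod itself the controller owner of the podgroup.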
-func newPGOwnerReferences(pod *v1.Pod) []metav1.OwnerReference {
- if len(pod.OwnerReferences) != 0 {
- for _, ownerReference := range pod.OwnerReferences {
- if ownerReference.Controller != nil && *ownerReference.Controller {
- return pod.OwnerReferences
- }
- }
- }
-
- gvk := schema.GroupVersionKind{
- Group: v1.SchemeGroupVersion.Group,
- Version: v1.SchemeGroupVersion.Version,
- Kind: "Pod",
- }
- ref := metav1.NewControllerRef(pod, gvk)
- return []metav1.OwnerReference{*ref}
-}
-
-// addResourceList adds the request quantities (or the limits when requests are omitted) into list.
-func addResourceList(list, req, limit v1.ResourceList) {
- for name, quantity := range req {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-
- if req != nil {
- return
- }
-
- // If Requests is omitted for a container,
- // it defaults to Limits if that is explicitly specified.
- for name, quantity := range limit {
- if value, ok := list[name]; !ok {
- list[name] = quantity.DeepCopy()
- } else {
- value.Add(quantity)
- list[name] = value
- }
- }
-}
-
-// calcPGMinResources calculates the minimum resources of the podgroup.
-func calcPGMinResources(pod *v1.Pod) *v1.ResourceList {
- pgMinRes := v1.ResourceList{}
-
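-	// Only regular containers are summed; init containers are not counted here.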
- for _, c := range pod.Spec.Containers {
- addResourceList(pgMinRes, c.Resources.Requests, c.Resources.Limits)
- }
-
- return &pgMinRes
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
- "sync"
- "time"
-
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- utilruntime "k8s.io/apimachinery/pkg/util/runtime"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/kubernetes"
- corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/tools/record"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
-
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- vcclientset "volcano.sh/apis/pkg/client/clientset/versioned"
- versionedscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
- informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
- busv1alpha1informer "volcano.sh/apis/pkg/client/informers/externalversions/bus/v1alpha1"
- schedulinginformer "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- busv1alpha1lister "volcano.sh/apis/pkg/client/listers/bus/v1alpha1"
- schedulinglister "volcano.sh/apis/pkg/client/listers/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
- "volcano.sh/volcano/pkg/controllers/framework"
- queuestate "volcano.sh/volcano/pkg/controllers/queue/state"
-)
-
-func init() {
- framework.RegisterController(&queuecontroller{})
-}
-
-// queuecontroller manages queue status.
-type queuecontroller struct {
- kubeClient kubernetes.Interface
- vcClient vcclientset.Interface
-
- // informer
- queueInformer schedulinginformer.QueueInformer
- pgInformer schedulinginformer.PodGroupInformer
-
- // queueLister
- queueLister schedulinglister.QueueLister
- queueSynced cache.InformerSynced
-
- // podGroup lister
- pgLister schedulinglister.PodGroupLister
- pgSynced cache.InformerSynced
-
- cmdInformer busv1alpha1informer.CommandInformer
- cmdLister busv1alpha1lister.CommandLister
- cmdSynced cache.InformerSynced
-
- // queues that need to be updated.
- queue workqueue.RateLimitingInterface
- commandQueue workqueue.RateLimitingInterface
-
- pgMutex sync.RWMutex
- // queue name -> podgroup namespace/name
- podGroups map[string]map[string]struct{}
-
- syncHandler func(req *apis.Request) error
- syncCommandHandler func(cmd *busv1alpha1.Command) error
-
- enqueueQueue func(req *apis.Request)
-
- recorder record.EventRecorder
- maxRequeueNum int
-}
-
-func (c *queuecontroller) Name() string {
- return "queue-controller"
-}
-
-// Initialize initializes the QueueController with clients, informers and work queues.
-func (c *queuecontroller) Initialize(opt *framework.ControllerOption) error {
- c.vcClient = opt.VolcanoClient
- c.kubeClient = opt.KubeClient
-
- factory := informerfactory.NewSharedInformerFactory(c.vcClient, 0)
- queueInformer := factory.Scheduling().V1beta1().Queues()
- pgInformer := factory.Scheduling().V1beta1().PodGroups()
-
- eventBroadcaster := record.NewBroadcaster()
- eventBroadcaster.StartLogging(klog.Infof)
- eventBroadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: c.kubeClient.CoreV1().Events("")})
-
- c.queueInformer = queueInformer
- c.pgInformer = pgInformer
- c.queueLister = queueInformer.Lister()
- c.queueSynced = queueInformer.Informer().HasSynced
- c.pgLister = pgInformer.Lister()
- c.pgSynced = pgInformer.Informer().HasSynced
- c.queue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- c.commandQueue = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
- c.podGroups = make(map[string]map[string]struct{})
- c.recorder = eventBroadcaster.NewRecorder(versionedscheme.Scheme, v1.EventSource{Component: "vc-controller-manager"})
- c.maxRequeueNum = opt.MaxRequeueNum
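-	// Normalize any negative value to -1, which means requeue without limit.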
- if c.maxRequeueNum < 0 {
- c.maxRequeueNum = -1
- }
-
- queueInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: c.addQueue,
- UpdateFunc: c.updateQueue,
- DeleteFunc: c.deleteQueue,
- })
-
- pgInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: c.addPodGroup,
- UpdateFunc: c.updatePodGroup,
- DeleteFunc: c.deletePodGroup,
- })
-
- c.cmdInformer = informerfactory.NewSharedInformerFactory(c.vcClient, 0).Bus().V1alpha1().Commands()
- c.cmdInformer.Informer().AddEventHandler(cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *busv1alpha1.Command:
- return IsQueueReference(v.TargetObject)
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: c.addCommand,
- },
- })
- c.cmdLister = c.cmdInformer.Lister()
- c.cmdSynced = c.cmdInformer.Informer().HasSynced
-
- queuestate.SyncQueue = c.syncQueue
- queuestate.OpenQueue = c.openQueue
- queuestate.CloseQueue = c.closeQueue
-
- c.syncHandler = c.handleQueue
- c.syncCommandHandler = c.handleCommand
-
- c.enqueueQueue = c.enqueue
-
- return nil
-}
-
-// Run starts QueueController.
-func (c *queuecontroller) Run(stopCh <-chan struct{}) {
- defer utilruntime.HandleCrash()
- defer c.queue.ShutDown()
- defer c.commandQueue.ShutDown()
-
- klog.Infof("Starting queue controller.")
- defer klog.Infof("Shutting down queue controller.")
-
- go c.queueInformer.Informer().Run(stopCh)
- go c.pgInformer.Informer().Run(stopCh)
- go c.cmdInformer.Informer().Run(stopCh)
-
- if !cache.WaitForCacheSync(stopCh, c.queueSynced, c.pgSynced, c.cmdSynced) {
- klog.Errorf("unable to sync caches for queue controller.")
- return
- }
-
- go wait.Until(c.worker, 0, stopCh)
- go wait.Until(c.commandWorker, 0, stopCh)
-
- <-stopCh
-}
-
-// worker runs a worker thread that just dequeues items, processes them, and
-// marks them done. You may run as many of these in parallel as you wish; the
-// workqueue guarantees that they will not end up processing the same `queue`
-// at the same time.
-func (c *queuecontroller) worker() {
- for c.processNextWorkItem() {
- }
-}
-
-func (c *queuecontroller) processNextWorkItem() bool {
- obj, shutdown := c.queue.Get()
- if shutdown {
- return false
- }
- defer c.queue.Done(obj)
-
- req, ok := obj.(*apis.Request)
- if !ok {
- klog.Errorf("%v is not a valid queue request struct.", obj)
- return true
- }
-
- err := c.syncHandler(req)
- c.handleQueueErr(err, obj)
-
- return true
-}
-
-func (c *queuecontroller) handleQueue(req *apis.Request) error {
- startTime := time.Now()
- defer func() {
- klog.V(4).Infof("Finished syncing queue %s (%v).", req.QueueName, time.Since(startTime))
- }()
-
- queue, err := c.queueLister.Get(req.QueueName)
- if err != nil {
- if apierrors.IsNotFound(err) {
- klog.V(4).Infof("Queue %s has been deleted.", req.QueueName)
- return nil
- }
-
- return fmt.Errorf("get queue %s failed for %v", req.QueueName, err)
- }
-
- queueState := queuestate.NewState(queue)
- if queueState == nil {
- return fmt.Errorf("queue %s state %s is invalid", queue.Name, queue.Status.State)
- }
-
- klog.V(4).Infof("Begin execute %s action for queue %s, current status %s", req.Action, req.QueueName, queue.Status.State)
- if err := queueState.Execute(req.Action); err != nil {
- return fmt.Errorf("sync queue %s failed for %v, event is %v, action is %s",
- req.QueueName, err, req.Event, req.Action)
- }
-
- return nil
-}
-
-func (c *queuecontroller) handleQueueErr(err error, obj interface{}) {
- if err == nil {
- c.queue.Forget(obj)
- return
- }
-
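-	// Retry with rate limiting while the requeue budget (if any) is not exhausted.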
- if c.maxRequeueNum == -1 || c.queue.NumRequeues(obj) < c.maxRequeueNum {
- klog.V(4).Infof("Error syncing queue request %v for %v.", obj, err)
- c.queue.AddRateLimited(obj)
- return
- }
-
- req, _ := obj.(*apis.Request)
- c.recordEventsForQueue(req.QueueName, v1.EventTypeWarning, string(req.Action),
- fmt.Sprintf("%v queue failed for %v", req.Action, err))
- klog.V(2).Infof("Dropping queue request %v out of the queue for %v.", obj, err)
- c.queue.Forget(obj)
-}
-
-func (c *queuecontroller) commandWorker() {
- for c.processNextCommand() {
- }
-}
-
-func (c *queuecontroller) processNextCommand() bool {
- obj, shutdown := c.commandQueue.Get()
- if shutdown {
- return false
- }
- defer c.commandQueue.Done(obj)
-
- cmd, ok := obj.(*busv1alpha1.Command)
- if !ok {
- klog.Errorf("%v is not a valid Command struct.", obj)
- return true
- }
-
- err := c.syncCommandHandler(cmd)
- c.handleCommandErr(err, obj)
-
- return true
-}
-
-func (c *queuecontroller) handleCommand(cmd *busv1alpha1.Command) error {
- startTime := time.Now()
- defer func() {
- klog.V(4).Infof("Finished syncing command %s/%s (%v).", cmd.Namespace, cmd.Name, time.Since(startTime))
- }()
-
- err := c.vcClient.BusV1alpha1().Commands(cmd.Namespace).Delete(context.TODO(), cmd.Name, metav1.DeleteOptions{})
- if err != nil {
- if apierrors.IsNotFound(err) {
- return nil
- }
-
- return fmt.Errorf("failed to delete command <%s/%s> for %v", cmd.Namespace, cmd.Name, err)
- }
-
- req := &apis.Request{
- QueueName: cmd.TargetObject.Name,
- Event: busv1alpha1.CommandIssuedEvent,
- Action: busv1alpha1.Action(cmd.Action),
- }
-
- c.enqueueQueue(req)
-
- return nil
-}
-
-func (c *queuecontroller) handleCommandErr(err error, obj interface{}) {
- if err == nil {
- c.commandQueue.Forget(obj)
- return
- }
-
- if c.maxRequeueNum == -1 || c.commandQueue.NumRequeues(obj) < c.maxRequeueNum {
- klog.V(4).Infof("Error syncing command %v for %v.", obj, err)
- c.commandQueue.AddRateLimited(obj)
- return
- }
-
- klog.V(2).Infof("Dropping command %v out of the queue for %v.", obj, err)
- c.commandQueue.Forget(obj)
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "context"
- "fmt"
-
-	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/equality"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/queue/state"
-)
-
-func (c *queuecontroller) syncQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
- klog.V(4).Infof("Begin to sync queue %s.", queue.Name)
- defer klog.V(4).Infof("End sync queue %s.", queue.Name)
-
- podGroups := c.getPodGroups(queue.Name)
- queueStatus := schedulingv1beta1.QueueStatus{}
-
- for _, pgKey := range podGroups {
-		// Ignore the error here; it cannot occur.
- ns, name, _ := cache.SplitMetaNamespaceKey(pgKey)
-
- // TODO: check NotFound error and sync local cache.
- pg, err := c.pgLister.PodGroups(ns).Get(name)
- if err != nil {
- return err
- }
-
- switch pg.Status.Phase {
- case schedulingv1beta1.PodGroupPending:
- queueStatus.Pending++
- case schedulingv1beta1.PodGroupRunning:
- queueStatus.Running++
- case schedulingv1beta1.PodGroupUnknown:
- queueStatus.Unknown++
- case schedulingv1beta1.PodGroupInqueue:
- queueStatus.Inqueue++
- }
- }
-
- if updateStateFn != nil {
- updateStateFn(&queueStatus, podGroups)
- } else {
- queueStatus.State = queue.Status.State
- }
-
- // ignore update when status does not change
- if equality.Semantic.DeepEqual(queueStatus, queue.Status) {
- return nil
- }
-
- newQueue := queue.DeepCopy()
- newQueue.Status = queueStatus
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- klog.Errorf("Failed to update status of Queue %s: %v.", newQueue.Name, err)
- return err
- }
-
- return nil
-}
-
-func (c *queuecontroller) openQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
- klog.V(4).Infof("Begin to open queue %s.", queue.Name)
-
- newQueue := queue.DeepCopy()
- newQueue.Status.State = schedulingv1beta1.QueueStateOpen
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
- fmt.Sprintf("Open queue failed for %v", err))
- return err
- }
-
- c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.OpenQueueAction), "Open queue succeed")
- } else {
- return nil
- }
-
- q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- newQueue = q.DeepCopy()
- if updateStateFn != nil {
- updateStateFn(&newQueue.Status, nil)
- } else {
- return fmt.Errorf("internal error, update state function should be provided")
- }
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
- fmt.Sprintf("Update queue status from %s to %s failed for %v",
- queue.Status.State, newQueue.Status.State, err))
- return err
- }
- }
-
- return nil
-}
-
-func (c *queuecontroller) closeQueue(queue *schedulingv1beta1.Queue, updateStateFn state.UpdateQueueStatusFn) error {
- klog.V(4).Infof("Begin to close queue %s.", queue.Name)
-
- newQueue := queue.DeepCopy()
- newQueue.Status.State = schedulingv1beta1.QueueStateClosed
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
- fmt.Sprintf("Close queue failed for %v", err))
- return err
- }
-
- c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.CloseQueueAction), "Close queue succeed")
- } else {
- return nil
- }
-
- q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- newQueue = q.DeepCopy()
- podGroups := c.getPodGroups(newQueue.Name)
- if updateStateFn != nil {
- updateStateFn(&newQueue.Status, podGroups)
- } else {
- return fmt.Errorf("internal error, update state function should be provided")
- }
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
- fmt.Sprintf("Update queue status from %s to %s failed for %v",
- queue.Status.State, newQueue.Status.State, err))
- return err
- }
- }
-
- return nil
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/controllers/apis"
-)
-
-func (c *queuecontroller) enqueue(req *apis.Request) {
- c.queue.Add(req)
-}
-
-func (c *queuecontroller) addQueue(obj interface{}) {
- queue := obj.(*schedulingv1beta1.Queue)
-
- req := &apis.Request{
- QueueName: queue.Name,
-
- Event: busv1alpha1.OutOfSyncEvent,
- Action: busv1alpha1.SyncQueueAction,
- }
-
- c.enqueue(req)
-}
-
-func (c *queuecontroller) deleteQueue(obj interface{}) {
- queue, ok := obj.(*schedulingv1beta1.Queue)
- if !ok {
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v.", obj)
- return
- }
- queue, ok = tombstone.Obj.(*schedulingv1beta1.Queue)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a Queue: %#v.", obj)
- return
- }
- }
-
- c.pgMutex.Lock()
- defer c.pgMutex.Unlock()
- delete(c.podGroups, queue.Name)
-}
-
-func (c *queuecontroller) updateQueue(_, _ interface{}) {
- // currently do not care about queue update
-}
-
-func (c *queuecontroller) addPodGroup(obj interface{}) {
- pg := obj.(*schedulingv1beta1.PodGroup)
- key, _ := cache.MetaNamespaceKeyFunc(obj)
-
- c.pgMutex.Lock()
- defer c.pgMutex.Unlock()
-
- if c.podGroups[pg.Spec.Queue] == nil {
- c.podGroups[pg.Spec.Queue] = make(map[string]struct{})
- }
- c.podGroups[pg.Spec.Queue][key] = struct{}{}
-
- req := &apis.Request{
- QueueName: pg.Spec.Queue,
-
- Event: busv1alpha1.OutOfSyncEvent,
- Action: busv1alpha1.SyncQueueAction,
- }
-
- c.enqueue(req)
-}
-
-func (c *queuecontroller) updatePodGroup(old, new interface{}) {
- oldPG := old.(*schedulingv1beta1.PodGroup)
- newPG := new.(*schedulingv1beta1.PodGroup)
-
-	// Note: we have no use case for updating PodGroup.Spec.Queue,
-	// so it is not considered here.
- if oldPG.Status.Phase != newPG.Status.Phase {
- c.addPodGroup(newPG)
- }
-}
-
-func (c *queuecontroller) deletePodGroup(obj interface{}) {
- pg, ok := obj.(*schedulingv1beta1.PodGroup)
- if !ok {
- tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
- if !ok {
- klog.Errorf("Couldn't get object from tombstone %#v.", obj)
- return
- }
- pg, ok = tombstone.Obj.(*schedulingv1beta1.PodGroup)
- if !ok {
- klog.Errorf("Tombstone contained object that is not a PodGroup: %#v.", obj)
- return
- }
- }
-
- key, _ := cache.MetaNamespaceKeyFunc(obj)
-
- c.pgMutex.Lock()
- defer c.pgMutex.Unlock()
-
- delete(c.podGroups[pg.Spec.Queue], key)
-
- req := &apis.Request{
- QueueName: pg.Spec.Queue,
-
- Event: busv1alpha1.OutOfSyncEvent,
- Action: busv1alpha1.SyncQueueAction,
- }
-
- c.enqueue(req)
-}
-
-func (c *queuecontroller) addCommand(obj interface{}) {
- cmd, ok := obj.(*busv1alpha1.Command)
- if !ok {
- klog.Errorf("Obj %v is not command.", obj)
- return
- }
-
- c.commandQueue.Add(cmd)
-}
-
-func (c *queuecontroller) getPodGroups(key string) []string {
- c.pgMutex.RLock()
- defer c.pgMutex.RUnlock()
-
- if c.podGroups[key] == nil {
- return nil
- }
- podGroups := make([]string, 0, len(c.podGroups[key]))
- for pgKey := range c.podGroups[key] {
- podGroups = append(podGroups, pgKey)
- }
-
- return podGroups
-}
-
-func (c *queuecontroller) recordEventsForQueue(name, eventType, reason, message string) {
- queue, err := c.queueLister.Get(name)
- if err != nil {
- klog.Errorf("Get queue %s failed for %v.", name, err)
- return
- }
-
- c.recorder.Event(queue, eventType, reason, message)
-}
-
-
-
/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package queue
-
-import (
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// IsQueueReference returns whether the ownerReference refers to a Queue.
-func IsQueueReference(ref *metav1.OwnerReference) bool {
- if ref == nil {
- return false
- }
-
- if ref.APIVersion != schedulingv1beta1.SchemeGroupVersion.String() {
- return false
- }
-
- if ref.Kind != "Queue" {
- return false
- }
-
- return true
-}
-
-
-
/*
- Copyright 2021 The Volcano Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package allocate
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-var targetJob = util.Reservation.TargetJob
-
-type Action struct{}
-
-func New() *Action {
- return &Action{}
-}
-
-func (alloc *Action) Name() string {
- return "allocate"
-}
-
-func (alloc *Action) Initialize() {}
-
-func (alloc *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Allocate ...")
- defer klog.V(3).Infof("Leaving Allocate ...")
-
-	// The allocation of a pod may go through several stages:
-	// 1. pick a namespace named N (using ssn.NamespaceOrderFn)
-	// 2. pick a queue named Q from N (using ssn.QueueOrderFn)
-	// 3. pick a job named J from Q (using ssn.JobOrderFn)
-	// 4. pick a task T from J (using ssn.TaskOrderFn)
-	// 5. use predicateFn to filter out nodes that T cannot be allocated on
-	// 6. use ssn.NodeOrderFn to pick the best node and assign it to T
-
- namespaces := util.NewPriorityQueue(ssn.NamespaceOrderFn)
-
- // jobsMap is map[api.NamespaceName]map[api.QueueID]PriorityQueue(*api.JobInfo)
-	// used to find the job with the highest priority in a given queue and namespace
- jobsMap := map[api.NamespaceName]map[api.QueueID]*util.PriorityQueue{}
-
- for _, job := range ssn.Jobs {
- if job.IsPending() {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: job status is pending.",
- job.Namespace, job.Name, job.Queue)
- continue
- }
- if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip allocate, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
- continue
- }
-
- if _, found := ssn.Queues[job.Queue]; !found {
- klog.Warningf("Skip adding Job <%s/%s> because its queue %s is not found",
- job.Namespace, job.Name, job.Queue)
- continue
- }
-
- namespace := api.NamespaceName(job.Namespace)
- queueMap, found := jobsMap[namespace]
- if !found {
- namespaces.Push(namespace)
-
- queueMap = make(map[api.QueueID]*util.PriorityQueue)
- jobsMap[namespace] = queueMap
- }
-
- jobs, found := queueMap[job.Queue]
- if !found {
- jobs = util.NewPriorityQueue(ssn.JobOrderFn)
- queueMap[job.Queue] = jobs
- }
-
- klog.V(4).Infof("Added Job <%s/%s> into Queue <%s>", job.Namespace, job.Name, job.Queue)
- jobs.Push(job)
- }
-
- klog.V(3).Infof("Try to allocate resource to %d Namespaces", len(jobsMap))
-
- pendingTasks := map[api.JobID]*util.PriorityQueue{}
-
- allNodes := ssn.NodeList
- unlockedNodes := allNodes
- if targetJob != nil && len(util.Reservation.LockedNodes) != 0 {
- unlockedNodes = unlockedNodes[0:0]
- for _, node := range allNodes {
- if _, exist := util.Reservation.LockedNodes[node.Name]; !exist {
- unlockedNodes = append(unlockedNodes, node)
- }
- }
- }
- for _, unlockedNode := range unlockedNodes {
- klog.V(4).Infof("unlockedNode ID: %s, Name: %s", unlockedNode.Node.UID, unlockedNode.Node.Name)
- }
- predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) error {
- // Check for Resource Predicate
- if !task.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- return api.NewFitError(task, node, api.NodeResourceFitFailed)
- }
-
- return ssn.PredicateFn(task, node)
- }
-
-	// To pick a <namespace, queue> tuple for a job, the namespace is picked first,
-	// because the number of queues is expected to be smaller than the number of namespaces in most cases.
-	// Picking this way also keeps resource usage balanced across namespaces.
- for {
- if namespaces.Empty() {
- break
- }
-
- // pick namespace from namespaces PriorityQueue
- namespace := namespaces.Pop().(api.NamespaceName)
-
- queueInNamespace := jobsMap[namespace]
-
- // pick queue for given namespace
- //
-		// This block uses an O(n) scan. A PriorityQueue cannot be used here,
-		// because allocating a job changes the priority of its queue across all namespaces,
-		// and the PriorityQueue has no way to update the priority of a specific queue.
- var queue *api.QueueInfo
- for queueID := range queueInNamespace {
- currentQueue := ssn.Queues[queueID]
- if ssn.Overused(currentQueue) {
- klog.V(3).Infof("Namespace <%s> Queue <%s> is overused, ignore it.", namespace, currentQueue.Name)
- delete(queueInNamespace, queueID)
- continue
- }
- if jobs, found := queueInNamespace[currentQueue.UID]; found && jobs.Empty() {
- continue
- }
-
- if queue == nil || ssn.QueueOrderFn(currentQueue, queue) {
- queue = currentQueue
- }
- }
-
- if queue == nil {
-			klog.V(3).Infof("Namespace <%s> has no queue, skip it", namespace)
- continue
- }
-
- klog.V(3).Infof("Try to allocate resource to Jobs in Namespace <%s> Queue <%v>", namespace, queue.Name)
-
- jobs, found := queueInNamespace[queue.UID]
- if !found || jobs.Empty() {
- delete(queueInNamespace, queue.UID)
- namespaces.Push(namespace)
- klog.V(4).Infof("Can not find jobs for queue %s.", queue.Name)
- continue
- }
-
- job := jobs.Pop().(*api.JobInfo)
- var nodes []*api.NodeInfo
- if targetJob != nil && job.UID == targetJob.UID {
- klog.V(4).Infof("Try to allocate resource to target job: %s", job.Name)
- nodes = allNodes
- } else {
- nodes = unlockedNodes
- }
- if _, found = pendingTasks[job.UID]; !found {
- tasks := util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- // Skip BestEffort task in 'allocate' action.
- if task.Resreq.IsEmpty() {
- klog.V(4).Infof("Task <%v/%v> is BestEffort task, skip it.",
- task.Namespace, task.Name)
- continue
- }
-
- tasks.Push(task)
- }
- pendingTasks[job.UID] = tasks
- }
- tasks := pendingTasks[job.UID]
-
- klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>",
- tasks.Len(), job.Namespace, job.Name)
-
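-		// Record all allocations for this job in a statement so that they can be
-		// committed or discarded as a unit once the job's readiness is known.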
- stmt := framework.NewStatement(ssn)
- ph := util.NewPredicateHelper()
- for !tasks.Empty() {
- task := tasks.Pop().(*api.TaskInfo)
-
- // Check whether the queue is overused on dimension that the task requested
- taskRequest := task.Resreq.ResourceNames()
- if underusedResources := ssn.UnderusedResources(queue); underusedResources != nil && !underusedResources.Contains(taskRequest) {
- klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
- continue
- }
-
- klog.V(3).Infof("There are <%d> nodes for Job <%v/%v>", len(nodes), job.Namespace, job.Name)
-
- predicateNodes, fitErrors := ph.PredicateNodes(task, nodes, predicateFn)
- if len(predicateNodes) == 0 {
- job.NodesFitErrors[task.UID] = fitErrors
- break
- }
-
- var candidateNodes []*api.NodeInfo
- for _, n := range predicateNodes {
- if task.InitResreq.LessEqual(n.Idle, api.Zero) || task.InitResreq.LessEqual(n.FutureIdle(), api.Zero) {
- candidateNodes = append(candidateNodes, n)
- }
- }
-
-			// If there are no candidate nodes for this task, skip it.
- if len(candidateNodes) == 0 {
- continue
- }
-
- nodeScores := util.PrioritizeNodes(task, candidateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
-
- node := ssn.BestNodeFn(task, nodeScores)
- if node == nil {
- node = util.SelectBestNode(nodeScores)
- }
-
- // Allocate idle resource to the task.
- if task.InitResreq.LessEqual(node.Idle, api.Zero) {
- klog.V(3).Infof("Binding Task <%v/%v> to node <%v>",
- task.Namespace, task.Name, node.Name)
- if err := stmt.Allocate(task, node); err != nil {
- klog.Errorf("Failed to bind Task %v on %v in Session %v, err: %v",
- task.UID, node.Name, ssn.UID, err)
- } else {
- metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
- }
- } else {
- klog.V(3).Infof("Predicates failed for task <%s/%s> on node <%s> with limited resources",
- task.Namespace, task.Name, node.Name)
-
- // Allocate releasing resource to the task if any.
- if task.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- klog.V(3).Infof("Pipelining Task <%v/%v> to node <%v> for <%v> on <%v>",
- task.Namespace, task.Name, node.Name, task.InitResreq, node.Releasing)
- if err := stmt.Pipeline(task, node.Name); err != nil {
- klog.Errorf("Failed to pipeline Task %v on %v in Session %v for %v.",
- task.UID, node.Name, ssn.UID, err)
- } else {
- metrics.UpdateE2eSchedulingDurationByJob(job.Name, string(job.Queue), job.Namespace, metrics.Duration(job.CreationTimestamp.Time))
- }
- }
- }
-
- if ssn.JobReady(job) && !tasks.Empty() {
- jobs.Push(job)
- break
- }
- }
-
- if ssn.JobReady(job) {
- stmt.Commit()
- } else {
- if !ssn.JobPipelined(job) {
- stmt.Discard()
- }
- }
-
-		// Push the namespace back until it has no jobs left.
- namespaces.Push(namespace)
- }
-}
-
-func (alloc *Action) UnInitialize() {}
-
-
-
// Package elect is used to find the target job and reserve resource for it
-package elect
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-// Action defines the action
-type Action struct{}
-
-// New returns the action instance
-func New() *Action {
- return &Action{}
-}
-
-// Name returns the action name
-func (alloc *Action) Name() string {
- return "elect"
-}
-
-// Initialize inits the action
-func (alloc *Action) Initialize() {}
-
-// Execute selects the target job, i.e. the pending job with the highest priority that has waited the longest.
-func (alloc *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Elect ...")
- defer klog.V(3).Infof("Leaving Elect ...")
-
- if util.Reservation.TargetJob == nil {
- klog.V(4).Infof("Start select Target Job")
- var pendingJobs []*api.JobInfo
- for _, job := range ssn.Jobs {
- if job.PodGroup.Status.Phase == scheduling.PodGroupPending {
- pendingJobs = append(pendingJobs, job)
- }
- }
- util.Reservation.TargetJob = ssn.TargetJob(pendingJobs)
- if util.Reservation.TargetJob != nil {
- klog.V(3).Infof("Target Job name: %s", util.Reservation.TargetJob.Name)
- } else {
- klog.V(3).Infof("Target Job name: nil")
- }
- }
-}
-
-// UnInitialize releases resources that are no longer needed.
-func (alloc *Action) UnInitialize() {}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package preempt
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-type Action struct{}
-
-func New() *Action {
- return &Action{}
-}
-
-func (alloc *Action) Name() string {
- return "preempt"
-}
-
-func (alloc *Action) Initialize() {}
-
-func (alloc *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Preempt ...")
- defer klog.V(3).Infof("Leaving Preempt ...")
-
- preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
- preemptorTasks := map[api.JobID]*util.PriorityQueue{}
-
- var underRequest []*api.JobInfo
- queues := map[api.QueueID]*api.QueueInfo{}
-
- for _, job := range ssn.Jobs {
- if job.IsPending() {
- continue
- }
-
- if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip preemption, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
- continue
- }
-
- if queue, found := ssn.Queues[job.Queue]; !found {
- continue
- } else if _, existed := queues[queue.UID]; !existed {
- klog.V(3).Infof("Added Queue <%s> for Job <%s/%s>",
- queue.Name, job.Namespace, job.Name)
- queues[queue.UID] = queue
- }
-
-		// Check whether the job is starving for more resources.
- if ssn.JobStarving(job) {
- if _, found := preemptorsMap[job.Queue]; !found {
- preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
- }
- preemptorsMap[job.Queue].Push(job)
- underRequest = append(underRequest, job)
- preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- preemptorTasks[job.UID].Push(task)
- }
- }
- }
-
- ph := util.NewPredicateHelper()
- // Preemption between Jobs within Queue.
- for _, queue := range queues {
- for {
- preemptors := preemptorsMap[queue.UID]
-
- // If no preemptors, no preemption.
- if preemptors == nil || preemptors.Empty() {
- klog.V(4).Infof("No preemptors in Queue <%s>, break.", queue.Name)
- break
- }
-
- preemptorJob := preemptors.Pop().(*api.JobInfo)
-
- stmt := framework.NewStatement(ssn)
- assigned := false
- for {
-				// If the job no longer requests more resources, stop preempting.
- if !ssn.JobStarving(preemptorJob) {
- break
- }
-
-				// If there are no preemptor tasks left, move on to the next job.
- if preemptorTasks[preemptorJob.UID].Empty() {
- klog.V(3).Infof("No preemptor task in job <%s/%s>.",
- preemptorJob.Namespace, preemptorJob.Name)
- break
- }
-
- preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo)
-
- if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
- // Ignore non running task.
- if task.Status != api.Running {
- return false
- }
- // Ignore task with empty resource request.
- if task.Resreq.IsEmpty() {
- return false
- }
- job, found := ssn.Jobs[task.Job]
- if !found {
- return false
- }
- // Preempt other jobs within queue
- return job.Queue == preemptorJob.Queue && preemptor.Job != task.Job
- }, ph); preempted {
- assigned = true
- }
- }
-
- // Commit changes only if job is pipelined, otherwise try next job.
- if ssn.JobPipelined(preemptorJob) {
- stmt.Commit()
- } else {
- stmt.Discard()
- continue
- }
-
- if assigned {
- preemptors.Push(preemptorJob)
- }
- }
-
- // Preemption between Task within Job.
- for _, job := range underRequest {
-			// Fix: the preemptor task count gets lost when preempting within the same job.
- preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- preemptorTasks[job.UID].Push(task)
- }
- for {
- if _, found := preemptorTasks[job.UID]; !found {
- break
- }
-
- if preemptorTasks[job.UID].Empty() {
- break
- }
-
- preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo)
-
- stmt := framework.NewStatement(ssn)
- assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool {
- // Ignore non running task.
- if task.Status != api.Running {
- return false
- }
- // Ignore task with empty resource request.
- if task.Resreq.IsEmpty() {
- return false
- }
- // Preempt tasks within job.
- return preemptor.Job == task.Job
- }, ph)
- stmt.Commit()
-
- // If no preemption, next job.
- if !assigned {
- break
- }
- }
- }
- }
-
- // call victimTasksFn to evict tasks
- victimTasks(ssn)
-}
-
-func (alloc *Action) UnInitialize() {}
-
-func preempt(
- ssn *framework.Session,
- stmt *framework.Statement,
- preemptor *api.TaskInfo,
- filter func(*api.TaskInfo) bool,
- predicateHelper util.PredicateHelper,
-) (bool, error) {
- assigned := false
-
- allNodes := ssn.NodeList
-
- predicateNodes, _ := predicateHelper.PredicateNodes(preemptor, allNodes, ssn.PredicateFn)
-
- nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn)
-
- selectedNodes := util.SortNodes(nodeScores)
- for _, node := range selectedNodes {
- klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
- preemptor.Namespace, preemptor.Name, node.Name)
-
- var preemptees []*api.TaskInfo
- for _, task := range node.Tasks {
- if filter == nil {
- preemptees = append(preemptees, task.Clone())
- } else if filter(task) {
- preemptees = append(preemptees, task.Clone())
- }
- }
- victims := ssn.Preemptable(preemptor, preemptees)
- metrics.UpdatePreemptionVictimsCount(len(victims))
-
- if err := util.ValidateVictims(preemptor, node, victims); err != nil {
- klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err)
- continue
- }
-
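-		// Reverse the task order so that the lowest-priority victims are popped and evicted first.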
- victimsQueue := util.NewPriorityQueue(func(l, r interface{}) bool {
- return !ssn.TaskOrderFn(l, r)
- })
- for _, victim := range victims {
- victimsQueue.Push(victim)
- }
- // Preempt victims for tasks, pick lowest priority task first.
- preempted := api.EmptyResource()
-
- for !victimsQueue.Empty() {
- // If reclaimed enough resources, break loop to avoid Sub panic.
- if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- break
- }
- preemptee := victimsQueue.Pop().(*api.TaskInfo)
- klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>",
- preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name)
- if err := stmt.Evict(preemptee, "preempt"); err != nil {
- klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v",
- preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err)
- continue
- }
- preempted.Add(preemptee.Resreq)
- }
-
- metrics.RegisterPreemptionAttempts()
- klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.",
- preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq)
-
- if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) {
- if err := stmt.Pipeline(preemptor, node.Name); err != nil {
- klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
- preemptor.Namespace, preemptor.Name, node.Name)
- }
-
- // Ignore pipeline error, will be corrected in next scheduling loop.
- assigned = true
-
- break
- }
- }
-
- return assigned, nil
-}
-
-func victimTasks(ssn *framework.Session) {
- stmt := framework.NewStatement(ssn)
- victimTasks := ssn.VictimTasks()
- for _, victim := range victimTasks {
- if err := stmt.Evict(victim.Clone(), "evict"); err != nil {
- klog.Errorf("Failed to evict Task <%s/%s>: %v",
- victim.Namespace, victim.Name, err)
- continue
- }
- }
- stmt.Commit()
-}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package reclaim
-
-import (
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-type Action struct{}
-
-func New() *Action {
- return &Action{}
-}
-
-func (ra *Action) Name() string {
- return "reclaim"
-}
-
-func (ra *Action) Initialize() {}
-
-func (ra *Action) Execute(ssn *framework.Session) {
- klog.V(3).Infof("Enter Reclaim ...")
- defer klog.V(3).Infof("Leaving Reclaim ...")
-
- queues := util.NewPriorityQueue(ssn.QueueOrderFn)
- queueMap := map[api.QueueID]*api.QueueInfo{}
-
- preemptorsMap := map[api.QueueID]*util.PriorityQueue{}
- preemptorTasks := map[api.JobID]*util.PriorityQueue{}
-
- klog.V(3).Infof("There are <%d> Jobs and <%d> Queues in total for scheduling.",
- len(ssn.Jobs), len(ssn.Queues))
-
- for _, job := range ssn.Jobs {
- if job.IsPending() {
- continue
- }
-
- if vr := ssn.JobValid(job); vr != nil && !vr.Pass {
- klog.V(4).Infof("Job <%s/%s> Queue <%s> skip reclaim, reason: %v, message %v", job.Namespace, job.Name, job.Queue, vr.Reason, vr.Message)
- continue
- }
-
- if queue, found := ssn.Queues[job.Queue]; !found {
- klog.Errorf("Failed to find Queue <%s> for Job <%s/%s>",
- job.Queue, job.Namespace, job.Name)
- continue
- } else if _, existed := queueMap[queue.UID]; !existed {
- klog.V(4).Infof("Added Queue <%s> for Job <%s/%s>", queue.Name, job.Namespace, job.Name)
- queueMap[queue.UID] = queue
- queues.Push(queue)
- }
-
- if len(job.TaskStatusIndex[api.Pending]) != 0 {
- if _, found := preemptorsMap[job.Queue]; !found {
- preemptorsMap[job.Queue] = util.NewPriorityQueue(ssn.JobOrderFn)
- }
- preemptorsMap[job.Queue].Push(job)
- preemptorTasks[job.UID] = util.NewPriorityQueue(ssn.TaskOrderFn)
- for _, task := range job.TaskStatusIndex[api.Pending] {
- preemptorTasks[job.UID].Push(task)
- }
- }
- }
-
- for {
- // If no queues, break
- if queues.Empty() {
- break
- }
-
- var job *api.JobInfo
- var task *api.TaskInfo
-
- queue := queues.Pop().(*api.QueueInfo)
- if ssn.Overused(queue) {
- klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name)
- continue
- }
-
-		// Find the highest-priority job in the queue.
- jobs, found := preemptorsMap[queue.UID]
- if !found || jobs.Empty() {
- continue
- } else {
- job = jobs.Pop().(*api.JobInfo)
- }
-
-		// Find the highest-priority task that will reclaim resources from others.
- if tasks, found := preemptorTasks[job.UID]; !found || tasks.Empty() {
- continue
- } else {
- task = tasks.Pop().(*api.TaskInfo)
- }
-
- // Check whether the queue is overused on dimension that the task requested
- taskRequest := task.Resreq.ResourceNames()
- if underusedResources := ssn.UnderusedResources(queue); underusedResources != nil && !underusedResources.Contains(taskRequest) {
- klog.V(3).Infof("Queue <%s> is overused when considering task <%s>, ignore it.", queue.Name, task.Name)
- continue
- }
-
- assigned := false
- for _, n := range ssn.Nodes {
- // If predicates failed, next node.
- if err := ssn.PredicateFn(task, n); err != nil {
- continue
- }
-
- klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.",
- task.Namespace, task.Name, n.Name)
-
- var reclaimees []*api.TaskInfo
- for _, task := range n.Tasks {
- // Ignore non running task.
- if task.Status != api.Running {
- continue
- }
-
- if j, found := ssn.Jobs[task.Job]; !found {
- continue
- } else if j.Queue != job.Queue {
- q := ssn.Queues[j.Queue]
- if !q.Reclaimable() {
- continue
- }
-				// Clone the task to avoid modifying its status on the node.
- reclaimees = append(reclaimees, task.Clone())
- }
- }
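-			// Ask the session which of the candidate reclaimees may actually be reclaimed.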
- victims := ssn.Reclaimable(task, reclaimees)
-
- if err := util.ValidateVictims(task, n, victims); err != nil {
- klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err)
- continue
- }
-
- resreq := task.InitResreq.Clone()
- reclaimed := api.EmptyResource()
-
- // Reclaim victims for tasks.
- for _, reclaimee := range victims {
-				klog.V(3).Infof("Try to reclaim Task <%s/%s> for Tasks <%s/%s>",
- reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name)
- if err := ssn.Evict(reclaimee, "reclaim"); err != nil {
- klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v",
- reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err)
- continue
- }
- reclaimed.Add(reclaimee.Resreq)
- // If reclaimed enough resources, break loop to avoid Sub panic.
- if resreq.LessEqual(reclaimed, api.Zero) {
- break
- }
- }
-
- klog.V(3).Infof("Reclaimed <%v> for task <%s/%s> requested <%v>.",
- reclaimed, task.Namespace, task.Name, task.InitResreq)
-
- if task.InitResreq.LessEqual(reclaimed, api.Zero) {
- if err := ssn.Pipeline(task, n.Name); err != nil {
- klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>",
- task.Namespace, task.Name, n.Name)
- }
-
- // Ignore error of pipeline, will be corrected in next scheduling loop.
- assigned = true
-
- break
- }
- }
-
- if assigned {
- jobs.Push(job)
- }
- queues.Push(queue)
- }
-}
-
-func (ra *Action) UnInitialize() {
-}
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-)
-
-// ClusterInfo is a snapshot of the cluster taken by the cache.
-type ClusterInfo struct {
- Jobs map[JobID]*JobInfo
- Nodes map[string]*NodeInfo
- Queues map[QueueID]*QueueInfo
- NamespaceInfo map[NamespaceName]*NamespaceInfo
- RevocableNodes map[string]*NodeInfo
- NodeList []string
-}
-
-func (ci ClusterInfo) String() string {
- str := "Cache:\n"
-
- if len(ci.Nodes) != 0 {
- str += "Nodes:\n"
- for _, n := range ci.Nodes {
- str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
- n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
-
- i := 0
- for _, p := range n.Tasks {
- str += fmt.Sprintf("\t\t %d: %v\n", i, p)
- i++
- }
- }
- }
-
- if len(ci.Jobs) != 0 {
- str += "Jobs:\n"
- for _, job := range ci.Jobs {
- str += fmt.Sprintf("\t Job(%s) name(%s) minAvailable(%v)\n",
- job.UID, job.Name, job.MinAvailable)
-
- i := 0
- for _, task := range job.Tasks {
- str += fmt.Sprintf("\t\t %d: %v\n", i, task)
- i++
- }
- }
- }
-
- if len(ci.NamespaceInfo) != 0 {
- str += "Namespaces:\n"
- for _, ns := range ci.NamespaceInfo {
- str += fmt.Sprintf("\t Namespace(%s) Weight(%v)\n",
- ns.Name, ns.Weight)
- }
- }
-
- if len(ci.NodeList) != 0 {
- str += fmt.Sprintf("NodeList: %v\n", ci.NodeList)
- }
-
- return str
-}
-
-
-
/*
-Copyright 2020 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- v1 "k8s.io/api/core/v1"
-)
-
-// GPUDevice includes the GPU id, its memory, and the pods that are sharing it.
-type GPUDevice struct {
- // GPU ID
- ID int
- // The pods that are sharing this GPU
- PodMap map[string]*v1.Pod
- // memory per card
- Memory uint
-}
-
-// NewGPUDevice creates a device
-func NewGPUDevice(id int, mem uint) *GPUDevice {
- return &GPUDevice{
- ID: id,
- Memory: mem,
- PodMap: map[string]*v1.Pod{},
- }
-}
-
-// getUsedGPUMemory calculates the used memory of the device.
-func (g *GPUDevice) getUsedGPUMemory() uint {
- res := uint(0)
- for _, pod := range g.PodMap {
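-		// Pods that have already finished no longer occupy GPU memory.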
- if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
- continue
- } else {
- gpuRequest := GetGPUResourceOfPod(pod)
- res += gpuRequest
- }
- }
- return res
-}
-
-// GetGPUResourceOfPod returns the GPU resource required by the pod.
-func GetGPUResourceOfPod(pod *v1.Pod) uint {
- var mem uint
- for _, container := range pod.Spec.Containers {
- mem += getGPUResourceOfContainer(&container)
- }
- return mem
-}
-
-// getGPUResourceOfContainer returns the GPU resource required by the container.
-func getGPUResourceOfContainer(container *v1.Container) uint {
- var mem uint
- if val, ok := container.Resources.Limits[VolcanoGPUResource]; ok {
- mem = uint(val.Value())
- }
- return mem
-}
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- clientcache "k8s.io/client-go/tools/cache"
-)
-
-// PodKey returns the string key of a pod.
-func PodKey(pod *v1.Pod) TaskID {
- key, err := clientcache.MetaNamespaceKeyFunc(pod)
- if err != nil {
- return TaskID(fmt.Sprintf("%v/%v", pod.Namespace, pod.Name))
- }
- return TaskID(key)
-}
-
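-// getTaskStatus maps a pod's phase (and deletion timestamp) to the scheduler's TaskStatus.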
-func getTaskStatus(pod *v1.Pod) TaskStatus {
- switch pod.Status.Phase {
- case v1.PodRunning:
- if pod.DeletionTimestamp != nil {
- return Releasing
- }
-
- return Running
- case v1.PodPending:
- if pod.DeletionTimestamp != nil {
- return Releasing
- }
-
- if len(pod.Spec.NodeName) == 0 {
- return Pending
- }
- return Bound
- case v1.PodUnknown:
- return Unknown
- case v1.PodSucceeded:
- return Succeeded
- case v1.PodFailed:
- return Failed
- }
-
- return Unknown
-}
-
-// AllocatedStatus checks whether the task status is an allocated status (Bound, Binding, Running or Allocated).
-func AllocatedStatus(status TaskStatus) bool {
- switch status {
- case Bound, Binding, Running, Allocated:
- return true
- default:
- return false
- }
-}
-
-// MergeErrors merges multiple errors into a single error.
-func MergeErrors(errs ...error) error {
- msg := "errors: "
-
- foundErr := false
- i := 1
-
- for _, e := range errs {
- if e != nil {
- if foundErr {
- msg = fmt.Sprintf("%s, %d: ", msg, i)
- } else {
- msg = fmt.Sprintf("%s %d: ", msg, i)
- }
-
- msg = fmt.Sprintf("%s%v", msg, e)
- foundErr = true
- i++
- }
- }
-
- if foundErr {
- return fmt.Errorf("%s", msg)
- }
-
- return nil
-}
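
A brief usage sketch of MergeErrors, assuming this package's context and the usual `errors` import; the comment shows the message shape produced by the formatting logic above:

```go
// hypothetical call; nil errors are skipped and the rest are numbered
err := MergeErrors(nil, errors.New("node not ready"), nil, errors.New("queue overused"))
// err.Error() == "errors:  1: node not ready, 2: queue overused"
// MergeErrors(nil, nil) returns nil
```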
-
-// JobTerminated checks whether job was terminated.
-func JobTerminated(job *JobInfo) bool {
- return job.PodGroup == nil && len(job.Tasks) == 0
-}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package helpers
-
-import (
- "math"
-
- v1 "k8s.io/api/core/v1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// Min returns the minimum of two resources in each dimension.
-func Min(l, r *api.Resource) *api.Resource {
- res := &api.Resource{}
-
- res.MilliCPU = math.Min(l.MilliCPU, r.MilliCPU)
- res.Memory = math.Min(l.Memory, r.Memory)
-
- if l.ScalarResources == nil || r.ScalarResources == nil {
- return res
- }
-
- res.ScalarResources = map[v1.ResourceName]float64{}
- for lName, lQuant := range l.ScalarResources {
- res.ScalarResources[lName] = math.Min(lQuant, r.ScalarResources[lName])
- }
-
- return res
-}
-
-// Max returns the resource object with larger value in each dimension.
-func Max(l, r *api.Resource) *api.Resource {
- res := &api.Resource{}
-
- res.MilliCPU = math.Max(l.MilliCPU, r.MilliCPU)
- res.Memory = math.Max(l.Memory, r.Memory)
-
- if l.ScalarResources == nil && r.ScalarResources == nil {
- return res
- }
- res.ScalarResources = map[v1.ResourceName]float64{}
- if l.ScalarResources != nil {
- for lName, lQuant := range l.ScalarResources {
- if lQuant > 0 {
- res.ScalarResources[lName] = lQuant
- }
- }
- }
- if r.ScalarResources != nil {
- for rName, rQuant := range r.ScalarResources {
- if rQuant > 0 {
- maxQuant := math.Max(rQuant, res.ScalarResources[rName])
- res.ScalarResources[rName] = maxQuant
- }
- }
- }
- return res
-}
-
-// Share returns the ratio l/r; 0/0 is treated as 0 and l/0 (l > 0) as 1.
-func Share(l, r float64) float64 {
- var share float64
- if r == 0 {
- if l == 0 {
- share = 0
- } else {
- share = 1
- }
- } else {
- share = l / r
- }
-
- return share
-}
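
The edge cases of Share can be summarized with a tiny sketch (a re-statement of the function above, assuming this package's context):

```go
// Share(l, r) behaviour:
//   Share(0, 0) == 0   // nothing requested, nothing used
//   Share(2, 0) == 1   // demand with zero entitlement saturates the share
//   Share(3, 6) == 0.5 // ordinary ratio l/r
fmt.Println(Share(0, 0), Share(2, 0), Share(3, 6))
```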
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "encoding/json"
- "errors"
- "fmt"
- "sort"
- "strconv"
- "strings"
- "time"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// DisruptionBudget defines the job's minimum available and maximum unavailable pod values.
-type DisruptionBudget struct {
- MinAvailable string
- MaxUnavilable string
-}
-
-// NewDisruptionBudget creates a disruption budget for a job.
-func NewDisruptionBudget(minAvailable, maxUnavilable string) *DisruptionBudget {
- disruptionBudget := &DisruptionBudget{
- MinAvailable: minAvailable,
- MaxUnavilable: maxUnavilable,
- }
- return disruptionBudget
-}
-
-// Clone returns a clone of the DisruptionBudget.
-func (db *DisruptionBudget) Clone() *DisruptionBudget {
- return &DisruptionBudget{
- MinAvailable: db.MinAvailable,
- MaxUnavilable: db.MaxUnavilable,
- }
-}
-
-// JobWaitingTime is the annotation key for the maximum time a job may stay Pending under its service level agreement.
-// When a job has waited longer than this time, it should be enqueued at once, and the cluster should reserve resources for it.
-const JobWaitingTime = "sla-waiting-time"
-
-// TaskID is UID type for Task
-type TaskID types.UID
-
-// TransactionContext holds all the fields that needed by scheduling transaction
-type TransactionContext struct {
- NodeName string
- Status TaskStatus
-}
-
-// Clone returns a clone of the TransactionContext.
-func (ctx *TransactionContext) Clone() *TransactionContext {
- if ctx == nil {
- return nil
- }
- clone := *ctx
- return &clone
-}
-
-type TopologyInfo struct {
- Policy string
- ResMap map[int]v1.ResourceList // key: numa ID
-}
-
-func (info *TopologyInfo) Clone() *TopologyInfo {
- copyInfo := &TopologyInfo{
- Policy: info.Policy,
- ResMap: make(map[int]v1.ResourceList),
- }
-
- for numaId, resList := range info.ResMap {
- copyInfo.ResMap[numaId] = resList.DeepCopy()
- }
-
- return copyInfo
-}
-
-// TaskInfo holds all information about the task.
-type TaskInfo struct {
- UID TaskID
- Job JobID
-
- Name string
- Namespace string
-
-	// Resreq is the resource used while the task is running.
- Resreq *Resource
-	// InitResreq is the resource used to launch the task.
- InitResreq *Resource
-
- TransactionContext
- // LastTransaction holds the context of last scheduling transaction
- LastTransaction *TransactionContext
-
- Priority int32
- VolumeReady bool
- Preemptable bool
- BestEffort bool
-
-	// RevocableZone supports the volcano.sh/revocable-zone annotation or label on the pod/podgroup.
-	// Only the empty value and "*" are supported in this version; specifying a revocable zone name will be supported in a future release.
-	// An empty value means the workload cannot use revocable nodes.
-	// A "*" value means the workload can use all revocable nodes during the nodes' active revocable time.
- RevocableZone string
-
- NumaInfo *TopologyInfo
- PodVolumes *volumescheduling.PodVolumes
- Pod *v1.Pod
-}
-
-func getJobID(pod *v1.Pod) JobID {
- if gn, found := pod.Annotations[v1beta1.KubeGroupNameAnnotationKey]; found && len(gn) != 0 {
- // Make sure Pod and PodGroup belong to the same namespace.
- jobID := fmt.Sprintf("%s/%s", pod.Namespace, gn)
- return JobID(jobID)
- }
-
- return ""
-}
-
-func getTaskID(pod *v1.Pod) TaskID {
- if ts, found := pod.Annotations[batch.TaskSpecKey]; found && len(ts) != 0 {
- return TaskID(ts)
- }
-
- return ""
-}
-
-const TaskPriorityAnnotation = "volcano.sh/task-priority"
-
-// NewTaskInfo creates new taskInfo object for a Pod
-func NewTaskInfo(pod *v1.Pod) *TaskInfo {
- initResReq := GetPodResourceRequest(pod)
- resReq := initResReq
- bestEffort := initResReq.IsEmpty()
- preemptable := GetPodPreemptable(pod)
- revocableZone := GetPodRevocableZone(pod)
- topologyInfo := GetPodTopologyInfo(pod)
-
- jobID := getJobID(pod)
-
- ti := &TaskInfo{
- UID: TaskID(pod.UID),
- Job: jobID,
- Name: pod.Name,
- Namespace: pod.Namespace,
- Priority: 1,
- Pod: pod,
- Resreq: resReq,
- InitResreq: initResReq,
- Preemptable: preemptable,
- BestEffort: bestEffort,
- RevocableZone: revocableZone,
- NumaInfo: topologyInfo,
- TransactionContext: TransactionContext{
- NodeName: pod.Spec.NodeName,
- Status: getTaskStatus(pod),
- },
- }
-
- if pod.Spec.Priority != nil {
- ti.Priority = *pod.Spec.Priority
- }
-
- if taskPriority, ok := pod.Annotations[TaskPriorityAnnotation]; ok {
- if priority, err := strconv.ParseInt(taskPriority, 10, 32); err == nil {
- ti.Priority = int32(priority)
- }
- }
-
- return ti
-}
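
Priority resolution in NewTaskInfo is layered: the default of 1, then pod.Spec.Priority, then the volcano.sh/task-priority annotation when it parses as an integer. A hedged sketch, assuming this package and the usual v1/metav1 imports:

```go
prio := int32(1000)
pod := &v1.Pod{
	ObjectMeta: metav1.ObjectMeta{
		Name:        "demo",
		Namespace:   "default",
		Annotations: map[string]string{TaskPriorityAnnotation: "2000"},
	},
	Spec: v1.PodSpec{Priority: &prio},
}
ti := NewTaskInfo(pod)
// ti.Priority == 2000: the annotation overrides pod.Spec.Priority (1000)
```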
-
-// GetTransactionContext returns the transaction context of a task.
-func (ti *TaskInfo) GetTransactionContext() TransactionContext {
- return ti.TransactionContext
-}
-
-// GenerateLastTxContext generates and sets the context of the last transaction for a task.
-func (ti *TaskInfo) GenerateLastTxContext() {
- ctx := ti.GetTransactionContext()
- ti.LastTransaction = &ctx
-}
-
-// ClearLastTxContext clears the context of the last transaction for a task.
-func (ti *TaskInfo) ClearLastTxContext() {
- ti.LastTransaction = nil
-}
-
-func (ti *TaskInfo) SetPodResourceDecision() error {
- if ti.NumaInfo == nil || len(ti.NumaInfo.ResMap) == 0 {
- return nil
- }
-
- klog.V(4).Infof("%v/%v resource decision: %v", ti.Namespace, ti.Name, ti.NumaInfo.ResMap)
- decision := PodResourceDecision{
- NUMAResources: ti.NumaInfo.ResMap,
- }
-
- layout, err := json.Marshal(&decision)
- if err != nil {
- return err
- }
-
- metav1.SetMetaDataAnnotation(&ti.Pod.ObjectMeta, topologyDecisionAnnotation, string(layout[:]))
- return nil
-}
-
-func (ti *TaskInfo) UnsetPodResourceDecision() {
- delete(ti.Pod.Annotations, topologyDecisionAnnotation)
-}
-
-// Clone is used for cloning a task
-func (ti *TaskInfo) Clone() *TaskInfo {
- return &TaskInfo{
- UID: ti.UID,
- Job: ti.Job,
- Name: ti.Name,
- Namespace: ti.Namespace,
- Priority: ti.Priority,
- PodVolumes: ti.PodVolumes,
- Pod: ti.Pod,
- Resreq: ti.Resreq.Clone(),
- InitResreq: ti.InitResreq.Clone(),
- VolumeReady: ti.VolumeReady,
- Preemptable: ti.Preemptable,
- BestEffort: ti.BestEffort,
- RevocableZone: ti.RevocableZone,
- NumaInfo: ti.NumaInfo.Clone(),
- TransactionContext: TransactionContext{
- NodeName: ti.NodeName,
- Status: ti.Status,
- },
- LastTransaction: ti.LastTransaction.Clone(),
- }
-}
-
-func (ti *TaskInfo) GetTaskSpecKey() TaskID {
- if ti.Pod == nil {
- return ""
- }
- return getTaskID(ti.Pod)
-}
-
-// String returns the taskInfo details in a string
-func (ti TaskInfo) String() string {
- if ti.NumaInfo == nil {
- return fmt.Sprintf("Task (%v:%v/%v): job %v, status %v, pri %v"+
- "resreq %v, preemptable %v, revocableZone %v",
- ti.UID, ti.Namespace, ti.Name, ti.Job, ti.Status, ti.Priority,
- ti.Resreq, ti.Preemptable, ti.RevocableZone)
- }
-
- return fmt.Sprintf("Task (%v:%v/%v): job %v, status %v, pri %v"+
- "resreq %v, preemptable %v, revocableZone %v, numaInfo %v",
- ti.UID, ti.Namespace, ti.Name, ti.Job, ti.Status, ti.Priority,
- ti.Resreq, ti.Preemptable, ti.RevocableZone, *ti.NumaInfo)
-}
-
-// JobID is the type of JobInfo's ID.
-type JobID types.UID
-
-type tasksMap map[TaskID]*TaskInfo
-
-// NodeResourceMap stores resources keyed by node name.
-type NodeResourceMap map[string]*Resource
-
-// JobInfo holds all information about a Job.
-type JobInfo struct {
- UID JobID
-
- Name string
- Namespace string
-
- Queue QueueID
-
- Priority int32
-
- MinAvailable int32
-
- WaitingTime *time.Duration
-
- JobFitErrors string
- NodesFitErrors map[TaskID]*FitErrors
-
- // All tasks of the Job.
- TaskStatusIndex map[TaskStatus]tasksMap
- Tasks tasksMap
- TaskMinAvailable map[TaskID]int32
- TaskMinAvailableTotal int32
-
- Allocated *Resource
- TotalRequest *Resource
-
- CreationTimestamp metav1.Time
- PodGroup *PodGroup
-
- ScheduleStartTimestamp metav1.Time
-
- Preemptable bool
-
-	// RevocableZone supports the volcano.sh/revocable-zone annotation or label on the pod/podgroup.
-	// Only the empty value and "*" are supported in this version; specifying a revocable zone name will be supported in a future release.
-	// An empty value means the workload cannot use revocable nodes.
-	// A "*" value means the workload can use all revocable nodes during the nodes' active revocable time.
- RevocableZone string
- Budget *DisruptionBudget
-}
-
-// NewJobInfo creates a new jobInfo for a set of tasks.
-func NewJobInfo(uid JobID, tasks ...*TaskInfo) *JobInfo {
- job := &JobInfo{
- UID: uid,
- MinAvailable: 0,
- NodesFitErrors: make(map[TaskID]*FitErrors),
- Allocated: EmptyResource(),
- TotalRequest: EmptyResource(),
- TaskStatusIndex: map[TaskStatus]tasksMap{},
- Tasks: tasksMap{},
- TaskMinAvailable: map[TaskID]int32{},
- }
-
- for _, task := range tasks {
- job.AddTaskInfo(task)
- }
-
- return job
-}
-
-// UnsetPodGroup removes podGroup details from a job
-func (ji *JobInfo) UnsetPodGroup() {
- ji.PodGroup = nil
-}
-
-// SetPodGroup sets podGroup details to a job
-func (ji *JobInfo) SetPodGroup(pg *PodGroup) {
- ji.Name = pg.Name
- ji.Namespace = pg.Namespace
- ji.MinAvailable = pg.Spec.MinMember
- ji.Queue = QueueID(pg.Spec.Queue)
- ji.CreationTimestamp = pg.GetCreationTimestamp()
-
- var err error
- ji.WaitingTime, err = ji.extractWaitingTime(pg)
- if err != nil {
- klog.Warningf("Error occurs in parsing waiting time for job <%s/%s>, err: %s.",
- pg.Namespace, pg.Name, err.Error())
- ji.WaitingTime = nil
- }
-
- ji.Preemptable = ji.extractPreemptable(pg)
- ji.RevocableZone = ji.extractRevocableZone(pg)
- ji.Budget = ji.extractBudget(pg)
-
- taskMinAvailableTotal := int32(0)
- for task, member := range pg.Spec.MinTaskMember {
- ji.TaskMinAvailable[TaskID(task)] = member
- taskMinAvailableTotal += member
- }
- ji.TaskMinAvailableTotal = taskMinAvailableTotal
-
- ji.PodGroup = pg
-}
-
-// extractWaitingTime reads sla waiting time for job from podgroup annotations
-// TODO: should also read from given field in volcano job spec
-func (ji *JobInfo) extractWaitingTime(pg *PodGroup) (*time.Duration, error) {
- if _, exist := pg.Annotations[JobWaitingTime]; !exist {
- return nil, nil
- }
-
- jobWaitingTime, err := time.ParseDuration(pg.Annotations[JobWaitingTime])
- if err != nil {
- return nil, err
- }
-
- if jobWaitingTime <= 0 {
- return nil, errors.New("invalid sla waiting time")
- }
-
- return &jobWaitingTime, nil
-}
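
extractWaitingTime only accepts positive Go durations from the sla-waiting-time annotation. A minimal sketch, assuming a PodGroup `pg` in this package's context; the value format is whatever time.ParseDuration accepts:

```go
pg.Annotations = map[string]string{
	JobWaitingTime: "30m", // the "sla-waiting-time" key defined above
}
// ji.extractWaitingTime(pg) then yields a *time.Duration of 30 minutes;
// "0s" or a negative value is rejected as "invalid sla waiting time".
```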
-
-// extractPreemptable returns the volcano.sh/preemptable value for the job.
-func (ji *JobInfo) extractPreemptable(pg *PodGroup) bool {
-	// check the annotation first
- if len(pg.Annotations) > 0 {
- if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
-	// if the annotation does not exist, check the label
- if len(pg.Labels) > 0 {
- if value, found := pg.Labels[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
- return false
-}
-
-// extractRevocableZone returns the volcano.sh/revocable-zone value for the pod/podgroup.
-func (ji *JobInfo) extractRevocableZone(pg *PodGroup) string {
- // check annotation first
- if len(pg.Annotations) > 0 {
- if value, found := pg.Annotations[v1beta1.RevocableZone]; found {
- if value != "*" {
- return ""
- }
- return value
- }
-
- if value, found := pg.Annotations[v1beta1.PodPreemptable]; found {
- if b, err := strconv.ParseBool(value); err == nil && b {
- return "*"
- }
- }
- }
-
- return ""
-}
-
-// extractBudget returns the disruption budget for the job.
-func (ji *JobInfo) extractBudget(pg *PodGroup) *DisruptionBudget {
- if len(pg.Annotations) > 0 {
- if value, found := pg.Annotations[v1beta1.JDBMinAvailable]; found {
- return NewDisruptionBudget(value, "")
- } else if value, found := pg.Annotations[v1beta1.JDBMaxUnavailable]; found {
- return NewDisruptionBudget("", value)
- }
- }
-
- return NewDisruptionBudget("", "")
-}
-
-// GetMinResources returns the min resources of the podgroup.
-func (ji *JobInfo) GetMinResources() *Resource {
- if ji.PodGroup.Spec.MinResources == nil {
- return EmptyResource()
- }
-
- return NewResource(*ji.PodGroup.Spec.MinResources)
-}
-
-func (ji *JobInfo) addTaskIndex(ti *TaskInfo) {
- if _, found := ji.TaskStatusIndex[ti.Status]; !found {
- ji.TaskStatusIndex[ti.Status] = tasksMap{}
- }
- ji.TaskStatusIndex[ti.Status][ti.UID] = ti
-}
-
-// AddTaskInfo is used to add a task to a job
-func (ji *JobInfo) AddTaskInfo(ti *TaskInfo) {
- ji.Tasks[ti.UID] = ti
- ji.addTaskIndex(ti)
- ji.TotalRequest.Add(ti.Resreq)
- if AllocatedStatus(ti.Status) {
- ji.Allocated.Add(ti.Resreq)
- }
-}
-
-// UpdateTaskStatus is used to update task's status in a job.
-// If error occurs both task and job are guaranteed to be in the original state.
-func (ji *JobInfo) UpdateTaskStatus(task *TaskInfo, status TaskStatus) error {
- if err := validateStatusUpdate(task.Status, status); err != nil {
- return err
- }
-
- // First remove the task (if exist) from the task list.
- if _, found := ji.Tasks[task.UID]; found {
- if err := ji.DeleteTaskInfo(task); err != nil {
- return err
- }
- }
-
- // Update task's status to the target status once task addition is guaranteed to succeed.
- task.Status = status
- ji.AddTaskInfo(task)
-
- return nil
-}
-
-func (ji *JobInfo) deleteTaskIndex(ti *TaskInfo) {
- if tasks, found := ji.TaskStatusIndex[ti.Status]; found {
- delete(tasks, ti.UID)
-
- if len(tasks) == 0 {
- delete(ji.TaskStatusIndex, ti.Status)
- }
- }
-}
-
-// DeleteTaskInfo is used to delete a task from a job
-func (ji *JobInfo) DeleteTaskInfo(ti *TaskInfo) error {
- if task, found := ji.Tasks[ti.UID]; found {
- ji.TotalRequest.Sub(task.Resreq)
- if AllocatedStatus(task.Status) {
- ji.Allocated.Sub(task.Resreq)
- }
- delete(ji.Tasks, task.UID)
- ji.deleteTaskIndex(task)
- return nil
- }
-
- return fmt.Errorf("failed to find task <%v/%v> in job <%v/%v>",
- ti.Namespace, ti.Name, ji.Namespace, ji.Name)
-}
-
-// Clone is used to clone a jobInfo object
-func (ji *JobInfo) Clone() *JobInfo {
- info := &JobInfo{
- UID: ji.UID,
- Name: ji.Name,
- Namespace: ji.Namespace,
- Queue: ji.Queue,
- Priority: ji.Priority,
-
- MinAvailable: ji.MinAvailable,
- WaitingTime: ji.WaitingTime,
- JobFitErrors: ji.JobFitErrors,
- NodesFitErrors: make(map[TaskID]*FitErrors),
- Allocated: EmptyResource(),
- TotalRequest: EmptyResource(),
-
- PodGroup: ji.PodGroup.Clone(),
-
- TaskStatusIndex: map[TaskStatus]tasksMap{},
- TaskMinAvailable: ji.TaskMinAvailable,
- TaskMinAvailableTotal: ji.TaskMinAvailableTotal,
- Tasks: tasksMap{},
- Preemptable: ji.Preemptable,
- RevocableZone: ji.RevocableZone,
- Budget: ji.Budget.Clone(),
- }
-
- ji.CreationTimestamp.DeepCopyInto(&info.CreationTimestamp)
-
- for _, task := range ji.Tasks {
- info.AddTaskInfo(task.Clone())
- }
-
- return info
-}
-
-// String returns a jobInfo object in string format
-func (ji JobInfo) String() string {
- res := ""
-
- i := 0
- for _, task := range ji.Tasks {
- res += fmt.Sprintf("\n\t %d: %v", i, task)
- i++
- }
-
-	return fmt.Sprintf("Job (%v): namespace %v (%v), name %v, minAvailable %d, podGroup %+v, preemptable %+v, revocableZone %+v, budgetMinAvailable %+v, budgetMaxUnavailable %+v",
-		ji.UID, ji.Namespace, ji.Queue, ji.Name, ji.MinAvailable, ji.PodGroup, ji.Preemptable, ji.RevocableZone, ji.Budget.MinAvailable, ji.Budget.MaxUnavilable) + res
-}
-
-// FitError returns detailed information on why a job's task failed to fit on
-// each available node
-func (ji *JobInfo) FitError() string {
- sortReasonsHistogram := func(reasons map[string]int) []string {
- reasonStrings := []string{}
- for k, v := range reasons {
- reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
- }
- sort.Strings(reasonStrings)
- return reasonStrings
- }
-
- // Stat histogram for all tasks of the job
- reasons := make(map[string]int)
- for status, taskMap := range ji.TaskStatusIndex {
- reasons[status.String()] += len(taskMap)
- }
- reasons["minAvailable"] = int(ji.MinAvailable)
- reasonMsg := fmt.Sprintf("%v, %v", scheduling.PodGroupNotReady, strings.Join(sortReasonsHistogram(reasons), ", "))
-
- // Stat histogram for pending tasks only
- reasons = make(map[string]int)
- for uid := range ji.TaskStatusIndex[Pending] {
- reason, _ := ji.TaskSchedulingReason(uid)
- reasons[reason]++
- }
- if len(reasons) > 0 {
- reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "))
- }
- return reasonMsg
-}
-
-// TaskSchedulingReason returns the detailed scheduling reason and message for the given task,
-// based on its last scheduling transaction.
-func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string) {
- taskInfo, exists := ji.Tasks[tid]
- if !exists {
- return "", ""
- }
-
- // Get detailed scheduling reason based on LastTransaction
- ctx := taskInfo.GetTransactionContext()
- if taskInfo.LastTransaction != nil {
- ctx = *taskInfo.LastTransaction
- }
-
- msg = ji.JobFitErrors
- switch status := ctx.Status; status {
- case Allocated, Pipelined:
- // Pod is schedulable
- msg = fmt.Sprintf("Pod %s/%s can possibly be assigned to %s", taskInfo.Namespace, taskInfo.Name, ctx.NodeName)
- if status == Pipelined {
- msg += " once resource is released"
- }
- return PodReasonSchedulable, msg
- case Pending:
- if fe := ji.NodesFitErrors[tid]; fe != nil {
- // Pod is not schedulable
- return PodReasonUnschedulable, fe.Error()
- }
- // Pod is not scheduled yet
- return PodReasonUndetermined, msg
- default:
- return status.String(), msg
- }
-}
-
-// ReadyTaskNum returns the number of tasks that are ready or best-effort.
-func (ji *JobInfo) ReadyTaskNum() int32 {
- occupied := 0
- occupied += len(ji.TaskStatusIndex[Bound])
- occupied += len(ji.TaskStatusIndex[Binding])
- occupied += len(ji.TaskStatusIndex[Running])
- occupied += len(ji.TaskStatusIndex[Allocated])
- occupied += len(ji.TaskStatusIndex[Succeeded])
-
- if tasks, found := ji.TaskStatusIndex[Pending]; found {
- for _, task := range tasks {
- if task.BestEffort {
- occupied++
- }
- }
- }
-
- return int32(occupied)
-}
-
-// WaitingTaskNum returns the number of tasks that are pipelined.
-func (ji *JobInfo) WaitingTaskNum() int32 {
- return int32(len(ji.TaskStatusIndex[Pipelined]))
-}
-
-// CheckTaskMinAvailable returns whether each task of the job has enough valid pods to meet its minAvailable.
-func (ji *JobInfo) CheckTaskMinAvailable() bool {
-	// if the job's minAvailable is less than the sum of the tasks' minAvailable, skip this check
- if ji.MinAvailable < ji.TaskMinAvailableTotal {
- return true
- }
-
- actual := map[TaskID]int32{}
- for status, tasks := range ji.TaskStatusIndex {
- if AllocatedStatus(status) ||
- status == Succeeded ||
- status == Pipelined ||
- status == Pending {
- for _, task := range tasks {
- actual[getTaskID(task.Pod)]++
- }
- }
- }
-
- klog.V(4).Infof("job %s/%s actual: %+v, ji.TaskMinAvailable: %+v", ji.Name, ji.Namespace, actual, ji.TaskMinAvailable)
- for task, minAvailable := range ji.TaskMinAvailable {
- if act, ok := actual[task]; !ok || act < minAvailable {
- return false
- }
- }
-
- return true
-}
-
-// CheckTaskMinAvailableReady returns whether the ready pods of each task meet the task's minAvailable.
-func (ji *JobInfo) CheckTaskMinAvailableReady() bool {
- if ji.MinAvailable < ji.TaskMinAvailableTotal {
- return true
- }
- occupiedMap := map[TaskID]int32{}
- for status, tasks := range ji.TaskStatusIndex {
- if AllocatedStatus(status) ||
- status == Succeeded {
- for _, task := range tasks {
- occupiedMap[getTaskID(task.Pod)] += 1
- }
- continue
- }
-
- if status == Pending {
- for _, task := range tasks {
- if task.InitResreq.IsEmpty() {
- occupiedMap[getTaskID(task.Pod)] += 1
- }
- }
- }
- }
- for taskId, minNum := range ji.TaskMinAvailable {
- if occupiedMap[taskId] < minNum {
-			klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min available", ji.Namespace, ji.Name, taskId, occupiedMap[taskId])
- return false
- }
- }
- return true
-}
-
-// CheckTaskMinAvailablePipelined returns whether the ready or pipelined pods of each task meet the task's minAvailable.
-func (ji *JobInfo) CheckTaskMinAvailablePipelined() bool {
- if ji.MinAvailable < ji.TaskMinAvailableTotal {
- return true
- }
- occupiedMap := map[TaskID]int32{}
- for status, tasks := range ji.TaskStatusIndex {
- if AllocatedStatus(status) ||
- status == Succeeded ||
- status == Pipelined {
- for _, task := range tasks {
- occupiedMap[getTaskID(task.Pod)] += 1
- }
- continue
- }
-
- if status == Pending {
- for _, task := range tasks {
- if task.InitResreq.IsEmpty() {
- occupiedMap[getTaskID(task.Pod)] += 1
- }
- }
- }
- }
- for taskId, minNum := range ji.TaskMinAvailable {
- if occupiedMap[taskId] < minNum {
-			klog.V(4).Infof("Job %s/%s Task %s occupied %v less than task min available", ji.Namespace, ji.Name, taskId, occupiedMap[taskId])
- return false
- }
- }
- return true
-}
-
-// ValidTaskNum returns the number of tasks that are valid.
-func (ji *JobInfo) ValidTaskNum() int32 {
- occupied := 0
- for status, tasks := range ji.TaskStatusIndex {
- if AllocatedStatus(status) ||
- status == Succeeded ||
- status == Pipelined ||
- status == Pending {
- occupied += len(tasks)
- }
- }
-
- return int32(occupied)
-}
-
-// Ready returns whether the job is ready to run.
-func (ji *JobInfo) Ready() bool {
- occupied := ji.ReadyTaskNum()
-
- return occupied >= ji.MinAvailable
-}
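
A quick numeric illustration of the readiness check (hypothetical counts, assuming this package): with MinAvailable = 3, two Running tasks, one Bound task and one best-effort Pending task, ReadyTaskNum() returns 4 and Ready() reports true.

```go
// hypothetical tally for a job with MinAvailable = 3
occupied := 2 /* Running */ + 1 /* Bound */ + 1 /* best-effort Pending */
ready := int32(occupied) >= 3 // true; mirrors ReadyTaskNum() and Ready()
_ = ready
```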
-
-// IsPending returns whether job is in pending status
-func (ji *JobInfo) IsPending() bool {
- if ji.PodGroup == nil || ji.PodGroup.Status.Phase == scheduling.PodGroupPending || ji.PodGroup.Status.Phase == "" {
- return true
- }
-
- return false
-}
-
-
-
/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
-)
-
-// NamespaceName is name of namespace
-type NamespaceName string
-
-const (
- // NamespaceWeightKey is the key in ResourceQuota.spec.hard indicating the weight of this namespace
- NamespaceWeightKey = "volcano.sh/namespace.weight"
- // DefaultNamespaceWeight is the default weight of namespace
- DefaultNamespaceWeight = 1
-)
-
-// NamespaceInfo records information of namespace
-type NamespaceInfo struct {
- // Name is the name of this namespace
- Name NamespaceName
- // Weight is the highest weight among many ResourceQuota.
- Weight int64
-}
-
-// GetWeight returns the weight of a namespace; any invalid case falls back to the default value.
-func (n *NamespaceInfo) GetWeight() int64 {
- if n == nil || n.Weight == 0 {
- return DefaultNamespaceWeight
- }
- return n.Weight
-}
-
-type quotaItem struct {
- name string
- weight int64
-}
-
-func quotaItemKeyFunc(obj interface{}) (string, error) {
- item, ok := obj.(*quotaItem)
- if !ok {
- return "", fmt.Errorf("obj with type %T could not parse", obj)
- }
- return item.name, nil
-}
-
-// quotaItemLessFunc orders items so that the largest weight sits at the root of the heap (max-heap).
-func quotaItemLessFunc(a interface{}, b interface{}) bool {
- A := a.(*quotaItem)
- B := b.(*quotaItem)
- return A.weight > B.weight
-}
-
-// NamespaceCollection will record all details about namespace
-type NamespaceCollection struct {
- Name string
-
- quotaWeight *cache.Heap
-}
-
-// NewNamespaceCollection creates new NamespaceCollection object to record all information about a namespace
-func NewNamespaceCollection(name string) *NamespaceCollection {
- n := &NamespaceCollection{
- Name: name,
- quotaWeight: cache.NewHeap(quotaItemKeyFunc, quotaItemLessFunc),
- }
- return n
-}
-
-func (n *NamespaceCollection) deleteWeight(q *quotaItem) {
- n.quotaWeight.Delete(q)
-}
-
-func (n *NamespaceCollection) updateWeight(q *quotaItem) {
- n.quotaWeight.Update(q)
-}
-
-func itemFromQuota(quota *v1.ResourceQuota) *quotaItem {
- var weight int64 = DefaultNamespaceWeight
-
- quotaWeight, ok := quota.Spec.Hard[NamespaceWeightKey]
- if ok {
- weight = quotaWeight.Value()
- }
-
- item := "aItem{
- name: quota.Name,
- weight: weight,
- }
- return item
-}
-
-// Update modifies the registered information according to the quota object.
-func (n *NamespaceCollection) Update(quota *v1.ResourceQuota) {
- n.updateWeight(itemFromQuota(quota))
-}
-
-// Delete removes the registered information according to the quota object.
-func (n *NamespaceCollection) Delete(quota *v1.ResourceQuota) {
- n.deleteWeight(itemFromQuota(quota))
-}
-
-// Snapshot clones a NamespaceInfo (without the heap) from the NamespaceCollection.
-func (n *NamespaceCollection) Snapshot() *NamespaceInfo {
- var weight int64 = DefaultNamespaceWeight
-
- obj, err := n.quotaWeight.Pop()
- if err != nil {
- klog.Warningf("namespace %s, quota weight meets error %v when pop", n.Name, err)
- } else {
- item := obj.(*quotaItem)
- weight = item.weight
- n.quotaWeight.Add(item)
- }
-
- return &NamespaceInfo{
- Name: NamespaceName(n.Name),
- Weight: weight,
- }
-}
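
Putting the pieces together: the weight of a namespace is the largest volcano.sh/namespace.weight found across its ResourceQuotas, because quotaItemLessFunc keeps the heaviest item at the root of the heap. A hedged sketch, assuming this package and the usual v1/metav1/resource imports:

```go
nc := NewNamespaceCollection("team-a")
nc.Update(&v1.ResourceQuota{
	ObjectMeta: metav1.ObjectMeta{Name: "quota-low"},
	Spec: v1.ResourceQuotaSpec{Hard: v1.ResourceList{
		NamespaceWeightKey: resource.MustParse("1"),
	}},
})
nc.Update(&v1.ResourceQuota{
	ObjectMeta: metav1.ObjectMeta{Name: "quota-high"},
	Spec: v1.ResourceQuotaSpec{Hard: v1.ResourceList{
		NamespaceWeightKey: resource.MustParse("5"),
	}},
})
info := nc.Snapshot()
// info.GetWeight() == 5: the heaviest quota wins
```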
-
-
-
/*
- Copyright 2021 The Volcano Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
- "strconv"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-type AllocateFailError struct {
- Reason string
-}
-
-func (o *AllocateFailError) Error() string {
- return o.Reason
-}
-
-// NodeInfo is node level aggregated information.
-type NodeInfo struct {
- Name string
- Node *v1.Node
-
- // The state of node
- State NodeState
-
- // The releasing resource on that node
- Releasing *Resource
- // The pipelined resource on that node
- Pipelined *Resource
- // The idle resource on that node
- Idle *Resource
- // The used resource on that node, including running and terminating
- // pods
- Used *Resource
-
- Allocatable *Resource
- Capability *Resource
-
- Tasks map[TaskID]*TaskInfo
- NumaInfo *NumatopoInfo
- NumaChgFlag NumaChgFlag
- NumaSchedulerInfo *NumatopoInfo
- RevocableZone string
-
- // Used to store custom information
- Others map[string]interface{}
- GPUDevices map[int]*GPUDevice
-
- // enable node resource oversubscription
- OversubscriptionNode bool
-	// OfflineJobEvicting true means the node's resource usage is too high, so dispatched pods cannot use oversubscription resources
- OfflineJobEvicting bool
-
- // Resource Oversubscription feature: the Oversubscription Resource reported in annotation
- OversubscriptionResource *Resource
-}
-
-// FutureIdle returns resources that will be idle in the future:
-//
-// That is current idle resources plus released resources minus pipelined resources.
-func (ni *NodeInfo) FutureIdle() *Resource {
- return ni.Idle.Clone().Add(ni.Releasing).Sub(ni.Pipelined)
-}
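
In other words, FutureIdle applies the accounting identity future idle = idle + releasing - pipelined. A small numeric sketch (milli-CPU only, assuming this package's Resource helpers):

```go
// idle 4000m, releasing 2000m, pipelined 1000m  =>  future idle 5000m
idle := &Resource{MilliCPU: 4000}
releasing := &Resource{MilliCPU: 2000}
pipelined := &Resource{MilliCPU: 1000}
future := idle.Clone().Add(releasing).Sub(pipelined)
fmt.Println(future.MilliCPU) // 5000
```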
-
-// GetNodeAllocatable returns the node's Allocatable, excluding the oversubscription resource.
-func (ni *NodeInfo) GetNodeAllocatable() *Resource {
- return NewResource(ni.Node.Status.Allocatable)
-}
-
-// NodeState defines the current state of node.
-type NodeState struct {
- Phase NodePhase
- Reason string
-}
-
-// NewNodeInfo is used to create new nodeInfo object
-func NewNodeInfo(node *v1.Node) *NodeInfo {
- nodeInfo := &NodeInfo{
- Releasing: EmptyResource(),
- Pipelined: EmptyResource(),
- Idle: EmptyResource(),
- Used: EmptyResource(),
-
- Allocatable: EmptyResource(),
- Capability: EmptyResource(),
-
- OversubscriptionResource: EmptyResource(),
- Tasks: make(map[TaskID]*TaskInfo),
-
- GPUDevices: make(map[int]*GPUDevice),
- }
-
- nodeInfo.setOversubscription(node)
-
- if node != nil {
- nodeInfo.Name = node.Name
- nodeInfo.Node = node
- nodeInfo.Idle = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
- nodeInfo.Allocatable = NewResource(node.Status.Allocatable).Add(nodeInfo.OversubscriptionResource)
- nodeInfo.Capability = NewResource(node.Status.Capacity).Add(nodeInfo.OversubscriptionResource)
- }
- nodeInfo.setNodeGPUInfo(node)
- nodeInfo.setNodeState(node)
- nodeInfo.setRevocableZone(node)
-
- return nodeInfo
-}
-
-// RefreshNumaSchedulerInfoByCrd updates the scheduler's NUMA information based on the Numatopology CRD.
-func (ni *NodeInfo) RefreshNumaSchedulerInfoByCrd() {
- if ni.NumaInfo == nil {
- ni.NumaSchedulerInfo = nil
- return
- }
-
- tmp := ni.NumaInfo.DeepCopy()
- if ni.NumaChgFlag == NumaInfoMoreFlag {
- ni.NumaSchedulerInfo = tmp
- } else if ni.NumaChgFlag == NumaInfoLessFlag {
- numaResMap := ni.NumaSchedulerInfo.NumaResMap
- for resName, resInfo := range tmp.NumaResMap {
- klog.V(5).Infof("resource %s Allocatable : current %v new %v on node %s",
- resName, numaResMap[resName], resInfo, ni.Name)
- if numaResMap[resName].Allocatable.Size() >= resInfo.Allocatable.Size() {
- numaResMap[resName].Allocatable = resInfo.Allocatable.Clone()
- numaResMap[resName].Capacity = resInfo.Capacity
- }
- }
- }
-
- ni.NumaChgFlag = NumaInfoResetFlag
-}
-
-// Clone is used to clone the nodeInfo object.
-func (ni *NodeInfo) Clone() *NodeInfo {
- res := NewNodeInfo(ni.Node)
-
- for _, p := range ni.Tasks {
- res.AddTask(p)
- }
- if ni.NumaInfo != nil {
- res.NumaInfo = ni.NumaInfo.DeepCopy()
- }
-
- if ni.NumaSchedulerInfo != nil {
- res.NumaSchedulerInfo = ni.NumaSchedulerInfo.DeepCopy()
- klog.V(5).Infof("node[%s]", ni.Name)
- for resName, resInfo := range res.NumaSchedulerInfo.NumaResMap {
- klog.V(5).Infof("current resource %s : %v", resName, resInfo)
- }
-
- klog.V(5).Infof("current Policies : %v", res.NumaSchedulerInfo.Policies)
- }
-
- res.Others = ni.Others
- return res
-}
-
-// Ready returns whether node is ready for scheduling
-func (ni *NodeInfo) Ready() bool {
- return ni.State.Phase == Ready
-}
-
-func (ni *NodeInfo) setRevocableZone(node *v1.Node) {
- if node == nil {
- klog.Warningf("the argument node is null.")
- return
- }
-
- revocableZone := ""
- if len(node.Labels) > 0 {
- if value, found := node.Labels[v1beta1.RevocableZone]; found {
- revocableZone = value
- }
- }
- ni.RevocableZone = revocableZone
-}
-
-// setOversubscription checks whether the node enables oversubscription and sets the oversubscription resources.
-// Only CPU and memory oversubscription are supported in this version.
-func (ni *NodeInfo) setOversubscription(node *v1.Node) {
- if node == nil {
- return
- }
-
- ni.OversubscriptionNode = false
- ni.OfflineJobEvicting = false
- if len(node.Labels) > 0 {
- if value, found := node.Labels[OversubscriptionNode]; found {
- b, err := strconv.ParseBool(value)
- if err == nil {
- ni.OversubscriptionNode = b
- } else {
- ni.OversubscriptionNode = false
- }
- klog.V(5).Infof("Set node %s Oversubscription to %v", node.Name, ni.OversubscriptionNode)
- }
- }
-
- if len(node.Annotations) > 0 {
- if value, found := node.Annotations[OfflineJobEvicting]; found {
- b, err := strconv.ParseBool(value)
- if err == nil {
- ni.OfflineJobEvicting = b
- } else {
- ni.OfflineJobEvicting = false
- }
- klog.V(5).Infof("Set node %s OfflineJobEvicting to %v", node.Name, ni.OfflineJobEvicting)
- }
- if value, found := node.Annotations[OversubscriptionCPU]; found {
- ni.OversubscriptionResource.MilliCPU, _ = strconv.ParseFloat(value, 64)
- klog.V(5).Infof("Set node %s Oversubscription CPU to %v", node.Name, ni.OversubscriptionResource.MilliCPU)
- }
- if value, found := node.Annotations[OversubscriptionMemory]; found {
- ni.OversubscriptionResource.Memory, _ = strconv.ParseFloat(value, 64)
- klog.V(5).Infof("Set node %s Oversubscription Memory to %v", node.Name, ni.OversubscriptionResource.Memory)
- }
- }
-}
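
A node opts into oversubscription through a label and publishes the extra capacity through annotations; the keys are the OversubscriptionNode, OversubscriptionCPU, OversubscriptionMemory and OfflineJobEvicting constants defined elsewhere in this package (their string values are not shown here). A hedged sketch of a node exercising this path:

```go
node := &v1.Node{
	ObjectMeta: metav1.ObjectMeta{
		Name:   "worker-1",
		Labels: map[string]string{OversubscriptionNode: "true"},
		Annotations: map[string]string{
			OversubscriptionCPU:    "2000", // extra milli-CPU offered
			OversubscriptionMemory: "1024", // extra memory offered
			OfflineJobEvicting:     "false",
		},
	},
}
ni := NewNodeInfo(node)
// ni.OversubscriptionNode == true, and ni.OversubscriptionResource carries the extra CPU/memory,
// which NewNodeInfo folds into Idle, Allocatable and Capability.
```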
-
-func (ni *NodeInfo) setNodeState(node *v1.Node) {
- // If node is nil, the node is un-initialized in cache
- if node == nil {
- ni.State = NodeState{
- Phase: NotReady,
- Reason: "UnInitialized",
- }
- return
- }
-
- // set NodeState according to resources
- if !ni.Used.LessEqual(ni.Allocatable, Zero) {
- ni.State = NodeState{
- Phase: NotReady,
- Reason: "OutOfSync",
- }
- return
- }
-
- // If node not ready, e.g. power off
- for _, cond := range node.Status.Conditions {
- if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
- ni.State = NodeState{
- Phase: NotReady,
- Reason: "NotReady",
- }
- klog.Warningf("set the node %s status to %s.", node.Name, NotReady.String())
- return
- }
- }
-
- // Node is ready (ignore node conditions because of taint/toleration)
- ni.State = NodeState{
- Phase: Ready,
- Reason: "",
- }
-
- klog.V(4).Infof("set the node %s status to %s.", node.Name, Ready.String())
-}
-
-func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) {
- if node == nil {
- return
- }
- memory, ok := node.Status.Capacity[VolcanoGPUResource]
- if !ok {
- return
- }
- totalMemory := memory.Value()
-
- res, ok := node.Status.Capacity[VolcanoGPUNumber]
- if !ok {
- return
- }
- gpuNumber := res.Value()
- if gpuNumber == 0 {
- klog.Warningf("invalid %s=%s", VolcanoGPUNumber, res.String())
- return
- }
-
- memoryPerCard := uint(totalMemory / gpuNumber)
- for i := 0; i < int(gpuNumber); i++ {
- ni.GPUDevices[i] = NewGPUDevice(i, memoryPerCard)
- }
-}
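
The per-card memory is simply the node's total GPU memory divided by the number of cards reported in the node capacity. A worked sketch of the arithmetic above (hypothetical capacity values):

```go
// e.g. GPU memory capacity 32768 and GPU number 4
totalMemory, gpuNumber := int64(32768), int64(4)
memoryPerCard := uint(totalMemory / gpuNumber) // 8192; GPUDevices gets IDs 0..3, each with Memory == 8192
_ = memoryPerCard
```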
-
-// SetNode sets kubernetes node object to nodeInfo object
-func (ni *NodeInfo) SetNode(node *v1.Node) {
- ni.setNodeState(node)
- if !ni.Ready() {
- klog.Warningf("Failed to set node info for %s, phase: %s, reason: %s",
- ni.Name, ni.State.Phase, ni.State.Reason)
- return
- }
-
- // Dry run, make sure all fields other than `State` are in the original state.
- copy := ni.Clone()
- copy.setNode(node)
- copy.setNodeState(node)
- if !copy.Ready() {
- klog.Warningf("SetNode makes node %s not ready, phase: %s, reason: %s",
- copy.Name, copy.State.Phase, copy.State.Reason)
- // Set state of node to !Ready, left other fields untouched
- ni.State = copy.State
- return
- }
-
- ni.setNode(node)
-}
-
-// setNode sets kubernetes node object to nodeInfo object without assertion
-func (ni *NodeInfo) setNode(node *v1.Node) {
- ni.setOversubscription(node)
- ni.setNodeGPUInfo(node)
- ni.setRevocableZone(node)
-
- ni.Name = node.Name
- ni.Node = node
-
- ni.Allocatable = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
- ni.Capability = NewResource(node.Status.Capacity).Add(ni.OversubscriptionResource)
- ni.Releasing = EmptyResource()
- ni.Pipelined = EmptyResource()
- ni.Idle = NewResource(node.Status.Allocatable).Add(ni.OversubscriptionResource)
- ni.Used = EmptyResource()
-
- for _, ti := range ni.Tasks {
- switch ti.Status {
- case Releasing:
- ni.Idle.sub(ti.Resreq) // sub without assertion
- ni.Releasing.Add(ti.Resreq)
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- case Pipelined:
- ni.Pipelined.Add(ti.Resreq)
- default:
- ni.Idle.sub(ti.Resreq) // sub without assertion
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- }
- }
-}
-
-func (ni *NodeInfo) allocateIdleResource(ti *TaskInfo) error {
- if ti.Resreq.LessEqual(ni.Idle, Zero) {
- ni.Idle.Sub(ti.Resreq)
- return nil
- }
-
- return &AllocateFailError{Reason: fmt.Sprintf(
- "cannot allocate resource, <%s> idle: %s <%s/%s> req: %s",
- ni.Name, ni.Idle.String(), ti.Namespace, ti.Name, ti.Resreq.String(),
- )}
-}
-
-// AddTask is used to add a task in nodeInfo object
-//
-// If error occurs both task and node are guaranteed to be in the original state.
-func (ni *NodeInfo) AddTask(task *TaskInfo) error {
- if len(task.NodeName) > 0 && len(ni.Name) > 0 && task.NodeName != ni.Name {
- return fmt.Errorf("task <%v/%v> already on different node <%v>",
- task.Namespace, task.Name, task.NodeName)
- }
-
- key := PodKey(task.Pod)
- if _, found := ni.Tasks[key]; found {
- return fmt.Errorf("task <%v/%v> already on node <%v>",
- task.Namespace, task.Name, ni.Name)
- }
-
- // Node will hold a copy of task to make sure the status
- // change will not impact resource in node.
- ti := task.Clone()
-
- if ni.Node != nil {
- switch ti.Status {
- case Releasing:
- if err := ni.allocateIdleResource(ti); err != nil {
- return err
- }
- ni.Releasing.Add(ti.Resreq)
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- case Pipelined:
- ni.Pipelined.Add(ti.Resreq)
- default:
- if err := ni.allocateIdleResource(ti); err != nil {
- return err
- }
- ni.Used.Add(ti.Resreq)
- ni.AddGPUResource(ti.Pod)
- }
- }
-
- if ni.NumaInfo != nil {
- ni.NumaInfo.AddTask(ti)
- }
-
- // Update task node name upon successful task addition.
- task.NodeName = ni.Name
- ti.NodeName = ni.Name
- ni.Tasks[key] = ti
-
- return nil
-}
-
-// RemoveTask is used to remove a task from the nodeInfo object.
-//
-// If error occurs both task and node are guaranteed to be in the original state.
-func (ni *NodeInfo) RemoveTask(ti *TaskInfo) error {
- key := PodKey(ti.Pod)
-
- task, found := ni.Tasks[key]
- if !found {
- klog.Warningf("failed to find task <%v/%v> on host <%v>",
- ti.Namespace, ti.Name, ni.Name)
- return nil
- }
-
- if ni.Node != nil {
- switch task.Status {
- case Releasing:
- ni.Releasing.Sub(task.Resreq)
- ni.Idle.Add(task.Resreq)
- ni.Used.Sub(task.Resreq)
- ni.SubGPUResource(ti.Pod)
- case Pipelined:
- ni.Pipelined.Sub(task.Resreq)
- default:
- ni.Idle.Add(task.Resreq)
- ni.Used.Sub(task.Resreq)
- ni.SubGPUResource(ti.Pod)
- }
- }
-
- if ni.NumaInfo != nil {
- ni.NumaInfo.RemoveTask(ti)
- }
-
- delete(ni.Tasks, key)
-
- return nil
-}
-
-// UpdateTask is used to update a task in nodeInfo object.
-//
-// If error occurs both task and node are guaranteed to be in the original state.
-func (ni *NodeInfo) UpdateTask(ti *TaskInfo) error {
- if err := ni.RemoveTask(ti); err != nil {
- return err
- }
-
- if err := ni.AddTask(ti); err != nil {
- // This should never happen if task removal was successful,
- // because only possible error during task addition is when task is still on a node.
- klog.Fatalf("Failed to add Task <%s,%s> to Node <%s> during task update",
- ti.Namespace, ti.Name, ni.Name)
- }
- return nil
-}
-
-// String returns nodeInfo details in string format
-func (ni NodeInfo) String() string {
- tasks := ""
-
- i := 0
- for _, task := range ni.Tasks {
- tasks += fmt.Sprintf("\n\t %d: %v", i, task)
- i++
- }
-
-	return fmt.Sprintf("Node (%s): allocatable<%v> idle <%v>, used <%v>, releasing <%v>, oversubscription <%v>, "+
-		"state <phase %s, reason %s>, oversubscriptionNode <%v>, offlineJobEvicting <%v>, taints <%v>%s",
-		ni.Name, ni.Allocatable, ni.Idle, ni.Used, ni.Releasing, ni.OversubscriptionResource, ni.State.Phase, ni.State.Reason, ni.OversubscriptionNode, ni.OfflineJobEvicting, ni.Node.Spec.Taints, tasks)
-}
-
-// Pods returns all pods running in that node
-func (ni *NodeInfo) Pods() (pods []*v1.Pod) {
- for _, t := range ni.Tasks {
- pods = append(pods, t.Pod)
- }
-
- return
-}
-
-// GetDevicesIdleGPUMemory returns all the idle GPU memory by gpu card.
-func (ni *NodeInfo) GetDevicesIdleGPUMemory() map[int]uint {
- devicesAllGPUMemory := ni.getDevicesAllGPUMemory()
- devicesUsedGPUMemory := ni.getDevicesUsedGPUMemory()
- res := map[int]uint{}
- for id, allMemory := range devicesAllGPUMemory {
- if usedMemory, found := devicesUsedGPUMemory[id]; found {
- res[id] = allMemory - usedMemory
- } else {
- res[id] = allMemory
- }
- }
- return res
-}
-
-func (ni *NodeInfo) getDevicesUsedGPUMemory() map[int]uint {
- res := map[int]uint{}
- for _, device := range ni.GPUDevices {
- res[device.ID] = device.getUsedGPUMemory()
- }
- return res
-}
-
-func (ni *NodeInfo) getDevicesAllGPUMemory() map[int]uint {
- res := map[int]uint{}
- for _, device := range ni.GPUDevices {
- res[device.ID] = device.Memory
- }
- return res
-}
-
-// AddGPUResource adds the pod to GPU pool if it is assigned
-func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
- gpuRes := GetGPUResourceOfPod(pod)
- if gpuRes > 0 {
- id := GetGPUIndex(pod)
- if dev := ni.GPUDevices[id]; dev != nil {
- dev.PodMap[string(pod.UID)] = pod
- }
- }
-}
-
-// SubGPUResource frees the GPU held by the pod
-func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) {
- gpuRes := GetGPUResourceOfPod(pod)
- if gpuRes > 0 {
- id := GetGPUIndex(pod)
- if dev := ni.GPUDevices[id]; dev != nil {
- delete(dev.PodMap, string(pod.UID))
- }
- }
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "encoding/json"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
-
- nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
-)
-
-// NumaChgFlag indicates how the node's NUMA info has changed.
-type NumaChgFlag int
-
-const (
-	// NumaInfoResetFlag indicates a reset operation
-	NumaInfoResetFlag NumaChgFlag = 0b00
-	// NumaInfoMoreFlag indicates the received allocatable resource has increased
-	NumaInfoMoreFlag NumaChgFlag = 0b11
-	// NumaInfoLessFlag indicates the received allocatable resource has decreased
-	NumaInfoLessFlag NumaChgFlag = 0b10
-)
-
-// PodResourceDecision is the resource allocation determined by the scheduler
-// and passed to the kubelet through a pod annotation.
-type PodResourceDecision struct {
- // NUMAResources is resource list with numa info indexed by numa id.
- NUMAResources map[int]v1.ResourceList `json:"numa,omitempty"`
-}
-
-// ResourceInfo is the allocatable information for the resource
-type ResourceInfo struct {
- Allocatable cpuset.CPUSet
- Capacity int
- AllocatablePerNuma map[int]float64 // key: NUMA ID
- UsedPerNuma map[int]float64 // key: NUMA ID
-}
-
-// NumatopoInfo is the information about topology manager on the node
-type NumatopoInfo struct {
- Namespace string
- Name string
- Policies map[nodeinfov1alpha1.PolicyName]string
- NumaResMap map[string]*ResourceInfo
- CPUDetail topology.CPUDetails
- ResReserved v1.ResourceList
-}
-
-// DeepCopy is used to copy NumatopoInfo.
-func (info *NumatopoInfo) DeepCopy() *NumatopoInfo {
- numaInfo := &NumatopoInfo{
- Namespace: info.Namespace,
- Name: info.Name,
- Policies: make(map[nodeinfov1alpha1.PolicyName]string),
- NumaResMap: make(map[string]*ResourceInfo),
- CPUDetail: topology.CPUDetails{},
- ResReserved: make(v1.ResourceList),
- }
-
- policies := info.Policies
- for name, policy := range policies {
- numaInfo.Policies[name] = policy
- }
-
- for resName, resInfo := range info.NumaResMap {
- tmpInfo := &ResourceInfo{
- AllocatablePerNuma: make(map[int]float64),
- UsedPerNuma: make(map[int]float64),
- }
- tmpInfo.Capacity = resInfo.Capacity
- tmpInfo.Allocatable = resInfo.Allocatable.Clone()
-
- for numaId, data := range resInfo.AllocatablePerNuma {
- tmpInfo.AllocatablePerNuma[numaId] = data
- }
-
- for numaID, data := range resInfo.UsedPerNuma {
- tmpInfo.UsedPerNuma[numaID] = data
- }
-
- numaInfo.NumaResMap[resName] = tmpInfo
- }
-
- cpuDetail := info.CPUDetail
- for cpuID, detail := range cpuDetail {
- numaInfo.CPUDetail[cpuID] = detail
- }
-
- resReserved := info.ResReserved
- for resName, res := range resReserved {
- numaInfo.ResReserved[resName] = res
- }
-
- return numaInfo
-}
-
-// Compare reports how the resource on the kubelet has changed.
-// Return value:
-// - true : the resource on the kubelet has increased or is unchanged
-// - false : the resource on the kubelet has decreased
-func (info *NumatopoInfo) Compare(newInfo *NumatopoInfo) bool {
- for resName := range info.NumaResMap {
- oldSize := info.NumaResMap[resName].Allocatable.Size()
- newSize := newInfo.NumaResMap[resName].Allocatable.Size()
- if oldSize <= newSize {
- return true
- }
- }
-
- return false
-}
-
-// Allocate is the function to remove the allocated resource
-func (info *NumatopoInfo) Allocate(resSets ResNumaSets) {
- for resName := range resSets {
- info.NumaResMap[resName].Allocatable = info.NumaResMap[resName].Allocatable.Difference(resSets[resName])
- }
-}
-
-// Release is the function to reclaim the allocated resource
-func (info *NumatopoInfo) Release(resSets ResNumaSets) {
- for resName := range resSets {
- info.NumaResMap[resName].Allocatable = info.NumaResMap[resName].Allocatable.Union(resSets[resName])
- }
-}
-
-func GetPodResourceNumaInfo(ti *TaskInfo) map[int]v1.ResourceList {
- if ti.NumaInfo != nil && len(ti.NumaInfo.ResMap) > 0 {
- return ti.NumaInfo.ResMap
- }
-
- if _, ok := ti.Pod.Annotations[topologyDecisionAnnotation]; !ok {
- return nil
- }
-
- decision := PodResourceDecision{}
- err := json.Unmarshal([]byte(ti.Pod.Annotations[topologyDecisionAnnotation]), &decision)
- if err != nil {
- return nil
- }
-
- return decision.NUMAResources
-}
-
-// AddTask updates the used resources of each NUMA node with the task's allocation.
-func (info *NumatopoInfo) AddTask(ti *TaskInfo) {
- numaInfo := GetPodResourceNumaInfo(ti)
- if numaInfo == nil {
- return
- }
-
- for numaID, resList := range numaInfo {
- for resName, quantity := range resList {
- info.NumaResMap[string(resName)].UsedPerNuma[numaID] += ResQuantity2Float64(resName, quantity)
- }
- }
-}
-
-// RemoveTask removes the task's allocation from the used resources of each NUMA node.
-func (info *NumatopoInfo) RemoveTask(ti *TaskInfo) {
- decision := GetPodResourceNumaInfo(ti)
- if decision == nil {
- return
- }
-
- for numaID, resList := range ti.NumaInfo.ResMap {
- for resName, quantity := range resList {
- info.NumaResMap[string(resName)].UsedPerNuma[numaID] -= ResQuantity2Float64(resName, quantity)
- }
- }
-}
-
-// GenerateNodeResNumaSets returns the idle resource sets of all nodes.
-func GenerateNodeResNumaSets(nodes map[string]*NodeInfo) map[string]ResNumaSets {
- nodeSlice := make(map[string]ResNumaSets)
- for _, node := range nodes {
- if node.NumaSchedulerInfo == nil {
- continue
- }
-
- resMaps := make(ResNumaSets)
- for resName, resMap := range node.NumaSchedulerInfo.NumaResMap {
- resMaps[resName] = resMap.Allocatable.Clone()
- }
-
- nodeSlice[node.Name] = resMaps
- }
-
- return nodeSlice
-}
-
-// GenerateNumaNodes returns the NUMA IDs of all nodes.
-func GenerateNumaNodes(nodes map[string]*NodeInfo) map[string][]int {
- nodeNumaMap := make(map[string][]int)
-
- for _, node := range nodes {
- if node.NumaSchedulerInfo == nil {
- continue
- }
-
- nodeNumaMap[node.Name] = node.NumaSchedulerInfo.CPUDetail.NUMANodes().ToSlice()
- }
-
- return nodeNumaMap
-}
-
-// ResNumaSets is the set map of the resource
-type ResNumaSets map[string]cpuset.CPUSet
-
-// Allocate is to remove the allocated resource which is assigned to task
-func (resSets ResNumaSets) Allocate(taskSets ResNumaSets) {
- for resName := range taskSets {
- if _, ok := resSets[resName]; !ok {
- continue
- }
- resSets[resName] = resSets[resName].Difference(taskSets[resName])
- }
-}
-
-// Release is to reclaim the allocated resource which is assigned to task
-func (resSets ResNumaSets) Release(taskSets ResNumaSets) {
- for resName := range taskSets {
- if _, ok := resSets[resName]; !ok {
- continue
- }
- resSets[resName] = resSets[resName].Union(taskSets[resName])
- }
-}
-
-// Clone is the copy action
-func (resSets ResNumaSets) Clone() ResNumaSets {
- newSets := make(ResNumaSets)
- for resName := range resSets {
- newSets[resName] = resSets[resName].Clone()
- }
-
- return newSets
-}
-
-
-
/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "volcano.sh/apis/pkg/apis/scheduling"
-)
-
-// PodGroupPhase is the phase of a pod group at the current time.
-type PodGroupPhase string
-
-// These are the valid phase of podGroups.
-const (
- // PodGroupVersionV1Beta1 represents PodGroupVersion of v1beta1
- PodGroupVersionV1Beta1 string = "v1beta1"
-)
-
-// PodGroup is a collection of Pod; used for batch workload.
-type PodGroup struct {
- scheduling.PodGroup
-
- // Version represents the version of PodGroup
- Version string
-}
-
-func (pg *PodGroup) Clone() *PodGroup {
- return &PodGroup{
- PodGroup: *pg.PodGroup.DeepCopy(),
- Version: pg.Version,
- }
-}
-
-
-
/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "encoding/json"
- "fmt"
- "strconv"
- "strings"
- "time"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// Refer k8s.io/kubernetes/pkg/scheduler/algorithm/predicates/predicates.go#GetResourceRequest.
-//
-// GetResourceRequest returns a *Resource that covers the largest width in each resource dimension.
-// Because init-containers run sequentially, we collect the max in each dimension iteratively.
-// In contrast, we sum the resource vectors for regular containers since they run simultaneously.
-//
-// To be consistent with kubernetes default scheduler, it is only used for predicates of actions(e.g.
-// allocate, backfill, preempt, reclaim), please use GetPodResourceWithoutInitContainers for other cases.
-//
-// Example:
-//
-// Pod:
-// InitContainers
-// IC1:
-// CPU: 2
-// Memory: 1G
-// IC2:
-// CPU: 2
-// Memory: 3G
-// Containers
-// C1:
-// CPU: 2
-// Memory: 1G
-// C2:
-// CPU: 1
-// Memory: 1G
-//
-// Result: CPU: 3, Memory: 3G
-
-// GetPodResourceRequest returns all the resources required by the pod.
-func GetPodResourceRequest(pod *v1.Pod) *Resource {
- result := GetPodResourceWithoutInitContainers(pod)
-
- // take max_resource(sum_pod, any_init_container)
- for _, container := range pod.Spec.InitContainers {
- result.SetMaxResource(NewResource(container.Resources.Requests))
- }
-
- return result
-}
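
The init-container rule from the comment block above can be reproduced directly: sum the regular containers, then take the element-wise max against each init container. A hedged sketch matching the documented example (CPU 3, memory 3G), assuming this package and the usual v1/resource imports:

```go
mk := func(cpu, mem string) v1.ResourceRequirements {
	return v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse(cpu),
		v1.ResourceMemory: resource.MustParse(mem),
	}}
}
pod := &v1.Pod{Spec: v1.PodSpec{
	InitContainers: []v1.Container{{Resources: mk("2", "1Gi")}, {Resources: mk("2", "3Gi")}},
	Containers:     []v1.Container{{Resources: mk("2", "1Gi")}, {Resources: mk("1", "1Gi")}},
}}
req := GetPodResourceRequest(pod)
// req covers CPU 3 (sum of the containers) and memory 3Gi (largest init container)
```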
-
-// GetPodPreemptable returns the volcano.sh/preemptable value for the pod.
-func GetPodPreemptable(pod *v1.Pod) bool {
-	// check the annotation first
- if len(pod.Annotations) > 0 {
- if value, found := pod.Annotations[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
-	// if the annotation does not exist, check the label
- if len(pod.Labels) > 0 {
- if value, found := pod.Labels[v1beta1.PodPreemptable]; found {
- b, err := strconv.ParseBool(value)
- if err != nil {
- klog.Warningf("invalid %s=%s", v1beta1.PodPreemptable, value)
- return false
- }
- return b
- }
- }
-
- return false
-}
-
-// GetPodRevocableZone returns the volcano.sh/revocable-zone value for the pod/podgroup.
-func GetPodRevocableZone(pod *v1.Pod) string {
- if len(pod.Annotations) > 0 {
- if value, found := pod.Annotations[v1beta1.RevocableZone]; found {
- if value != "*" {
- return ""
- }
- return value
- }
-
- if value, found := pod.Annotations[v1beta1.PodPreemptable]; found {
- if b, err := strconv.ParseBool(value); err == nil && b {
- return "*"
- }
- }
- }
- return ""
-}
-
-// GetPodTopologyInfo returns the volcano.sh/numa-topology-policy value for the pod.
-func GetPodTopologyInfo(pod *v1.Pod) *TopologyInfo {
- info := TopologyInfo{
- ResMap: make(map[int]v1.ResourceList),
- }
-
- if len(pod.Annotations) > 0 {
- if value, found := pod.Annotations[v1beta1.NumaPolicyKey]; found {
- info.Policy = value
- }
-
- if value, found := pod.Annotations[topologyDecisionAnnotation]; found {
- decision := PodResourceDecision{}
- err := json.Unmarshal([]byte(value), &decision)
- if err == nil {
- info.ResMap = decision.NUMAResources
- }
- }
- }
-
- return &info
-}
-
-// GetPodResourceWithoutInitContainers returns the Pod's resource request; it does not include
-// the init containers' resource requests.
-func GetPodResourceWithoutInitContainers(pod *v1.Pod) *Resource {
- result := EmptyResource()
- for _, container := range pod.Spec.Containers {
- result.Add(NewResource(container.Resources.Requests))
- }
-
- return result
-}
-
-// GetGPUIndex returns the ID of the GPU
-func GetGPUIndex(pod *v1.Pod) int {
- if len(pod.Annotations) > 0 {
- value, found := pod.Annotations[GPUIndex]
- if found {
- id, err := strconv.Atoi(value)
- if err != nil {
- klog.Errorf("invalid %s=%s", GPUIndex, value)
- return -1
- }
- return id
- }
- }
-
- return -1
-}
-
-func escapeJSONPointer(p string) string {
- // Escaping reference name using https://tools.ietf.org/html/rfc6901
- p = strings.Replace(p, "~", "~0", -1)
- p = strings.Replace(p, "/", "~1", -1)
- return p
-}
-
-// AddGPUIndexPatch returns the patch adding GPU index
-func AddGPUIndexPatch(id int) string {
- return fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
- `{"op": "add", "path": "/metadata/annotations/%s", "value": "%d"}]`,
- escapeJSONPointer(PredicateTime), time.Now().UnixNano(),
- escapeJSONPointer(GPUIndex), id)
-}
-
-// RemoveGPUIndexPatch returns the patch removing GPU index
-func RemoveGPUIndexPatch() string {
- return fmt.Sprintf(`[{"op": "remove", "path": "/metadata/annotations/%s"},`+
- `{"op": "remove", "path": "/metadata/annotations/%s"}]`, escapeJSONPointer(PredicateTime), escapeJSONPointer(GPUIndex))
-}
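-
-// Illustrative note (not part of the original file): annotation keys usually
-// contain "/", so escapeJSONPointer rewrites them per RFC 6901 before they are
-// embedded in the patch path. Assuming, for illustration only, a key such as
-// "example.com/gpu-index", AddGPUIndexPatch(2) would produce a path like
-// "/metadata/annotations/example.com~1gpu-index" with value "2", alongside the
-// predicate-time entry.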
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "k8s.io/apimachinery/pkg/types"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// QueueID is a UID type that serves as the unique ID for each queue
-type QueueID types.UID
-
-// QueueInfo will have all details about queue
-type QueueInfo struct {
- UID QueueID
- Name string
-
- Weight int32
-
-	// Weights is a list of slash-separated float numbers.
-	// Each of them is a weight corresponding to the
-	// hierarchy level.
-	Weights string
-	// Hierarchy is a list of node names along the
-	// path from the root to the node itself.
- Hierarchy string
-
- Queue *scheduling.Queue
-}
-
-// NewQueueInfo creates a new QueueInfo object
-func NewQueueInfo(queue *scheduling.Queue) *QueueInfo {
- return &QueueInfo{
- UID: QueueID(queue.Name),
- Name: queue.Name,
-
- Weight: queue.Spec.Weight,
- Hierarchy: queue.Annotations[v1beta1.KubeHierarchyAnnotationKey],
- Weights: queue.Annotations[v1beta1.KubeHierarchyWeightAnnotationKey],
-
- Queue: queue,
- }
-}
-
-// Clone is used to clone queueInfo object
-func (q *QueueInfo) Clone() *QueueInfo {
- return &QueueInfo{
- UID: q.UID,
- Name: q.Name,
- Weight: q.Weight,
- Hierarchy: q.Hierarchy,
- Weights: q.Weights,
- Queue: q.Queue,
- }
-}
-
-// Reclaimable returns whether the queue is reclaimable
-func (q *QueueInfo) Reclaimable() bool {
- if q == nil {
- return false
- }
-
- if q.Queue == nil {
- return false
- }
-
- if q.Queue.Spec.Reclaimable == nil {
- return true
- }
-
- return *q.Queue.Spec.Reclaimable
-}
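-
-// Illustrative note (not part of the original file): a queue whose
-// Spec.Reclaimable field is left unset is treated as reclaimable by default;
-// only an explicit `reclaimable: false` opts the queue out of reclaim actions.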
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
- "math"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
-
- "volcano.sh/volcano/pkg/scheduler/util/assert"
-)
-
-const (
- // GPUResourceName need to follow https://github.com/NVIDIA/k8s-device-plugin/blob/66a35b71ac4b5cbfb04714678b548bd77e5ba719/server.go#L20
- GPUResourceName = "nvidia.com/gpu"
-)
-
-const (
- minResource float64 = 0.1
-)
-
-// DimensionDefaultValue is the default value for a resource dimension that is not defined
-type DimensionDefaultValue int
-
-const (
-	// Zero means a resource dimension that is not defined will be treated as zero
-	Zero DimensionDefaultValue = 0
-	// Infinity means a resource dimension that is not defined will be treated as infinity
-	Infinity DimensionDefaultValue = -1
-)
-
-// Resource struct defines all the resource types
-type Resource struct {
- MilliCPU float64
- Memory float64
-
- // ScalarResources
- ScalarResources map[v1.ResourceName]float64
-
- // MaxTaskNum is only used by predicates; it should NOT
- // be accounted in other operators, e.g. Add.
- MaxTaskNum int
-}
-
-// EmptyResource creates and returns an empty resource object
-func EmptyResource() *Resource {
- return &Resource{}
-}
-
-// NewResource creates a new resource object from resource list
-func NewResource(rl v1.ResourceList) *Resource {
- r := EmptyResource()
- for rName, rQuant := range rl {
- switch rName {
- case v1.ResourceCPU:
- r.MilliCPU += float64(rQuant.MilliValue())
- case v1.ResourceMemory:
- r.Memory += float64(rQuant.Value())
- case v1.ResourcePods:
- r.MaxTaskNum += int(rQuant.Value())
- default:
-			//NOTE: When converting this back to a k8s resource, we need to record the format as well as divide by 1000
- if v1helper.IsScalarResourceName(rName) {
- r.AddScalar(rName, float64(rQuant.MilliValue()))
- }
- }
- }
- return r
-}
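-
-// exampleNewResource is an illustrative sketch (not part of the original file)
-// showing the units used internally: CPU and scalar (extended) resources are
-// stored in milli-units, memory in bytes.
-func exampleNewResource() *Resource {
-	r := NewResource(v1.ResourceList{
-		v1.ResourceCPU:    resource.MustParse("2"),
-		v1.ResourceMemory: resource.MustParse("1Gi"),
-		"nvidia.com/gpu":  resource.MustParse("1"),
-	})
-	return r // MilliCPU: 2000, Memory: 1073741824, ScalarResources["nvidia.com/gpu"]: 1000
-}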
-
-// ResFloat642Quantity transforms a float64 value into a resource.Quantity
-func ResFloat642Quantity(resName v1.ResourceName, quantity float64) resource.Quantity {
- var resQuantity *resource.Quantity
- switch resName {
- case v1.ResourceCPU:
- resQuantity = resource.NewMilliQuantity(int64(quantity), resource.DecimalSI)
- default:
- resQuantity = resource.NewQuantity(int64(quantity), resource.BinarySI)
- }
-
- return *resQuantity
-}
-
-// ResQuantity2Float64 transforms a resource.Quantity into a float64 value
-func ResQuantity2Float64(resName v1.ResourceName, quantity resource.Quantity) float64 {
- var resQuantity float64
- switch resName {
- case v1.ResourceCPU:
- resQuantity = float64(quantity.MilliValue())
- default:
- resQuantity = float64(quantity.Value())
- }
-
- return resQuantity
-}
-
-// Clone is used to clone a resource type, which is a deep copy function.
-func (r *Resource) Clone() *Resource {
- clone := &Resource{
- MilliCPU: r.MilliCPU,
- Memory: r.Memory,
- MaxTaskNum: r.MaxTaskNum,
- }
-
- if r.ScalarResources != nil {
- clone.ScalarResources = make(map[v1.ResourceName]float64)
- for k, v := range r.ScalarResources {
- clone.ScalarResources[k] = v
- }
- }
-
- return clone
-}
-
-// String returns resource details in string format
-func (r *Resource) String() string {
- str := fmt.Sprintf("cpu %0.2f, memory %0.2f", r.MilliCPU, r.Memory)
- for rName, rQuant := range r.ScalarResources {
- str = fmt.Sprintf("%s, %s %0.2f", str, rName, rQuant)
- }
- return str
-}
-
-// ResourceNames returns all resource types
-func (r *Resource) ResourceNames() ResourceNameList {
- resNames := ResourceNameList{}
-
- if r.MilliCPU >= minResource {
- resNames = append(resNames, v1.ResourceCPU)
- }
-
- if r.Memory >= minResource {
- resNames = append(resNames, v1.ResourceMemory)
- }
-
- for rName, rMount := range r.ScalarResources {
- if rMount >= minResource {
- resNames = append(resNames, rName)
- }
- }
-
- return resNames
-}
-
-// Get returns the resource value for that particular resource type
-func (r *Resource) Get(rn v1.ResourceName) float64 {
- switch rn {
- case v1.ResourceCPU:
- return r.MilliCPU
- case v1.ResourceMemory:
- return r.Memory
- default:
- if r.ScalarResources == nil {
- return 0
- }
- return r.ScalarResources[rn]
- }
-}
-
-// IsEmpty returns true only if every resource dimension is less than the minimum value; otherwise it returns false
-func (r *Resource) IsEmpty() bool {
- if !(r.MilliCPU < minResource && r.Memory < minResource) {
- return false
- }
-
- for _, rQuant := range r.ScalarResources {
- if rQuant >= minResource {
- return false
- }
- }
-
- return true
-}
-
-// IsZero returns true if the given resource dimension is less than the minimum value; otherwise it returns false
-func (r *Resource) IsZero(rn v1.ResourceName) bool {
- switch rn {
- case v1.ResourceCPU:
- return r.MilliCPU < minResource
- case v1.ResourceMemory:
- return r.Memory < minResource
- default:
- if r.ScalarResources == nil {
- return true
- }
-
- _, found := r.ScalarResources[rn]
- assert.Assertf(found, "unknown resource %s", rn)
-
- return r.ScalarResources[rn] < minResource
- }
-}
-
-// Add is used to add two given resources
-func (r *Resource) Add(rr *Resource) *Resource {
- r.MilliCPU += rr.MilliCPU
- r.Memory += rr.Memory
-
- for rName, rQuant := range rr.ScalarResources {
- if r.ScalarResources == nil {
- r.ScalarResources = map[v1.ResourceName]float64{}
- }
- r.ScalarResources[rName] += rQuant
- }
-
- return r
-}
-
-// Sub subtracts two Resource objects with assertion.
-func (r *Resource) Sub(rr *Resource) *Resource {
- assert.Assertf(rr.LessEqual(r, Zero), "resource is not sufficient to do operation: <%v> sub <%v>", r, rr)
- return r.sub(rr)
-}
-
-// sub subtracts two Resource objects.
-func (r *Resource) sub(rr *Resource) *Resource {
- r.MilliCPU -= rr.MilliCPU
- r.Memory -= rr.Memory
-
- if r.ScalarResources == nil {
- return r
- }
- for rrName, rrQuant := range rr.ScalarResources {
- r.ScalarResources[rrName] -= rrQuant
- }
-
- return r
-}
-
-// Multi multiplies the resource by the provided ratio
-func (r *Resource) Multi(ratio float64) *Resource {
- r.MilliCPU *= ratio
- r.Memory *= ratio
- for rName, rQuant := range r.ScalarResources {
- r.ScalarResources[rName] = rQuant * ratio
- }
- return r
-}
-
-// SetMaxResource compares r with rr and takes the max value for each resource dimension.
-func (r *Resource) SetMaxResource(rr *Resource) {
- if r == nil || rr == nil {
- return
- }
-
- if rr.MilliCPU > r.MilliCPU {
- r.MilliCPU = rr.MilliCPU
- }
- if rr.Memory > r.Memory {
- r.Memory = rr.Memory
- }
-
- for rrName, rrQuant := range rr.ScalarResources {
- if r.ScalarResources == nil {
- r.ScalarResources = make(map[v1.ResourceName]float64)
- for k, v := range rr.ScalarResources {
- r.ScalarResources[k] = v
- }
- return
- }
- _, ok := r.ScalarResources[rrName]
- if !ok || rrQuant > r.ScalarResources[rrName] {
- r.ScalarResources[rrName] = rrQuant
- }
- }
-}
-
-// FitDelta computes the delta between a resource object representing available
-// resources and an operand representing the resources being requested. Any
-// field that is less than 0 after the operation represents an
-// insufficient resource.
-func (r *Resource) FitDelta(rr *Resource) *Resource {
- if rr.MilliCPU > 0 {
- r.MilliCPU -= rr.MilliCPU + minResource
- }
-
- if rr.Memory > 0 {
- r.Memory -= rr.Memory + minResource
- }
-
- if r.ScalarResources == nil {
- r.ScalarResources = make(map[v1.ResourceName]float64)
- }
-
- for rrName, rrQuant := range rr.ScalarResources {
- if rrQuant > 0 {
- _, ok := r.ScalarResources[rrName]
- if !ok {
- r.ScalarResources[rrName] = 0
- }
- r.ScalarResources[rrName] -= rrQuant + minResource
- }
- }
-
- return r
-}
-
-// Less returns true only if all dimensions of the resources in r are less than those of rr;
-// otherwise it returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) Less(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessFunc := func(l, r float64) bool {
- return l < r
- }
-
- if !lessFunc(r.MilliCPU, rr.MilliCPU) {
- return false
- }
- if !lessFunc(r.Memory, rr.Memory) {
- return false
- }
-
- for resourceName, leftValue := range r.ScalarResources {
- rightValue, ok := rr.ScalarResources[resourceName]
- if !ok && defaultValue == Infinity {
- continue
- }
-
- if !lessFunc(leftValue, rightValue) {
- return false
- }
- }
- return true
-}
-
-// LessEqual returns true only if all dimensions of the resources in r are less than or equal to those of rr;
-// otherwise it returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessEqualFunc := func(l, r, diff float64) bool {
- if l < r || math.Abs(l-r) < diff {
- return true
- }
- return false
- }
-
- if !lessEqualFunc(r.MilliCPU, rr.MilliCPU, minResource) {
- return false
- }
- if !lessEqualFunc(r.Memory, rr.Memory, minResource) {
- return false
- }
-
- for resourceName, leftValue := range r.ScalarResources {
- rightValue, ok := rr.ScalarResources[resourceName]
- if !ok && defaultValue == Infinity {
- continue
- }
-
- if !lessEqualFunc(leftValue, rightValue, minResource) {
- return false
- }
- }
- return true
-}
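-
-// exampleLessEqualDefaults is an illustrative sketch (not part of the original
-// file) showing how a scalar dimension missing from rr is handled under the
-// two DimensionDefaultValue modes.
-func exampleLessEqualDefaults() (bool, bool) {
-	l := EmptyResource()
-	l.MilliCPU, l.Memory = 1000, 1e9
-	l.SetScalar("nvidia.com/gpu", 1000)
-	r := &Resource{MilliCPU: 2000, Memory: 2e9}
-
-	withZero := l.LessEqual(r, Zero)         // false: GPU missing in r is treated as 0
-	withInfinity := l.LessEqual(r, Infinity) // true: missing GPU is treated as unbounded
-	return withZero, withInfinity
-}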
-
-// LessPartly returns true if there exists any dimension whose resource amount in r is less than that in rr.
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) LessPartly(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessFunc := func(l, r float64) bool {
- return l < r
- }
-
- if lessFunc(r.MilliCPU, rr.MilliCPU) || lessFunc(r.Memory, rr.Memory) {
- return true
- }
-
- for resourceName, leftValue := range r.ScalarResources {
- rightValue, ok := rr.ScalarResources[resourceName]
- if !ok && defaultValue == Infinity {
- return true
- }
-
- if lessFunc(leftValue, rightValue) {
- return true
- }
- }
- return false
-}
-
-// LessEqualPartly returns true if there exists any dimension whose resource amount in r is less than or equal with that in rr.
-// Otherwise returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) LessEqualPartly(rr *Resource, defaultValue DimensionDefaultValue) bool {
- lessEqualFunc := func(l, r, diff float64) bool {
- if l < r || math.Abs(l-r) < diff {
- return true
- }
- return false
- }
-
- if lessEqualFunc(r.MilliCPU, rr.MilliCPU, minResource) || lessEqualFunc(r.Memory, rr.Memory, minResource) {
- return true
- }
-
- for resourceName, leftValue := range r.ScalarResources {
- rightValue, ok := rr.ScalarResources[resourceName]
- if !ok && defaultValue == Infinity {
- return true
- }
-
- if lessEqualFunc(leftValue, rightValue, minResource) {
- return true
- }
- }
- return false
-}
-
-// Equal returns true only if the values of r and rr are equal in every dimension;
-// otherwise it returns false.
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) Equal(rr *Resource, defaultValue DimensionDefaultValue) bool {
- equalFunc := func(l, r, diff float64) bool {
- return l == r || math.Abs(l-r) < diff
- }
-
- if !equalFunc(r.MilliCPU, rr.MilliCPU, minResource) || !equalFunc(r.Memory, rr.Memory, minResource) {
- return false
- }
-
- for resourceName, leftValue := range r.ScalarResources {
- rightValue := rr.ScalarResources[resourceName]
- if !equalFunc(leftValue, rightValue, minResource) {
- return false
- }
- }
- return true
-}
-
-// Diff calculates the difference between two resource objects.
-// Note: if `defaultValue` equals `Infinity`, the difference between the two values will be `Infinity`, marked as -1.
-func (r *Resource) Diff(rr *Resource, defaultValue DimensionDefaultValue) (*Resource, *Resource) {
- leftRes := r.Clone()
- rightRes := rr.Clone()
- increasedVal := EmptyResource()
- decreasedVal := EmptyResource()
- r.setDefaultValue(leftRes, rightRes, defaultValue)
-
- if leftRes.MilliCPU > rightRes.MilliCPU {
- increasedVal.MilliCPU = leftRes.MilliCPU - rightRes.MilliCPU
- } else {
- decreasedVal.MilliCPU = rightRes.MilliCPU - leftRes.MilliCPU
- }
-
- if leftRes.Memory > rightRes.Memory {
- increasedVal.Memory = leftRes.Memory - rightRes.Memory
- } else {
- decreasedVal.Memory = rightRes.Memory - leftRes.Memory
- }
-
- increasedVal.ScalarResources = make(map[v1.ResourceName]float64)
- decreasedVal.ScalarResources = make(map[v1.ResourceName]float64)
- for lName, lQuant := range leftRes.ScalarResources {
- rQuant := rightRes.ScalarResources[lName]
- if lQuant == -1 {
- increasedVal.ScalarResources[lName] = -1
- continue
- }
- if rQuant == -1 {
- decreasedVal.ScalarResources[lName] = -1
- continue
- }
- if lQuant > rQuant {
- increasedVal.ScalarResources[lName] = lQuant - rQuant
- } else {
- decreasedVal.ScalarResources[lName] = rQuant - lQuant
- }
- }
-
- return increasedVal, decreasedVal
-}
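-
-// exampleDiff is an illustrative sketch (not part of the original file): Diff
-// splits the per-dimension difference into an "increased" and a "decreased"
-// Resource relative to r.
-func exampleDiff() {
-	l := &Resource{MilliCPU: 3000, Memory: 1e9}
-	r := &Resource{MilliCPU: 1000, Memory: 2e9}
-	inc, dec := l.Diff(r, Zero)
-	_, _ = inc, dec // inc: <cpu 2000, memory 0>; dec: <cpu 0, memory 1e9>
-}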
-
-// AddScalar adds a resource by a scalar value of this resource.
-func (r *Resource) AddScalar(name v1.ResourceName, quantity float64) {
- r.SetScalar(name, r.ScalarResources[name]+quantity)
-}
-
-// SetScalar sets a resource by a scalar value of this resource.
-func (r *Resource) SetScalar(name v1.ResourceName, quantity float64) {
- // Lazily allocate scalar resource map.
- if r.ScalarResources == nil {
- r.ScalarResources = map[v1.ResourceName]float64{}
- }
- r.ScalarResources[name] = quantity
-}
-
-// MinDimensionResource resets each dimension of r to the minimum of r and rr,
-// i.e. any dimension of r that exceeds the corresponding dimension of rr is reduced to rr's value.
-// e.g. r resource is <cpu 2000.00, memory 4047845376.00, hugepages-2Mi 0.00, hugepages-1Gi 0.00>
-// rr resource is <cpu 3000.00, memory 1000.00>
-// returned r resource is <cpu 2000.00, memory 1000.00, hugepages-2Mi 0.00, hugepages-1Gi 0.00>
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. Its value can only be one of 'Zero' and 'Infinity'"
-func (r *Resource) MinDimensionResource(rr *Resource, defaultValue DimensionDefaultValue) *Resource {
- if rr.MilliCPU < r.MilliCPU {
- r.MilliCPU = rr.MilliCPU
- }
- if rr.Memory < r.Memory {
- r.Memory = rr.Memory
- }
-
- if r.ScalarResources == nil {
- return r
- }
-
- if rr.ScalarResources == nil {
- if defaultValue == Infinity {
- return r
- }
-
- for name := range r.ScalarResources {
- r.ScalarResources[name] = 0
- }
- return r
- }
-
- for name, quant := range r.ScalarResources {
- rQuant, ok := rr.ScalarResources[name]
- if ok {
- r.ScalarResources[name] = math.Min(quant, rQuant)
- } else {
- if defaultValue == Infinity {
- continue
- }
-
- r.ScalarResources[name] = 0
- }
- }
- return r
-}
-
-// setDefaultValue sets the default value for resource dimensions that are not defined in the ScalarResources of leftResource and rightResource
-// @param defaultValue "default value for resource dimension not defined in ScalarResources. It can only be one of 'Zero' or 'Infinity'"
-func (r *Resource) setDefaultValue(leftResource, rightResource *Resource, defaultValue DimensionDefaultValue) {
- if leftResource.ScalarResources == nil {
- leftResource.ScalarResources = map[v1.ResourceName]float64{}
- }
- if rightResource.ScalarResources == nil {
- rightResource.ScalarResources = map[v1.ResourceName]float64{}
- }
- for resourceName := range leftResource.ScalarResources {
- _, ok := rightResource.ScalarResources[resourceName]
- if !ok {
- if defaultValue == Zero {
- rightResource.ScalarResources[resourceName] = 0
- } else if defaultValue == Infinity {
- rightResource.ScalarResources[resourceName] = -1
- }
- }
- }
-
- for resourceName := range rightResource.ScalarResources {
- _, ok := leftResource.ScalarResources[resourceName]
- if !ok {
- if defaultValue == Zero {
- leftResource.ScalarResources[resourceName] = 0
- } else if defaultValue == Infinity {
- leftResource.ScalarResources[resourceName] = -1
- }
- }
- }
-}
-
-// ParseResourceList parses the given configuration map into an API
-// ResourceList or returns an error.
-func ParseResourceList(m map[string]string) (v1.ResourceList, error) {
- if len(m) == 0 {
- return nil, nil
- }
- rl := make(v1.ResourceList)
- for k, v := range m {
- switch v1.ResourceName(k) {
-		// Only CPU, memory, and ephemeral (local) storage resources are supported.
- case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage:
- q, err := resource.ParseQuantity(v)
- if err != nil {
- return nil, err
- }
- if q.Sign() == -1 {
- return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
- }
- rl[v1.ResourceName(k)] = q
- default:
- return nil, fmt.Errorf("cannot reserve %q resource", k)
- }
- }
- return rl, nil
-}
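-
-// Illustrative note (not part of the original file): typical usage, e.g. when
-// parsing a reserved-resources configuration map.
-//
-//	rl, err := ParseResourceList(map[string]string{"cpu": "2", "memory": "4Gi"})
-//	// rl holds 2 CPUs and 4Gi of memory; keys other than cpu/memory/ephemeral-storage,
-//	// or negative quantities, return an error instead.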
-
-// GetMinResource returns the minimum resource threshold used in comparisons.
-func GetMinResource() float64 {
- return minResource
-}
-
-// ResourceNameList struct defines resource name collection
-type ResourceNameList []v1.ResourceName
-
-// Contains reports whether rr is a subset of r
-func (r ResourceNameList) Contains(rr ResourceNameList) bool {
- for _, rrName := range ([]v1.ResourceName)(rr) {
- isResourceExist := false
- for _, rName := range ([]v1.ResourceName)(r) {
- if rName == rrName {
- isResourceExist = true
- break
- }
- }
- if !isResourceExist {
- return false
- }
- }
- return true
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "k8s.io/apimachinery/pkg/types"
-
- "volcano.sh/apis/pkg/apis/scheduling"
-)
-
-// ClusterID is a UID type that serves as the unique ID for each cluster
-type ClusterID types.UID
-
-// SiloClusterInfo will have all details about the silo cluster
-type SiloClusterInfo struct {
- UID ClusterID
- Cluster *scheduling.Cluster
-}
-
-// NewSiloClusterInfo creates a new SiloClusterInfo object
-func NewSiloClusterInfo(cluster *scheduling.Cluster) *SiloClusterInfo {
- return &SiloClusterInfo{
- UID: ClusterID(cluster.Name),
- Cluster: cluster,
- }
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
-)
-
-func buildNode(name string, alloc v1.ResourceList) *v1.Node {
- return &v1.Node{
- ObjectMeta: metav1.ObjectMeta{
- Name: name,
- },
- Status: v1.NodeStatus{
- Capacity: alloc,
- Allocatable: alloc,
- },
- }
-}
-
-func buildPod(ns, n, nn string, p v1.PodPhase, req v1.ResourceList, owner []metav1.OwnerReference, labels map[string]string) *v1.Pod {
- return &v1.Pod{
- ObjectMeta: metav1.ObjectMeta{
- UID: types.UID(fmt.Sprintf("%v-%v", ns, n)),
- Name: n,
- Namespace: ns,
- OwnerReferences: owner,
- Labels: labels,
- },
- Status: v1.PodStatus{
- Phase: p,
- },
- Spec: v1.PodSpec{
- NodeName: nn,
- Containers: []v1.Container{
- {
- Resources: v1.ResourceRequirements{
- Requests: req,
- },
- },
- },
- },
- }
-}
-
-func buildResourceList(cpu string, memory string) v1.ResourceList {
- return v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- }
-}
-
-func buildResource(cpu string, memory string) *Resource {
- return NewResource(v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- })
-}
-
-func buildOwnerReference(owner string) metav1.OwnerReference {
- controller := true
- return metav1.OwnerReference{
- Controller: &controller,
- UID: types.UID(owner),
- }
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package api
-
-import (
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-)
-
-// TaskStatus defines the status of a task/pod.
-type TaskStatus int
-
-const (
- // Pending means the task is pending in the apiserver.
- Pending TaskStatus = 1 << iota
-
- // Allocated means the scheduler assigns a host to it.
- Allocated
-
-	// Pipelined means the scheduler has assigned a host and is waiting for resources to be released.
- Pipelined
-
-	// Binding means the scheduler sends a Bind request to the apiserver.
- Binding
-
-	// Bound means the task/Pod is bound to a host.
- Bound
-
- // Running means a task is running on the host.
- Running
-
- // Releasing means a task/pod is deleted.
- Releasing
-
- // Succeeded means that all containers in the pod have voluntarily terminated
- // with a container exit code of 0, and the system is not going to restart any of these containers.
- Succeeded
-
- // Failed means that all containers in the pod have terminated, and at least one container has
- // terminated in a failure (exited with a non-zero exit code or was stopped by the system).
- Failed
-
- // Unknown means the status of task/pod is unknown to the scheduler.
- Unknown
-)
-
-func (ts TaskStatus) String() string {
- switch ts {
- case Pending:
- return "Pending"
- case Allocated:
- return "Allocated"
- case Pipelined:
- return "Pipelined"
- case Binding:
- return "Binding"
- case Bound:
- return "Bound"
- case Running:
- return "Running"
- case Releasing:
- return "Releasing"
- case Succeeded:
- return "Succeeded"
- case Failed:
- return "Failed"
- default:
- return "Unknown"
- }
-}
-
-// NodePhase defines the phase of node
-type NodePhase int
-
-const (
- // Ready means the node is ready for scheduling
- Ready NodePhase = 1 << iota
- // NotReady means the node is not ready for scheduling
- NotReady
-)
-
-func (np NodePhase) String() string {
- switch np {
- case Ready:
- return "Ready"
- case NotReady:
- return "NotReady"
- }
-
- return "Unknown"
-}
-
-// validateStatusUpdate validates whether the status transfer is valid.
-func validateStatusUpdate(oldStatus, newStatus TaskStatus) error {
- return nil
-}
-
-// LessFn is the func declaration used by sort or priority queue.
-type LessFn func(interface{}, interface{}) bool
-
-// CompareFn is the func declaration used by sort or priority queue.
-type CompareFn func(interface{}, interface{}) int
-
-// ValidateFn is the func declaration used to check object's status.
-type ValidateFn func(interface{}) bool
-
-// ValidateResult is a struct that can be used to determine the validation result
-type ValidateResult struct {
- Pass bool
- Reason string
- Message string
-}
-
-// ValidateExFn is the func declaration used to validate the result.
-type ValidateExFn func(interface{}) *ValidateResult
-
-// VoteFn is the func declaration used to check object's complicated status.
-type VoteFn func(interface{}) int
-
-// JobEnqueuedFn is the func declaration used to call after job enqueued.
-type JobEnqueuedFn func(interface{})
-
-// PredicateFn is the func declaration used to predicate node for task.
-type PredicateFn func(*TaskInfo, *NodeInfo) error
-
-// BestNodeFn is the func declaration used to return the nodeScores to plugins.
-type BestNodeFn func(*TaskInfo, map[float64][]*NodeInfo) *NodeInfo
-
-// EvictableFn is the func declaration used to evict tasks.
-type EvictableFn func(*TaskInfo, []*TaskInfo) ([]*TaskInfo, int)
-
-// NodeOrderFn is the func declaration used to get priority score for a node for a particular task.
-type NodeOrderFn func(*TaskInfo, *NodeInfo) (float64, error)
-
-// BatchNodeOrderFn is the func declaration used to get priority score for ALL nodes for a particular task.
-type BatchNodeOrderFn func(*TaskInfo, []*NodeInfo) (map[string]float64, error)
-
-// NodeMapFn is the func declaration used to get priority score for a node for a particular task.
-type NodeMapFn func(*TaskInfo, *NodeInfo) (float64, error)
-
-// NodeReduceFn is the func declaration used to reduce priority score for a node for a particular task.
-type NodeReduceFn func(*TaskInfo, k8sframework.NodeScoreList) error
-
-// NodeOrderMapFn is the func declaration used to get priority score of all plugins for a node for a particular task.
-type NodeOrderMapFn func(*TaskInfo, *NodeInfo) (map[string]float64, float64, error)
-
-// NodeOrderReduceFn is the func declaration used to reduce priority score of all nodes for a plugin for a particular task.
-type NodeOrderReduceFn func(*TaskInfo, map[string]k8sframework.NodeScoreList) (map[string]float64, error)
-
-// TargetJobFn is the func declaration used to select the target job that satisfies some conditions
-type TargetJobFn func([]*JobInfo) *JobInfo
-
-// ReservedNodesFn is the func declaration used to select the reserved nodes
-type ReservedNodesFn func()
-
-// VictimTasksFn is the func declaration used to select victim tasks
-type VictimTasksFn func() []*TaskInfo
-
-// UnderUsedResourceFn is the func declaration used to get under used resource list for queue
-type UnderUsedResourceFn func(*QueueInfo) ResourceNameList
-
-
-
-package api
-
-import (
- "fmt"
- "sort"
- "strings"
-)
-
-const (
- // NodePodNumberExceeded means pods in node exceed the allocatable pod number
- NodePodNumberExceeded = "node(s) pod number exceeded"
- // NodeResourceFitFailed means node could not fit the request of pod
- NodeResourceFitFailed = "node(s) resource fit failed"
-
- // AllNodeUnavailableMsg is the default error message
- AllNodeUnavailableMsg = "all nodes are unavailable"
-)
-
-// These are reasons for a pod's transition to a condition.
-const (
- // PodReasonUnschedulable reason in PodScheduled PodCondition means that the scheduler
- // can't schedule the pod right now, for example due to insufficient resources in the cluster.
- PodReasonUnschedulable = "Unschedulable"
- // PodReasonSchedulable reason in PodScheduled PodCondition means that the scheduler
- // can schedule the pod right now, but not bind yet
- PodReasonSchedulable = "Schedulable"
- // PodReasonUndetermined reason in PodScheduled PodCondition means that the scheduler
-	// skips scheduling the pod, leaving it `Undetermined`, for example because an unschedulable pod has already occurred.
- PodReasonUndetermined = "Undetermined"
-)
-
-// FitErrors is a set of FitError on many nodes
-type FitErrors struct {
- nodes map[string]*FitError
- err string
-}
-
-// NewFitErrors returns a new FitErrors
-func NewFitErrors() *FitErrors {
- f := new(FitErrors)
- f.nodes = make(map[string]*FitError)
- return f
-}
-
-// SetError sets the common error message in FitErrors
-func (f *FitErrors) SetError(err string) {
- f.err = err
-}
-
-// SetNodeError sets the node error in FitErrors
-func (f *FitErrors) SetNodeError(nodeName string, err error) {
- var fe *FitError
- switch obj := err.(type) {
- case *FitError:
- obj.NodeName = nodeName
- fe = obj
- default:
- fe = &FitError{
- NodeName: nodeName,
- Reasons: []string{obj.Error()},
- }
- }
-
- f.nodes[nodeName] = fe
-}
-
-// Error returns the final error message
-func (f *FitErrors) Error() string {
- reasons := make(map[string]int)
-
- for _, node := range f.nodes {
- for _, reason := range node.Reasons {
- reasons[reason]++
- }
- }
-
- sortReasonsHistogram := func() []string {
- reasonStrings := []string{}
- for k, v := range reasons {
- reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
- }
- sort.Strings(reasonStrings)
- return reasonStrings
- }
- if f.err == "" {
- f.err = AllNodeUnavailableMsg
- }
- reasonMsg := fmt.Sprintf(f.err+": %v.", strings.Join(sortReasonsHistogram(), ", "))
- return reasonMsg
-}
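-
-// Illustrative note (not part of the original file): Error aggregates the
-// per-node reasons into a sorted histogram, so three nodes failing on
-// resources and one on pod count render as:
-//
-//	all nodes are unavailable: 1 node(s) pod number exceeded, 3 node(s) resource fit failed.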
-
-// FitError describes the reason why the task could not fit on that node
-type FitError struct {
- taskNamespace string
- taskName string
- NodeName string
- Reasons []string
-}
-
-// NewFitError returns a FitError built from the given message
-func NewFitError(task *TaskInfo, node *NodeInfo, message ...string) *FitError {
- fe := &FitError{
- taskName: task.Name,
- taskNamespace: task.Namespace,
- NodeName: node.Name,
- Reasons: message,
- }
- return fe
-}
-
-// Error returns the final error message
-func (f *FitError) Error() string {
- return fmt.Sprintf("task %s/%s on node %s fit failed: %s", f.taskNamespace, f.taskName, f.NodeName, strings.Join(f.Reasons, ", "))
-}
-
-
-
-/*
- Copyright 2021 The Volcano Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package cache
-
-import (
- "context"
- "fmt"
- "os"
- "strconv"
- "strings"
- "sync"
- "time"
-
- v1 "k8s.io/api/core/v1"
- schedulingv1 "k8s.io/api/scheduling/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/runtime"
- utilruntime "k8s.io/apimachinery/pkg/util/runtime"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/informers"
- infov1 "k8s.io/client-go/informers/core/v1"
- schedv1 "k8s.io/client-go/informers/scheduling/v1"
- storagev1 "k8s.io/client-go/informers/storage/v1"
- storagev1alpha1 "k8s.io/client-go/informers/storage/v1alpha1"
- "k8s.io/client-go/kubernetes"
- corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
- "k8s.io/client-go/rest"
- "k8s.io/client-go/tools/cache"
- "k8s.io/client-go/tools/record"
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
- podutil "k8s.io/kubernetes/pkg/api/v1/pod"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/apis/pkg/apis/scheduling"
- schedulingscheme "volcano.sh/apis/pkg/apis/scheduling/scheme"
- vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- vcclient "volcano.sh/apis/pkg/client/clientset/versioned"
- "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
- vcinformer "volcano.sh/apis/pkg/client/informers/externalversions"
- cpuinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/nodeinfo/v1alpha1"
- vcinformerv1 "volcano.sh/apis/pkg/client/informers/externalversions/scheduling/v1beta1"
- "volcano.sh/volcano/cmd/scheduler/app/options"
- schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-func init() {
- schemeBuilder := runtime.SchemeBuilder{
- v1.AddToScheme,
- }
-
- utilruntime.Must(schemeBuilder.AddToScheme(scheme.Scheme))
-}
-
-// New returns a Cache implementation.
-func New(config *rest.Config, schedulerName string, defaultQueue string, nodeSelectors []string) Cache {
- return newSchedulerCache(config, schedulerName, defaultQueue, nodeSelectors)
-}
-
-// SchedulerCache is the cache for the kube batch scheduler
-type SchedulerCache struct {
- sync.Mutex
-
- kubeClient *kubernetes.Clientset
- vcClient *vcclient.Clientset
- defaultQueue string
- // schedulerName is the name for volcano scheduler
- schedulerName string
- nodeSelectorLabels map[string]string
-
- podInformer infov1.PodInformer
- nodeInformer infov1.NodeInformer
- podGroupInformerV1beta1 vcinformerv1.PodGroupInformer
- queueInformerV1beta1 vcinformerv1.QueueInformer
- pvInformer infov1.PersistentVolumeInformer
- pvcInformer infov1.PersistentVolumeClaimInformer
- scInformer storagev1.StorageClassInformer
- pcInformer schedv1.PriorityClassInformer
- quotaInformer infov1.ResourceQuotaInformer
- csiNodeInformer storagev1.CSINodeInformer
- csiDriverInformer storagev1.CSIDriverInformer
- csiStorageCapacityInformer storagev1alpha1.CSIStorageCapacityInformer
- cpuInformer cpuinformerv1.NumatopologyInformer
-
- Binder Binder
- Evictor Evictor
- StatusUpdater StatusUpdater
- PodGroupBinder BatchBinder
- VolumeBinder VolumeBinder
-
- Recorder record.EventRecorder
-
- Jobs map[schedulingapi.JobID]*schedulingapi.JobInfo
- Nodes map[string]*schedulingapi.NodeInfo
- Queues map[schedulingapi.QueueID]*schedulingapi.QueueInfo
- PriorityClasses map[string]*schedulingv1.PriorityClass
- NodeList []string
- defaultPriorityClass *schedulingv1.PriorityClass
- defaultPriority int32
-
- NamespaceCollection map[string]*schedulingapi.NamespaceCollection
-
- errTasks workqueue.RateLimitingInterface
- deletedJobs workqueue.RateLimitingInterface
-
- informerFactory informers.SharedInformerFactory
- vcInformerFactory vcinformer.SharedInformerFactory
-
- BindFlowChannel chan *schedulingapi.TaskInfo
- bindCache []*schedulingapi.TaskInfo
- batchNum int
-}
-
-type defaultBinder struct {
- kubeclient *kubernetes.Clientset
-}
-
-// Bind will send a bind request to the api server
-func (db *defaultBinder) Bind(kubeClient *kubernetes.Clientset, tasks []*schedulingapi.TaskInfo) (error, []*schedulingapi.TaskInfo) {
- var errTasks []*schedulingapi.TaskInfo
- for _, task := range tasks {
- p := task.Pod
- if err := kubeClient.CoreV1().Pods(p.Namespace).Bind(context.TODO(),
- &v1.Binding{
- ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID, Annotations: p.Annotations},
- Target: v1.ObjectReference{
- Kind: "Node",
- Name: task.NodeName,
- },
- },
- metav1.CreateOptions{}); err != nil {
- klog.Errorf("Failed to bind pod <%v/%v> to node %s : %#v", p.Namespace, p.Name, task.NodeName, err)
- errTasks = append(errTasks, task)
- }
- }
-
- if len(errTasks) > 0 {
- return fmt.Errorf("failed to bind pods"), errTasks
- }
-
- return nil, nil
-}
-
-// NewBinder returns a default binder.
-func NewBinder() *defaultBinder {
- return &defaultBinder{}
-}
-
-type defaultEvictor struct {
- kubeclient *kubernetes.Clientset
- recorder record.EventRecorder
-}
-
-// Evict will send a delete pod request to the api server
-func (de *defaultEvictor) Evict(p *v1.Pod, reason string) error {
- klog.V(3).Infof("Evicting pod %v/%v, because of %v", p.Namespace, p.Name, reason)
-
- evictMsg := fmt.Sprintf("Pod is evicted, because of %v", reason)
- annotations := map[string]string{}
- // record that we are evicting the pod
- de.recorder.AnnotatedEventf(p, annotations, v1.EventTypeWarning, "Evict", evictMsg)
-
- pod := p.DeepCopy()
- condition := &v1.PodCondition{
- Type: v1.PodReady,
- Status: v1.ConditionFalse,
- Reason: "Evict",
- Message: evictMsg,
- }
- if !podutil.UpdatePodCondition(&pod.Status, condition) {
- klog.V(1).Infof("UpdatePodCondition: existed condition, not update")
- klog.V(1).Infof("%+v", pod.Status.Conditions)
- return nil
- }
- if _, err := de.kubeclient.CoreV1().Pods(p.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}); err != nil {
- klog.Errorf("Failed to update pod <%v/%v> status: %v", pod.Namespace, pod.Name, err)
- return err
- }
- if err := de.kubeclient.CoreV1().Pods(p.Namespace).Delete(context.TODO(), p.Name, metav1.DeleteOptions{}); err != nil {
- klog.Errorf("Failed to evict pod <%v/%v>: %#v", p.Namespace, p.Name, err)
- return err
- }
-
- return nil
-}
-
-// defaultStatusUpdater is the default implementation of the StatusUpdater interface
-type defaultStatusUpdater struct {
- kubeclient *kubernetes.Clientset
- vcclient *vcclient.Clientset
-}
-
-// following the same logic as podutil.UpdatePodCondition
-func podConditionHaveUpdate(status *v1.PodStatus, condition *v1.PodCondition) bool {
- lastTransitionTime := metav1.Now()
- // Try to find this pod condition.
- _, oldCondition := podutil.GetPodCondition(status, condition.Type)
-
- if oldCondition == nil {
- // We are adding new pod condition.
- return true
- }
- // We are updating an existing condition, so we need to check if it has changed.
- if condition.Status == oldCondition.Status {
- lastTransitionTime = oldCondition.LastTransitionTime
- }
-
- isEqual := condition.Status == oldCondition.Status &&
- condition.Reason == oldCondition.Reason &&
- condition.Message == oldCondition.Message &&
- condition.LastProbeTime.Equal(&oldCondition.LastProbeTime) &&
- lastTransitionTime.Equal(&oldCondition.LastTransitionTime)
-
- // Return true if one of the fields have changed.
- return !isEqual
-}
-
-// UpdatePodCondition will update the pod with the given podCondition
-func (su *defaultStatusUpdater) UpdatePodCondition(pod *v1.Pod, condition *v1.PodCondition) (*v1.Pod, error) {
- klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s)", pod.Namespace, pod.Name, condition.Type, condition.Status)
- if podutil.UpdatePodCondition(&pod.Status, condition) {
- return su.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
- }
- return pod, nil
-}
-
-// UpdatePodGroup will update the PodGroup
-func (su *defaultStatusUpdater) UpdatePodGroup(pg *schedulingapi.PodGroup) (*schedulingapi.PodGroup, error) {
- podgroup := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
- klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
- return nil, err
- }
-
- updated, err := su.vcclient.SchedulingV1beta1().PodGroups(podgroup.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Error while updating PodGroup with error: %v", err)
- return nil, err
- }
-
- podGroupInfo := &schedulingapi.PodGroup{Version: schedulingapi.PodGroupVersionV1Beta1}
- if err := schedulingscheme.Scheme.Convert(updated, &podGroupInfo.PodGroup, nil); err != nil {
- klog.Errorf("Error while converting v1alpha.PodGroup to api.PodGroup with error: %v", err)
- return nil, err
- }
-
- return podGroupInfo, nil
-}
-
-type defaultVolumeBinder struct {
- volumeBinder volumescheduling.SchedulerVolumeBinder
-}
-
-// AllocateVolumes allocates volume on the host to the task
-func (dvb *defaultVolumeBinder) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
- allBound, err := dvb.volumeBinder.AssumePodVolumes(task.Pod, hostname, podVolumes)
- task.VolumeReady = allBound
-
- return err
-}
-
-// GetPodVolumes gets the pod volumes on the host
-func (dvb *defaultVolumeBinder) GetPodVolumes(task *schedulingapi.TaskInfo,
- node *v1.Node) (podVolumes *volumescheduling.PodVolumes, err error) {
- boundClaims, claimsToBind, _, err := dvb.volumeBinder.GetPodVolumes(task.Pod)
- if err != nil {
- return nil, err
- }
-
- podVolumes, _, err = dvb.volumeBinder.FindPodVolumes(task.Pod, boundClaims, claimsToBind, node)
- return podVolumes, err
-}
-
-// BindVolumes binds volumes to the task
-func (dvb *defaultVolumeBinder) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
-	// If the task's volumes are ready, do not bind them again.
- if task.VolumeReady {
- return nil
- }
-
- return dvb.volumeBinder.BindPodVolumes(task.Pod, podVolumes)
-}
-
-type podgroupBinder struct {
- kubeclient *kubernetes.Clientset
- vcclient *vcclient.Clientset
-}
-
-// Bind will add the silo cluster annotation on the pod and podgroup
-func (pgb *podgroupBinder) Bind(job *schedulingapi.JobInfo, cluster string) (*schedulingapi.JobInfo, error) {
- if len(job.Tasks) == 0 {
- klog.V(4).Infof("Job pods have not been created yet")
- return job, nil
- }
- for _, task := range job.Tasks {
- pod := task.Pod
- pod.Annotations[batch.ForwardClusterKey] = cluster
- pod.ResourceVersion = ""
- _, err := pgb.kubeclient.CoreV1().Pods(pod.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Error while update pod annotation with error: %v", err)
- return nil, err
- }
- }
-
- pg := job.PodGroup
- pg.Annotations[batch.ForwardClusterKey] = cluster
- podgroup := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&pg.PodGroup, podgroup, nil); err != nil {
- klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
- return nil, err
- }
- newPg, err := pgb.vcclient.SchedulingV1beta1().PodGroups(pg.Namespace).Update(context.TODO(), podgroup, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Error while update PodGroup annotation with error: %v", err)
- return nil, err
- }
- job.PodGroup.ResourceVersion = newPg.ResourceVersion
- klog.V(4).Infof("Bind PodGroup <%s> successfully", job.PodGroup.Name)
- return job, nil
-}
-
-func newSchedulerCache(config *rest.Config, schedulerName string, defaultQueue string, nodeSelectors []string) *SchedulerCache {
- kubeClient, err := kubernetes.NewForConfig(config)
- if err != nil {
- panic(fmt.Sprintf("failed init kubeClient, with err: %v", err))
- }
- vcClient, err := vcclient.NewForConfig(config)
- if err != nil {
- panic(fmt.Sprintf("failed init vcClient, with err: %v", err))
- }
- eventClient, err := kubernetes.NewForConfig(config)
- if err != nil {
- panic(fmt.Sprintf("failed init eventClient, with err: %v", err))
- }
-
- // create default queue
- reclaimable := true
- defaultQue := vcv1beta1.Queue{
- ObjectMeta: metav1.ObjectMeta{
- Name: defaultQueue,
- },
- Spec: vcv1beta1.QueueSpec{
- Reclaimable: &reclaimable,
- Weight: 1,
- },
- }
- if _, err := vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), &defaultQue, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
- panic(fmt.Sprintf("failed init default queue, with err: %v", err))
- }
-
- sc := &SchedulerCache{
- Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
- Nodes: make(map[string]*schedulingapi.NodeInfo),
- Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
- PriorityClasses: make(map[string]*schedulingv1.PriorityClass),
- errTasks: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
- deletedJobs: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
- kubeClient: kubeClient,
- vcClient: vcClient,
- defaultQueue: defaultQueue,
- schedulerName: schedulerName,
- nodeSelectorLabels: make(map[string]string),
- NamespaceCollection: make(map[string]*schedulingapi.NamespaceCollection),
-
- NodeList: []string{},
- }
- if len(nodeSelectors) > 0 {
- for _, nodeSelectorLabel := range nodeSelectors {
- nodeSelectorLabelLen := len(nodeSelectorLabel)
- if nodeSelectorLabelLen <= 0 {
- continue
- }
- // check input
- index := strings.Index(nodeSelectorLabel, ":")
- if index < 0 || index >= (nodeSelectorLabelLen-1) {
- continue
- }
- nodeSelectorLabelName := strings.TrimSpace(nodeSelectorLabel[:index])
- nodeSelectorLabelValue := strings.TrimSpace(nodeSelectorLabel[index+1:])
- key := nodeSelectorLabelName + ":" + nodeSelectorLabelValue
- sc.nodeSelectorLabels[key] = ""
- }
-
- }
- // Prepare event clients.
- broadcaster := record.NewBroadcaster()
- broadcaster.StartRecordingToSink(&corev1.EventSinkImpl{Interface: eventClient.CoreV1().Events("")})
- sc.Recorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: schedulerName})
-
- sc.BindFlowChannel = make(chan *schedulingapi.TaskInfo, 5000)
- sc.Binder = GetBindMethod()
-
- var batchNum int
- batchNum, err = strconv.Atoi(os.Getenv("BATCH_BIND_NUM"))
- if err == nil && batchNum > 0 {
- sc.batchNum = batchNum
- } else {
- sc.batchNum = 1
- }
-
- sc.Evictor = &defaultEvictor{
- kubeclient: sc.kubeClient,
- recorder: sc.Recorder,
- }
-
- sc.StatusUpdater = &defaultStatusUpdater{
- kubeclient: sc.kubeClient,
- vcclient: sc.vcClient,
- }
-
- sc.PodGroupBinder = &podgroupBinder{
- kubeclient: sc.kubeClient,
- vcclient: sc.vcClient,
- }
-
- informerFactory := informers.NewSharedInformerFactory(sc.kubeClient, 0)
- sc.informerFactory = informerFactory
- mySchedulerPodName, c := getMultiSchedulerInfo()
-
- // create informer for node information
- sc.nodeInformer = informerFactory.Core().V1().Nodes()
- sc.nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- node, ok := obj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Node: %v", obj)
- return false
- }
- if !responsibleForNode(node.Name, mySchedulerPodName, c) {
- return false
- }
- if len(sc.nodeSelectorLabels) == 0 {
- return true
- }
- for labelName, labelValue := range node.Labels {
- key := labelName + ":" + labelValue
- if _, ok := sc.nodeSelectorLabels[key]; ok {
- return true
- }
- }
- klog.Infof("node %s ignore add/update/delete into schedulerCache", node.Name)
- return false
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddNode,
- UpdateFunc: sc.UpdateNode,
- DeleteFunc: sc.DeleteNode,
- },
- },
- 0,
- )
-
- sc.podInformer = informerFactory.Core().V1().Pods()
- sc.pvcInformer = informerFactory.Core().V1().PersistentVolumeClaims()
- sc.pvInformer = informerFactory.Core().V1().PersistentVolumes()
- sc.scInformer = informerFactory.Storage().V1().StorageClasses()
- sc.csiNodeInformer = informerFactory.Storage().V1().CSINodes()
- sc.csiDriverInformer = informerFactory.Storage().V1().CSIDrivers()
- sc.csiStorageCapacityInformer = informerFactory.Storage().V1alpha1().CSIStorageCapacities()
-
- var capacityCheck *volumescheduling.CapacityCheck
- if options.ServerOpts.EnableCSIStorage {
- capacityCheck = &volumescheduling.CapacityCheck{
- CSIDriverInformer: sc.csiDriverInformer,
- CSIStorageCapacityInformer: sc.csiStorageCapacityInformer,
- }
- } else {
- capacityCheck = nil
- }
-
- sc.VolumeBinder = &defaultVolumeBinder{
- volumeBinder: volumescheduling.NewVolumeBinder(
- sc.kubeClient,
- sc.podInformer,
- sc.nodeInformer,
- sc.csiNodeInformer,
- sc.pvcInformer,
- sc.pvInformer,
- sc.scInformer,
- capacityCheck,
- 30*time.Second,
- ),
- }
-
- // create informer for pod information
- sc.podInformer.Informer().AddEventHandler(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *v1.Pod:
- if !responsibleForPod(v, schedulerName, mySchedulerPodName, c) {
- if len(v.Spec.NodeName) == 0 {
- return false
- }
- if !responsibleForNode(v.Spec.NodeName, mySchedulerPodName, c) {
- return false
- }
- }
- return true
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddPod,
- UpdateFunc: sc.UpdatePod,
- DeleteFunc: sc.DeletePod,
- },
- })
-
- if options.ServerOpts.EnablePriorityClass {
- sc.pcInformer = informerFactory.Scheduling().V1().PriorityClasses()
- sc.pcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddPriorityClass,
- UpdateFunc: sc.UpdatePriorityClass,
- DeleteFunc: sc.DeletePriorityClass,
- })
- }
-
- sc.quotaInformer = informerFactory.Core().V1().ResourceQuotas()
- sc.quotaInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddResourceQuota,
- UpdateFunc: sc.UpdateResourceQuota,
- DeleteFunc: sc.DeleteResourceQuota,
- })
-
- vcinformers := vcinformer.NewSharedInformerFactory(sc.vcClient, 0)
- sc.vcInformerFactory = vcinformers
-
- // create informer for PodGroup(v1beta1) information
- sc.podGroupInformerV1beta1 = vcinformers.Scheduling().V1beta1().PodGroups()
- sc.podGroupInformerV1beta1.Informer().AddEventHandler(
- cache.FilteringResourceEventHandler{
- FilterFunc: func(obj interface{}) bool {
- switch v := obj.(type) {
- case *vcv1beta1.PodGroup:
- return responsibleForPodGroup(v, mySchedulerPodName, c)
- default:
- return false
- }
- },
- Handler: cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddPodGroupV1beta1,
- UpdateFunc: sc.UpdatePodGroupV1beta1,
- DeleteFunc: sc.DeletePodGroupV1beta1,
- },
- })
-
- // create informer(v1beta1) for Queue information
- sc.queueInformerV1beta1 = vcinformers.Scheduling().V1beta1().Queues()
- sc.queueInformerV1beta1.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddQueueV1beta1,
- UpdateFunc: sc.UpdateQueueV1beta1,
- DeleteFunc: sc.DeleteQueueV1beta1,
- })
-
- sc.cpuInformer = vcinformers.Nodeinfo().V1alpha1().Numatopologies()
- sc.cpuInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
- AddFunc: sc.AddNumaInfoV1alpha1,
- UpdateFunc: sc.UpdateNumaInfoV1alpha1,
- DeleteFunc: sc.DeleteNumaInfoV1alpha1,
- })
- return sc
-}
-
-// Run starts the schedulerCache
-func (sc *SchedulerCache) Run(stopCh <-chan struct{}) {
- sc.informerFactory.Start(stopCh)
- sc.vcInformerFactory.Start(stopCh)
- // Re-sync error tasks.
- go wait.Until(sc.processResyncTask, 0, stopCh)
-
- // Cleanup jobs.
- go wait.Until(sc.processCleanupJob, 0, stopCh)
-
- go wait.Until(sc.processBindTask, time.Millisecond*20, stopCh)
-}
-
-// WaitForCacheSync syncs the cache with the api server
-func (sc *SchedulerCache) WaitForCacheSync(stopCh <-chan struct{}) {
- sc.informerFactory.WaitForCacheSync(stopCh)
- sc.vcInformerFactory.WaitForCacheSync(stopCh)
-}
-
-// findJobAndTask returns job and the task info
-func (sc *SchedulerCache) findJobAndTask(taskInfo *schedulingapi.TaskInfo) (*schedulingapi.JobInfo, *schedulingapi.TaskInfo, error) {
- job, found := sc.Jobs[taskInfo.Job]
- if !found {
- return nil, nil, fmt.Errorf("failed to find Job %v for Task %v",
- taskInfo.Job, taskInfo.UID)
- }
-
- task, found := job.Tasks[taskInfo.UID]
- if !found {
- return nil, nil, fmt.Errorf("failed to find task in status %v by id %v",
- taskInfo.Status, taskInfo.UID)
- }
-
- return job, task, nil
-}
-
-// Evict will evict the pod.
-//
-// If an error occurs, both task and job are guaranteed to be in the original state.
-func (sc *SchedulerCache) Evict(taskInfo *schedulingapi.TaskInfo, reason string) error {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- job, task, err := sc.findJobAndTask(taskInfo)
-
- if err != nil {
- return err
- }
-
- node, found := sc.Nodes[task.NodeName]
- if !found {
- return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
- task.UID, task.NodeName)
- }
-
- originalStatus := task.Status
- if err := job.UpdateTaskStatus(task, schedulingapi.Releasing); err != nil {
- return err
- }
-
- // Add new task to node.
- if err := node.UpdateTask(task); err != nil {
- // After failing to update task to a node we need to revert task status from Releasing,
- // otherwise task might be stuck in the Releasing state indefinitely.
- if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
- klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
- "from %s to %s after failing to update Task on Node <%s>: %v",
- task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
- sc.resyncTask(task)
- }
- return err
- }
-
- p := task.Pod
-
- go func() {
- err := sc.Evictor.Evict(p, reason)
- if err != nil {
- sc.resyncTask(task)
- }
- }()
-
- podgroup := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&job.PodGroup.PodGroup, podgroup, nil); err != nil {
- klog.Errorf("Error while converting PodGroup to v1alpha1.PodGroup with error: %v", err)
- return err
- }
- sc.Recorder.Eventf(podgroup, v1.EventTypeNormal, "Evict", reason)
- return nil
-}
-
-// Bind binds task to the target host.
-func (sc *SchedulerCache) Bind(tasks []*schedulingapi.TaskInfo) error {
- go func(taskArray []*schedulingapi.TaskInfo) {
- tmp := time.Now()
- err, errTasks := sc.Binder.Bind(sc.kubeClient, taskArray)
- if err == nil {
- klog.V(3).Infof("bind ok, latency %v", time.Since(tmp))
- for _, task := range tasks {
- sc.Recorder.Eventf(task.Pod, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v",
- task.Namespace, task.Name, task.NodeName)
- }
- } else {
- for _, task := range errTasks {
- klog.V(2).Infof("resyncTask task %s", task.Name)
- sc.resyncTask(task)
- }
- }
- }(tasks)
-
- return nil
-}
-
-// BindPodGroup binds job to silo cluster
-func (sc *SchedulerCache) BindPodGroup(job *schedulingapi.JobInfo, cluster string) error {
- if _, err := sc.PodGroupBinder.Bind(job, cluster); err != nil {
- klog.Errorf("Bind job <%s> to cluster <%s> failed: %v", job.Name, cluster, err)
- return err
- }
- return nil
-}
-
-// GetPodVolumes gets the pod volumes on the host
-func (sc *SchedulerCache) GetPodVolumes(task *schedulingapi.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
- return sc.VolumeBinder.GetPodVolumes(task, node)
-}
-
-// AllocateVolumes allocates volumes on the host to the task
-func (sc *SchedulerCache) AllocateVolumes(task *schedulingapi.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
- return sc.VolumeBinder.AllocateVolumes(task, hostname, podVolumes)
-}
-
-// BindVolumes binds volumes to the task
-func (sc *SchedulerCache) BindVolumes(task *schedulingapi.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
- return sc.VolumeBinder.BindVolumes(task, podVolumes)
-}
-
-// Client returns the kubernetes clientSet
-func (sc *SchedulerCache) Client() kubernetes.Interface {
- return sc.kubeClient
-}
-
-// SharedInformerFactory returns the scheduler SharedInformerFactory
-func (sc *SchedulerCache) SharedInformerFactory() informers.SharedInformerFactory {
- return sc.informerFactory
-}
-
-// UpdateSchedulerNumaInfo updates the NumaSchedulerInfo in the scheduler's node cache
-func (sc *SchedulerCache) UpdateSchedulerNumaInfo(AllocatedSets map[string]schedulingapi.ResNumaSets) error {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- for nodeName, sets := range AllocatedSets {
- if _, found := sc.Nodes[nodeName]; !found {
- continue
- }
-
- numaInfo := sc.Nodes[nodeName].NumaSchedulerInfo
- if numaInfo == nil {
- continue
- }
-
- numaInfo.Allocate(sets)
- }
- return nil
-}
-
-// taskUnschedulable updates the pod status of a pending task
-func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason, message string) error {
- pod := task.Pod
-
- condition := &v1.PodCondition{
- Type: v1.PodScheduled,
- Status: v1.ConditionFalse,
-		Reason:  reason, // Add more reasons to distinguish more specific scenarios of pending tasks
- Message: message,
- }
-
- if podConditionHaveUpdate(&pod.Status, condition) {
- pod = pod.DeepCopy()
-
-		// The reason field in 'Events' should be "FailedScheduling"; there is no constant defined for this in
-		// k8s core, so the same string literal is used here.
-		// The reason field in PodCondition can be "Unschedulable".
- sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
- if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil {
- return err
- }
- } else {
-		klog.V(4).Infof("task unschedulable %s/%s, message: %s, skipped because no condition update is needed", pod.Namespace, pod.Name, message)
- }
-
- return nil
-}
-
-func (sc *SchedulerCache) deleteJob(job *schedulingapi.JobInfo) {
- klog.V(3).Infof("Try to delete Job <%v:%v/%v>", job.UID, job.Namespace, job.Name)
-
- sc.deletedJobs.AddRateLimited(job)
-}
-
-func (sc *SchedulerCache) processCleanupJob() {
- obj, shutdown := sc.deletedJobs.Get()
- if shutdown {
- return
- }
-
- defer sc.deletedJobs.Done(obj)
-
- job, found := obj.(*schedulingapi.JobInfo)
- if !found {
- klog.Errorf("Failed to convert <%v> to *JobInfo", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if schedulingapi.JobTerminated(job) {
- delete(sc.Jobs, job.UID)
- klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name)
- } else {
- // Retry
- sc.deleteJob(job)
- }
-}
-
-func (sc *SchedulerCache) resyncTask(task *schedulingapi.TaskInfo) {
- sc.errTasks.AddRateLimited(task)
-}
-
-func (sc *SchedulerCache) processResyncTask() {
- obj, shutdown := sc.errTasks.Get()
- if shutdown {
- return
- }
-
- defer sc.errTasks.Done(obj)
-
- task, ok := obj.(*schedulingapi.TaskInfo)
- if !ok {
- klog.Errorf("failed to convert %v to *schedulingapi.TaskInfo", obj)
- return
- }
-
- if err := sc.syncTask(task); err != nil {
- klog.Errorf("Failed to sync pod <%v/%v>, retry it.", task.Namespace, task.Name)
- sc.resyncTask(task)
- }
-}
-
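-// AddBindTask moves the task to Binding status in the cache, adds it to its
-// target node, and queues it on the bind flow channel for batched binding.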
-func (sc *SchedulerCache) AddBindTask(taskInfo *schedulingapi.TaskInfo) error {
- klog.V(5).Infof("add bind task %v/%v", taskInfo.Namespace, taskInfo.Name)
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- job, task, err := sc.findJobAndTask(taskInfo)
- if err != nil {
- return err
- }
-
- node, found := sc.Nodes[taskInfo.NodeName]
- if !found {
- return fmt.Errorf("failed to bind Task %v to host %v, host does not exist",
- task.UID, taskInfo.NodeName)
- }
-
- originalStatus := task.Status
- if err := job.UpdateTaskStatus(task, schedulingapi.Binding); err != nil {
- return err
- }
-
- // Add task to the node.
- if err := node.AddTask(task); err != nil {
-		// After failing to add the task to a node we need to revert the task status from Binding,
-		// otherwise the task might be stuck in the Binding state indefinitely.
- if err := job.UpdateTaskStatus(task, originalStatus); err != nil {
- klog.Errorf("Task <%s/%s> will be resynchronized after failing to revert status "+
- "from %s to %s after failing to update Task on Node <%s>: %v",
- task.Namespace, task.Name, task.Status, originalStatus, node.Name, err)
- sc.resyncTask(task)
- }
- return err
- }
-
- sc.BindFlowChannel <- taskInfo
-
- return nil
-}
-
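-// processBindTask drains the bind flow channel, buffering tasks in the bind
-// cache and flushing them in batches via BindTask once batchNum tasks are
-// buffered or the channel becomes empty.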
-func (sc *SchedulerCache) processBindTask() {
- for {
- select {
- case taskInfo, ok := <-sc.BindFlowChannel:
- if !ok {
- return
- }
-
- sc.bindCache = append(sc.bindCache, taskInfo)
- if len(sc.bindCache) == sc.batchNum {
- sc.BindTask()
- }
- }
-
- if len(sc.BindFlowChannel) == 0 {
- break
- }
- }
-
- if len(sc.bindCache) == 0 {
- return
- }
-
- sc.BindTask()
-}
-
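-// BindTask flushes the bind cache: it binds the pod volumes for every cached
-// task and then dispatches the whole batch to the Binder asynchronously.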
-func (sc *SchedulerCache) BindTask() {
- klog.V(5).Infof("batch bind task count %d", len(sc.bindCache))
- for _, task := range sc.bindCache {
- if err := sc.BindVolumes(task, task.PodVolumes); err != nil {
- klog.Errorf("task %s/%s bind Volumes failed: %#v", task.Namespace, task.Name, err)
- sc.resyncTask(task)
- return
- }
- }
-
- bindTasks := make([]*schedulingapi.TaskInfo, len(sc.bindCache))
- copy(bindTasks, sc.bindCache)
- if err := sc.Bind(bindTasks); err != nil {
- return
- }
-
- for _, task := range sc.bindCache {
- metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
- }
-
- sc.bindCache = sc.bindCache[0:0]
- return
-}
-
-// Snapshot returns the complete snapshot of the cluster from cache
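-// Jobs are cloned in parallel goroutines, with cloneJobLock guarding
-// concurrent writes to the snapshot's job map.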
-func (sc *SchedulerCache) Snapshot() *schedulingapi.ClusterInfo {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- snapshot := &schedulingapi.ClusterInfo{
- Nodes: make(map[string]*schedulingapi.NodeInfo),
- Jobs: make(map[schedulingapi.JobID]*schedulingapi.JobInfo),
- Queues: make(map[schedulingapi.QueueID]*schedulingapi.QueueInfo),
- NamespaceInfo: make(map[schedulingapi.NamespaceName]*schedulingapi.NamespaceInfo),
- RevocableNodes: make(map[string]*schedulingapi.NodeInfo),
- NodeList: make([]string, len(sc.NodeList)),
- }
-
- copy(snapshot.NodeList, sc.NodeList)
- for _, value := range sc.Nodes {
- value.RefreshNumaSchedulerInfoByCrd()
- }
-
- for _, value := range sc.Nodes {
- if !value.Ready() {
- continue
- }
-
- snapshot.Nodes[value.Name] = value.Clone()
-
- if value.RevocableZone != "" {
- snapshot.RevocableNodes[value.Name] = snapshot.Nodes[value.Name]
- }
- }
-
- for _, value := range sc.Queues {
- snapshot.Queues[value.UID] = value.Clone()
- }
-
- var cloneJobLock sync.Mutex
- var wg sync.WaitGroup
-
- cloneJob := func(value *schedulingapi.JobInfo) {
- defer wg.Done()
- if value.PodGroup != nil {
- value.Priority = sc.defaultPriority
-
- priName := value.PodGroup.Spec.PriorityClassName
- if priorityClass, found := sc.PriorityClasses[priName]; found {
- value.Priority = priorityClass.Value
- }
-
- klog.V(4).Infof("The priority of job <%s/%s> is <%s/%d>",
- value.Namespace, value.Name, priName, value.Priority)
- }
-
- clonedJob := value.Clone()
-
- cloneJobLock.Lock()
- snapshot.Jobs[value.UID] = clonedJob
- cloneJobLock.Unlock()
- }
-
- for _, value := range sc.NamespaceCollection {
- info := value.Snapshot()
- snapshot.NamespaceInfo[info.Name] = info
- klog.V(4).Infof("Namespace %s has weight %v",
- value.Name, info.GetWeight())
- }
-
- for _, value := range sc.Jobs {
-		// If there is no scheduling spec, do not handle it.
- if value.PodGroup == nil {
- klog.V(4).Infof("The scheduling spec of Job <%v:%s/%s> is nil, ignore it.",
- value.UID, value.Namespace, value.Name)
-
- continue
- }
-
- if _, found := snapshot.Queues[value.Queue]; !found {
- klog.V(3).Infof("The Queue <%v> of Job <%v/%v> does not exist, ignore it.",
- value.Queue, value.Namespace, value.Name)
- continue
- }
-
- wg.Add(1)
- go cloneJob(value)
- }
- wg.Wait()
-
- klog.V(3).Infof("There are <%d> Jobs, <%d> Queues and <%d> Nodes in total for scheduling.",
- len(snapshot.Jobs), len(snapshot.Queues), len(snapshot.Nodes))
-
- return snapshot
-}
-
-// String returns information about the cache in a string format
-func (sc *SchedulerCache) String() string {
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- str := "Cache:\n"
-
- if len(sc.Nodes) != 0 {
- str += "Nodes:\n"
- for _, n := range sc.Nodes {
- str += fmt.Sprintf("\t %s: idle(%v) used(%v) allocatable(%v) pods(%d)\n",
- n.Name, n.Idle, n.Used, n.Allocatable, len(n.Tasks))
-
- i := 0
- for _, p := range n.Tasks {
- str += fmt.Sprintf("\t\t %d: %v\n", i, p)
- i++
- }
- }
- }
-
- if len(sc.Jobs) != 0 {
- str += "Jobs:\n"
- for _, job := range sc.Jobs {
- str += fmt.Sprintf("\t %s\n", job)
- }
- }
-
- if len(sc.NamespaceCollection) != 0 {
- str += "Namespaces:\n"
- for _, ns := range sc.NamespaceCollection {
- info := ns.Snapshot()
- str += fmt.Sprintf("\t Namespace(%s) Weight(%v)\n",
- info.Name, info.Weight)
- }
- }
-
- if len(sc.NodeList) != 0 {
- str += fmt.Sprintf("NodeList: %v\n", sc.NodeList)
- }
-
- return str
-}
-
-// RecordJobStatusEvent records related events according to job status.
-func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo) {
- pgUnschedulable := job.PodGroup != nil &&
- (job.PodGroup.Status.Phase == scheduling.PodGroupUnknown ||
- job.PodGroup.Status.Phase == scheduling.PodGroupPending ||
- job.PodGroup.Status.Phase == scheduling.PodGroupInqueue)
-
- // If pending or unschedulable, record unschedulable event.
- if pgUnschedulable {
- msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
- len(job.TaskStatusIndex[schedulingapi.Pending]),
- len(job.Tasks),
- job.FitError())
- sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeWarning, string(scheduling.PodGroupUnschedulableType), msg)
- } else {
- sc.recordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupScheduled), string(scheduling.PodGroupReady))
- }
-
- baseErrorMessage := job.JobFitErrors
- if baseErrorMessage == "" {
- baseErrorMessage = schedulingapi.AllNodeUnavailableMsg
- }
-	// Update podCondition for Allocated, Pending and Pipelined tasks before the job is discarded
- for _, status := range []schedulingapi.TaskStatus{schedulingapi.Allocated, schedulingapi.Pending, schedulingapi.Pipelined} {
- for _, taskInfo := range job.TaskStatusIndex[status] {
- reason, msg := job.TaskSchedulingReason(taskInfo.UID)
- if len(msg) == 0 {
- msg = baseErrorMessage
- }
- if err := sc.taskUnschedulable(taskInfo, reason, msg); err != nil {
- klog.Errorf("Failed to update unschedulable task status <%s/%s>: %v",
- taskInfo.Namespace, taskInfo.Name, err)
- }
- }
- }
-}
-
-// UpdateJobStatus update the status of job and its tasks.
-func (sc *SchedulerCache) UpdateJobStatus(job *schedulingapi.JobInfo, updatePG bool) (*schedulingapi.JobInfo, error) {
- if updatePG {
- pg, err := sc.StatusUpdater.UpdatePodGroup(job.PodGroup)
- if err != nil {
- return nil, err
- }
- job.PodGroup = pg
- }
-
- sc.RecordJobStatusEvent(job)
-
- return job, nil
-}
-
-func (sc *SchedulerCache) recordPodGroupEvent(podGroup *schedulingapi.PodGroup, eventType, reason, msg string) {
- if podGroup == nil {
- return
- }
-
- pg := &vcv1beta1.PodGroup{}
- if err := schedulingscheme.Scheme.Convert(&podGroup.PodGroup, pg, nil); err != nil {
-		klog.Errorf("Error while converting PodGroup to v1beta1.PodGroup with error: %v", err)
- return
- }
- sc.Recorder.Eventf(pg, eventType, reason, msg)
-}
-
-
-
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cache
-
-import (
- "context"
- "fmt"
- "strconv"
-
- v1 "k8s.io/api/core/v1"
- schedulingv1 "k8s.io/api/scheduling/v1"
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/client-go/tools/cache"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
-
- nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/apis/pkg/apis/scheduling/scheme"
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/apis/pkg/apis/utils"
- schedulingapi "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-func isTerminated(status schedulingapi.TaskStatus) bool {
- return status == schedulingapi.Succeeded || status == schedulingapi.Failed
-}
-
-// getOrCreateJob returns the corresponding Job for pi if it exists, or creates and returns a Job if
-// pi.Pod.Spec.SchedulerName is the same as the volcano scheduler's name; otherwise it returns nil.
-func (sc *SchedulerCache) getOrCreateJob(pi *schedulingapi.TaskInfo) *schedulingapi.JobInfo {
- if len(pi.Job) == 0 {
- if pi.Pod.Spec.SchedulerName != sc.schedulerName {
-			klog.V(4).Infof("Pod %s/%s will not be scheduled by %s, skip creating PodGroup and Job for it",
- pi.Pod.Namespace, pi.Pod.Name, sc.schedulerName)
- }
- return nil
- }
-
- if _, found := sc.Jobs[pi.Job]; !found {
- sc.Jobs[pi.Job] = schedulingapi.NewJobInfo(pi.Job)
- }
-
- return sc.Jobs[pi.Job]
-}
-
-func (sc *SchedulerCache) addTask(pi *schedulingapi.TaskInfo) error {
- if len(pi.NodeName) != 0 {
- if _, found := sc.Nodes[pi.NodeName]; !found {
- sc.Nodes[pi.NodeName] = schedulingapi.NewNodeInfo(nil)
- sc.Nodes[pi.NodeName].Name = pi.NodeName
- }
-
- node := sc.Nodes[pi.NodeName]
- if !isTerminated(pi.Status) {
- if err := node.AddTask(pi); err != nil {
- if _, outOfSync := err.(*schedulingapi.AllocateFailError); outOfSync {
- node.State = schedulingapi.NodeState{
- Phase: schedulingapi.NotReady,
- Reason: "OutOfSync",
- }
- }
- return err
- }
- } else {
- klog.V(4).Infof("Pod <%v/%v> is in status %s.", pi.Namespace, pi.Name, pi.Status.String())
- }
- }
-
- job := sc.getOrCreateJob(pi)
- if job != nil {
- job.AddTaskInfo(pi)
- }
-
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) addPod(pod *v1.Pod) error {
- pi := schedulingapi.NewTaskInfo(pod)
-
- return sc.addTask(pi)
-}
-
-func (sc *SchedulerCache) syncTask(oldTask *schedulingapi.TaskInfo) error {
- newPod, err := sc.kubeClient.CoreV1().Pods(oldTask.Namespace).Get(context.TODO(), oldTask.Name, metav1.GetOptions{})
- if err != nil {
- if errors.IsNotFound(err) {
- err := sc.deleteTask(oldTask)
- if err != nil {
- klog.Errorf("Failed to delete Pod <%v/%v> and remove from cache: %s", oldTask.Namespace, oldTask.Name, err.Error())
- return err
- }
- klog.V(3).Infof("Pod <%v/%v> was deleted, removed from cache.", oldTask.Namespace, oldTask.Name)
-
- return nil
- }
- return fmt.Errorf("failed to get Pod <%v/%v>: err %v", oldTask.Namespace, oldTask.Name, err)
- }
-
- newTask := schedulingapi.NewTaskInfo(newPod)
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- return sc.updateTask(oldTask, newTask)
-}
-
-func (sc *SchedulerCache) updateTask(oldTask, newTask *schedulingapi.TaskInfo) error {
- if err := sc.deleteTask(oldTask); err != nil {
- klog.Warningf("Failed to delete task: %v", err)
- }
-
- return sc.addTask(newTask)
-}
-
-// allocatedPodInCache checks whether the pod is in an allocated status in the cache
-func (sc *SchedulerCache) allocatedPodInCache(pod *v1.Pod) bool {
- pi := schedulingapi.NewTaskInfo(pod)
-
- if job, found := sc.Jobs[pi.Job]; found {
- if t, found := job.Tasks[pi.UID]; found {
- return schedulingapi.AllocatedStatus(t.Status)
- }
- }
-
- return false
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) updatePod(oldPod, newPod *v1.Pod) error {
-	// Ignore the update event if the pod is allocated in the cache but its NodeName is not set
- if sc.allocatedPodInCache(newPod) && newPod.Spec.NodeName == "" {
- klog.V(4).Infof("Pod <%s/%v> already in cache with allocated status, ignore the update event", newPod.Namespace, newPod.Name)
- return nil
- }
-
- if err := sc.deletePod(oldPod); err != nil {
- return err
- }
-	// When a pod is deleted, its ownerReferences may be cleared, making it look like an orphan pod; keep the old owner references in that case
- if len(utils.GetController(newPod)) == 0 {
- newPod.OwnerReferences = oldPod.OwnerReferences
- }
- return sc.addPod(newPod)
-}
-
-func (sc *SchedulerCache) deleteTask(pi *schedulingapi.TaskInfo) error {
- var jobErr, nodeErr, numaErr error
-
- if len(pi.Job) != 0 {
- if job, found := sc.Jobs[pi.Job]; found {
- jobErr = job.DeleteTaskInfo(pi)
- } else {
- jobErr = fmt.Errorf("failed to find Job <%v> for Task %v/%v",
- pi.Job, pi.Namespace, pi.Name)
- }
- }
-
- if len(pi.NodeName) != 0 {
- node := sc.Nodes[pi.NodeName]
- if node != nil {
- nodeErr = node.RemoveTask(pi)
- }
- }
-
- if jobErr != nil || nodeErr != nil {
- return schedulingapi.MergeErrors(jobErr, nodeErr, numaErr)
- }
-
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deletePod(pod *v1.Pod) error {
- pi := schedulingapi.NewTaskInfo(pod)
-
- // Delete the Task in cache to handle Binding status.
- task := pi
- if job, found := sc.Jobs[pi.Job]; found {
- if t, found := job.Tasks[pi.UID]; found {
- task = t
- }
- }
- if err := sc.deleteTask(task); err != nil {
- klog.Warningf("Failed to delete task: %v", err)
- }
-
- // If job was terminated, delete it.
- if job, found := sc.Jobs[pi.Job]; found && schedulingapi.JobTerminated(job) {
- sc.deleteJob(job)
- }
-
- return nil
-}
-
-// AddPod adds a pod to the scheduler cache
-func (sc *SchedulerCache) AddPod(obj interface{}) {
- pod, ok := obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Pod: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.addPod(pod)
- if err != nil {
- klog.Errorf("Failed to add pod <%s/%s> into cache: %v",
- pod.Namespace, pod.Name, err)
- return
- }
- klog.V(3).Infof("Added pod <%s/%v> into cache.", pod.Namespace, pod.Name)
-}
-
-// UpdatePod updates a pod in the scheduler cache
-func (sc *SchedulerCache) UpdatePod(oldObj, newObj interface{}) {
- oldPod, ok := oldObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *v1.Pod: %v", oldObj)
- return
- }
- newPod, ok := newObj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1.Pod: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.updatePod(oldPod, newPod)
- if err != nil {
- klog.Errorf("Failed to update pod %v in cache: %v", oldPod.Name, err)
- return
- }
-
- klog.V(4).Infof("Updated pod <%s/%v> in cache.", oldPod.Namespace, oldPod.Name)
-}
-
-// DeletePod deletes a pod from the scheduler cache
-func (sc *SchedulerCache) DeletePod(obj interface{}) {
- var pod *v1.Pod
- switch t := obj.(type) {
- case *v1.Pod:
- pod = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- pod, ok = t.Obj.(*v1.Pod)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Pod: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1.Pod: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.deletePod(pod)
- if err != nil {
- klog.Errorf("Failed to delete pod %v from cache: %v", pod.Name, err)
- return
- }
-
- klog.V(3).Infof("Deleted pod <%s/%v> from cache.", pod.Namespace, pod.Name)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) addNode(node *v1.Node) error {
- if sc.Nodes[node.Name] != nil {
- sc.Nodes[node.Name].SetNode(node)
- } else {
- sc.Nodes[node.Name] = schedulingapi.NewNodeInfo(node)
- }
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) updateNode(oldNode, newNode *v1.Node) error {
- if sc.Nodes[newNode.Name] != nil {
- sc.Nodes[newNode.Name].SetNode(newNode)
- return nil
- }
-
- return fmt.Errorf("node <%s> does not exist", newNode.Name)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deleteNode(node *v1.Node) error {
- if _, ok := sc.Nodes[node.Name]; !ok {
- return fmt.Errorf("node <%s> does not exist", node.Name)
- }
-
- numaInfo := sc.Nodes[node.Name].NumaInfo
- if numaInfo != nil {
- klog.V(3).Infof("delete numatopo <%s/%s>", numaInfo.Namespace, numaInfo.Name)
- err := sc.vcClient.NodeinfoV1alpha1().Numatopologies().Delete(context.TODO(), numaInfo.Name, metav1.DeleteOptions{})
- if err != nil {
- klog.Errorf("delete numatopo <%s/%s> failed.", numaInfo.Namespace, numaInfo.Name)
- }
- }
-
- delete(sc.Nodes, node.Name)
-
- return nil
-}
-
-// AddNode adds a node to the scheduler cache
-func (sc *SchedulerCache) AddNode(obj interface{}) {
- node, ok := obj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Node: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.addNode(node)
- if err != nil {
- klog.Errorf("Failed to add node %s into cache: %v", node.Name, err)
- return
- }
- sc.NodeList = append(sc.NodeList, node.Name)
-}
-
-// UpdateNode updates a node in the scheduler cache
-func (sc *SchedulerCache) UpdateNode(oldObj, newObj interface{}) {
- oldNode, ok := oldObj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *v1.Node: %v", oldObj)
- return
- }
- newNode, ok := newObj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1.Node: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.updateNode(oldNode, newNode)
- if err != nil {
- klog.Errorf("Failed to update node %v in cache: %v", oldNode.Name, err)
- return
- }
-}
-
-// DeleteNode deletes a node from the scheduler cache
-func (sc *SchedulerCache) DeleteNode(obj interface{}) {
- var node *v1.Node
- switch t := obj.(type) {
- case *v1.Node:
- node = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- node, ok = t.Obj.(*v1.Node)
- if !ok {
- klog.Errorf("Cannot convert to *v1.Node: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1.Node: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- err := sc.deleteNode(node)
- if err != nil {
- klog.Errorf("Failed to delete node %s from cache: %v", node.Name, err)
- return
- }
-
- for i, name := range sc.NodeList {
- if name == node.Name {
- sc.NodeList = append(sc.NodeList[:i], sc.NodeList[i+1:]...)
- break
- }
- }
-}
-
-func getJobID(pg *schedulingapi.PodGroup) schedulingapi.JobID {
- return schedulingapi.JobID(fmt.Sprintf("%s/%s", pg.Namespace, pg.Name))
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) setPodGroup(ss *schedulingapi.PodGroup) error {
- job := getJobID(ss)
- if _, found := sc.Jobs[job]; !found {
- sc.Jobs[job] = schedulingapi.NewJobInfo(job)
- }
-
- sc.Jobs[job].SetPodGroup(ss)
-
- // TODO(k82cn): set default queue in admission.
- if len(ss.Spec.Queue) == 0 {
- sc.Jobs[job].Queue = schedulingapi.QueueID(sc.defaultQueue)
- }
-
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) updatePodGroup(newPodGroup *schedulingapi.PodGroup) error {
- return sc.setPodGroup(newPodGroup)
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deletePodGroup(id schedulingapi.JobID) error {
- job, found := sc.Jobs[id]
- if !found {
-		return fmt.Errorf("cannot find job %v", id)
- }
-
- // Unset SchedulingSpec
- job.UnsetPodGroup()
-
- sc.deleteJob(job)
-
- return nil
-}
-
-// AddPodGroupV1beta1 adds a podgroup to the scheduler cache
-func (sc *SchedulerCache) AddPodGroupV1beta1(obj interface{}) {
- ss, ok := obj.(*schedulingv1beta1.PodGroup)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1beta1.PodGroup: %v", obj)
- return
- }
-
- podgroup := scheduling.PodGroup{}
- if err := scheme.Scheme.Convert(ss, &podgroup, nil); err != nil {
- klog.Errorf("Failed to convert podgroup from %T to %T", ss, podgroup)
- return
- }
-
- pg := &schedulingapi.PodGroup{PodGroup: podgroup, Version: schedulingapi.PodGroupVersionV1Beta1}
- klog.V(4).Infof("Add PodGroup(%s) into cache, spec(%#v)", ss.Name, ss.Spec)
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if err := sc.setPodGroup(pg); err != nil {
- klog.Errorf("Failed to add PodGroup %s into cache: %v", ss.Name, err)
- return
- }
-}
-
-// UpdatePodGroupV1beta1 updates a podgroup in the scheduler cache
-func (sc *SchedulerCache) UpdatePodGroupV1beta1(oldObj, newObj interface{}) {
- oldSS, ok := oldObj.(*schedulingv1beta1.PodGroup)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *schedulingv1beta1.SchedulingSpec: %v", oldObj)
- return
- }
- newSS, ok := newObj.(*schedulingv1beta1.PodGroup)
- if !ok {
- klog.Errorf("Cannot convert newObj to *schedulingv1beta1.SchedulingSpec: %v", newObj)
- return
- }
-
- if oldSS.ResourceVersion == newSS.ResourceVersion {
- return
- }
-
- podgroup := scheduling.PodGroup{}
- if err := scheme.Scheme.Convert(newSS, &podgroup, nil); err != nil {
- klog.Errorf("Failed to convert podgroup from %T to %T", newSS, podgroup)
- return
- }
-
- pg := &schedulingapi.PodGroup{PodGroup: podgroup, Version: schedulingapi.PodGroupVersionV1Beta1}
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if err := sc.updatePodGroup(pg); err != nil {
- klog.Errorf("Failed to update SchedulingSpec %s into cache: %v", pg.Name, err)
- return
- }
-}
-
-// DeletePodGroupV1beta1 deletes a podgroup from the scheduler cache
-func (sc *SchedulerCache) DeletePodGroupV1beta1(obj interface{}) {
- var ss *schedulingv1beta1.PodGroup
- switch t := obj.(type) {
- case *schedulingv1beta1.PodGroup:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*schedulingv1beta1.PodGroup)
- if !ok {
- klog.Errorf("Cannot convert to podgroup: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to podgroup: %v", t)
- return
- }
-
- jobID := schedulingapi.JobID(fmt.Sprintf("%s/%s", ss.Namespace, ss.Name))
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- if err := sc.deletePodGroup(jobID); err != nil {
- klog.Errorf("Failed to delete podgroup %s from cache: %v", ss.Name, err)
- return
- }
-}
-
-// AddQueueV1beta1 adds a queue to the scheduler cache
-func (sc *SchedulerCache) AddQueueV1beta1(obj interface{}) {
- ss, ok := obj.(*schedulingv1beta1.Queue)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1beta1.Queue: %v", obj)
- return
- }
-
- queue := &scheduling.Queue{}
- if err := scheme.Scheme.Convert(ss, queue, nil); err != nil {
- klog.Errorf("Failed to convert queue from %T to %T", ss, queue)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(4).Infof("Add Queue(%s) into cache, spec(%#v)", ss.Name, ss.Spec)
- sc.addQueue(queue)
-}
-
-// UpdateQueueV1beta1 updates a queue in the scheduler cache
-func (sc *SchedulerCache) UpdateQueueV1beta1(oldObj, newObj interface{}) {
- oldSS, ok := oldObj.(*schedulingv1beta1.Queue)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *schedulingv1beta1.Queue: %v", oldObj)
- return
- }
- newSS, ok := newObj.(*schedulingv1beta1.Queue)
- if !ok {
- klog.Errorf("Cannot convert newObj to *schedulingv1beta1.Queue: %v", newObj)
- return
- }
-
- if oldSS.ResourceVersion == newSS.ResourceVersion {
- return
- }
-
- newQueue := &scheduling.Queue{}
- if err := scheme.Scheme.Convert(newSS, newQueue, nil); err != nil {
- klog.Errorf("Failed to convert queue from %T to %T", newSS, newQueue)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.updateQueue(newQueue)
-}
-
-// DeleteQueueV1beta1 deletes a queue from the scheduler cache
-func (sc *SchedulerCache) DeleteQueueV1beta1(obj interface{}) {
- var ss *schedulingv1beta1.Queue
- switch t := obj.(type) {
- case *schedulingv1beta1.Queue:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*schedulingv1beta1.Queue)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1beta1.Queue: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *schedulingv1beta1.Queue: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.deleteQueue(schedulingapi.QueueID(ss.Name))
-}
-
-func (sc *SchedulerCache) addQueue(queue *scheduling.Queue) {
- qi := schedulingapi.NewQueueInfo(queue)
- sc.Queues[qi.UID] = qi
-}
-
-func (sc *SchedulerCache) updateQueue(queue *scheduling.Queue) {
- sc.addQueue(queue)
-}
-
-func (sc *SchedulerCache) deleteQueue(id schedulingapi.QueueID) {
- delete(sc.Queues, id)
-}
-
-// DeletePriorityClass deletes a priorityclass from the scheduler cache
-func (sc *SchedulerCache) DeletePriorityClass(obj interface{}) {
- var ss *schedulingv1.PriorityClass
- switch t := obj.(type) {
- case *schedulingv1.PriorityClass:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*schedulingv1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1.PriorityClass: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *schedulingv1.PriorityClass: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.deletePriorityClass(ss)
-}
-
-// UpdatePriorityClass updates a priorityclass in the scheduler cache
-func (sc *SchedulerCache) UpdatePriorityClass(oldObj, newObj interface{}) {
- oldSS, ok := oldObj.(*schedulingv1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert oldObj to *schedulingv1.PriorityClass: %v", oldObj)
-
- return
- }
-
- newSS, ok := newObj.(*schedulingv1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert newObj to *schedulingv1.PriorityClass: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.deletePriorityClass(oldSS)
- sc.addPriorityClass(newSS)
-}
-
-// AddPriorityClass adds a priorityclass to the scheduler cache
-func (sc *SchedulerCache) AddPriorityClass(obj interface{}) {
- ss, ok := obj.(*schedulingv1.PriorityClass)
- if !ok {
- klog.Errorf("Cannot convert to *schedulingv1.PriorityClass: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.addPriorityClass(ss)
-}
-
-func (sc *SchedulerCache) deletePriorityClass(pc *schedulingv1.PriorityClass) {
- if pc.GlobalDefault {
- sc.defaultPriorityClass = nil
- sc.defaultPriority = 0
- }
-
- delete(sc.PriorityClasses, pc.Name)
-}
-
-func (sc *SchedulerCache) addPriorityClass(pc *schedulingv1.PriorityClass) {
- if pc.GlobalDefault {
- if sc.defaultPriorityClass != nil {
- klog.Errorf("Updated default priority class from <%s> to <%s> forcefully.",
- sc.defaultPriorityClass.Name, pc.Name)
- }
- sc.defaultPriorityClass = pc
- sc.defaultPriority = pc.Value
- }
-
- sc.PriorityClasses[pc.Name] = pc
-}
-
-func (sc *SchedulerCache) updateResourceQuota(quota *v1.ResourceQuota) {
- collection, ok := sc.NamespaceCollection[quota.Namespace]
- if !ok {
- collection = schedulingapi.NewNamespaceCollection(quota.Namespace)
- sc.NamespaceCollection[quota.Namespace] = collection
- }
-
- collection.Update(quota)
-}
-
-func (sc *SchedulerCache) deleteResourceQuota(quota *v1.ResourceQuota) {
- collection, ok := sc.NamespaceCollection[quota.Namespace]
- if !ok {
- return
- }
-
- collection.Delete(quota)
-}
-
-// DeleteResourceQuota deletes a ResourceQuota from the scheduler cache
-func (sc *SchedulerCache) DeleteResourceQuota(obj interface{}) {
- var r *v1.ResourceQuota
- switch t := obj.(type) {
- case *v1.ResourceQuota:
- r = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- r, ok = t.Obj.(*v1.ResourceQuota)
- if !ok {
- klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(3).Infof("Delete ResourceQuota <%s/%v> in cache", r.Namespace, r.Name)
- sc.deleteResourceQuota(r)
-}
-
-// UpdateResourceQuota updates a ResourceQuota in the scheduler cache
-func (sc *SchedulerCache) UpdateResourceQuota(oldObj, newObj interface{}) {
- newR, ok := newObj.(*v1.ResourceQuota)
- if !ok {
- klog.Errorf("Cannot convert newObj to *v1.ResourceQuota: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(3).Infof("Update ResourceQuota <%s/%v> in cache, with spec: %v.", newR.Namespace, newR.Name, newR.Spec.Hard)
- sc.updateResourceQuota(newR)
-}
-
-// AddResourceQuota adds a ResourceQuota to the scheduler cache
-func (sc *SchedulerCache) AddResourceQuota(obj interface{}) {
- var r *v1.ResourceQuota
- switch t := obj.(type) {
- case *v1.ResourceQuota:
- r = t
- default:
- klog.Errorf("Cannot convert to *v1.ResourceQuota: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- klog.V(3).Infof("Add ResourceQuota <%s/%v> in cache, with spec: %v.", r.Namespace, r.Name, r.Spec.Hard)
- sc.updateResourceQuota(r)
-}
-
-func getNumaInfo(srcInfo *nodeinfov1alpha1.Numatopology) *schedulingapi.NumatopoInfo {
- numaInfo := &schedulingapi.NumatopoInfo{
- Namespace: srcInfo.Namespace,
- Name: srcInfo.Name,
- Policies: make(map[nodeinfov1alpha1.PolicyName]string),
- NumaResMap: make(map[string]*schedulingapi.ResourceInfo),
- CPUDetail: topology.CPUDetails{},
- ResReserved: make(v1.ResourceList),
- }
-
- policies := srcInfo.Spec.Policies
- for name, policy := range policies {
- numaInfo.Policies[name] = policy
- }
-
- numaResMap := srcInfo.Spec.NumaResMap
- for name, resInfo := range numaResMap {
- tmp := schedulingapi.ResourceInfo{}
- tmp.Capacity = resInfo.Capacity
- tmp.Allocatable = cpuset.MustParse(resInfo.Allocatable)
- numaInfo.NumaResMap[name] = &tmp
- }
-
- cpuDetail := srcInfo.Spec.CPUDetail
- for key, detail := range cpuDetail {
- cpuID, _ := strconv.Atoi(key)
- numaInfo.CPUDetail[cpuID] = topology.CPUInfo{
- NUMANodeID: detail.NUMANodeID,
- SocketID: detail.SocketID,
- CoreID: detail.CoreID,
- }
- }
-
- resReserved, err := schedulingapi.ParseResourceList(srcInfo.Spec.ResReserved)
- if err != nil {
- klog.Errorf("ParseResourceList failed, err=%v", err)
- } else {
- numaInfo.ResReserved = resReserved
- }
-
- return numaInfo
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) addNumaInfo(info *nodeinfov1alpha1.Numatopology) error {
- if sc.Nodes[info.Name] == nil {
- sc.Nodes[info.Name] = schedulingapi.NewNodeInfo(nil)
- sc.Nodes[info.Name].Name = info.Name
- }
-
- if sc.Nodes[info.Name].NumaInfo == nil {
- sc.Nodes[info.Name].NumaInfo = getNumaInfo(info)
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoMoreFlag
- } else {
- newLocalInfo := getNumaInfo(info)
- if sc.Nodes[info.Name].NumaInfo.Compare(newLocalInfo) {
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoMoreFlag
- } else {
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoLessFlag
- }
-
- sc.Nodes[info.Name].NumaInfo = newLocalInfo
- }
-
- for resName, NumaResInfo := range sc.Nodes[info.Name].NumaInfo.NumaResMap {
- klog.V(3).Infof("resource %s Allocatable %v on node[%s] into cache", resName, NumaResInfo, info.Name)
- }
-
- klog.V(3).Infof("Policies %v on node[%s] into cache, change= %v",
- sc.Nodes[info.Name].NumaInfo.Policies, info.Name, sc.Nodes[info.Name].NumaChgFlag)
- return nil
-}
-
-// Assumes that lock is already acquired.
-func (sc *SchedulerCache) deleteNumaInfo(info *nodeinfov1alpha1.Numatopology) {
- if sc.Nodes[info.Name] != nil {
- sc.Nodes[info.Name].NumaInfo = nil
- sc.Nodes[info.Name].NumaChgFlag = schedulingapi.NumaInfoResetFlag
-		klog.V(3).Infof("delete numaInfo in cache for node<%s>", info.Name)
- }
-}
-
-// AddNumaInfoV1alpha1 adds numa information to the scheduler cache
-func (sc *SchedulerCache) AddNumaInfoV1alpha1(obj interface{}) {
- ss, ok := obj.(*nodeinfov1alpha1.Numatopology)
- if !ok {
-		klog.Errorf("Cannot convert obj to *nodeinfov1alpha1.Numatopology: %v", obj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.addNumaInfo(ss)
-}
-
-// UpdateNumaInfoV1alpha1 updates numa information in the scheduler cache
-func (sc *SchedulerCache) UpdateNumaInfoV1alpha1(oldObj, newObj interface{}) {
- ss, ok := newObj.(*nodeinfov1alpha1.Numatopology)
- if !ok {
-		klog.Errorf("Cannot convert newObj to *nodeinfov1alpha1.Numatopology: %v", newObj)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.addNumaInfo(ss)
-	klog.V(3).Infof("update numaInfo<%s> in cache, with spec: Policy: %v, resMap: %v", ss.Name, ss.Spec.Policies, ss.Spec.NumaResMap)
-}
-
-// DeleteNumaInfoV1alpha1 deletes numa information from the scheduler cache
-func (sc *SchedulerCache) DeleteNumaInfoV1alpha1(obj interface{}) {
- var ss *nodeinfov1alpha1.Numatopology
- switch t := obj.(type) {
- case *nodeinfov1alpha1.Numatopology:
- ss = t
- case cache.DeletedFinalStateUnknown:
- var ok bool
- ss, ok = t.Obj.(*nodeinfov1alpha1.Numatopology)
- if !ok {
- klog.Errorf("Cannot convert to Numatopo: %v", t.Obj)
- return
- }
- default:
- klog.Errorf("Cannot convert to Numatopo: %v", t)
- return
- }
-
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
-
- sc.deleteNumaInfo(ss)
-	klog.V(3).Infof("Delete numaInfo<%s> from cache, with spec: Policy: %v, resMap: %v", ss.Name, ss.Spec.Policies, ss.Spec.NumaResMap)
-}
-
-// AddJob adds a job to the scheduler cache
-func (sc *SchedulerCache) AddJob(obj interface{}) {
- job, ok := obj.(*schedulingapi.JobInfo)
- if !ok {
- klog.Errorf("Cannot convert to *api.JobInfo: %v", obj)
- return
- }
- sc.Mutex.Lock()
- defer sc.Mutex.Unlock()
- sc.Jobs[job.UID] = job
-}
-
-
-
-package cache
-
-// bindMethodMap Binder management
-var bindMethodMap Binder
-
-// RegisterBindMethod registers the bind method
-func RegisterBindMethod(binder Binder) {
- bindMethodMap = binder
-}
-
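-// GetBindMethod returns the registered bind method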
-func GetBindMethod() Binder {
- return bindMethodMap
-}
-
-func init() {
- RegisterBindMethod(NewBinder())
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cache
-
-import (
- "fmt"
- "os"
- "strconv"
- "strings"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
- "stathat.com/c/consistent"
-
- scheduling "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
-)
-
-// responsibleForPod returns false under the following conditions:
-// 1. The current scheduler is not the scheduler specified in the Pod's spec.
-// 2. The Job to which the Pod belongs is not assigned to the current scheduler, based on the hash algorithm in the multi-scheduler scenario.
-func responsibleForPod(pod *v1.Pod, schedulerName string, mySchedulerPodName string, c *consistent.Consistent) bool {
- if schedulerName != pod.Spec.SchedulerName {
- return false
- }
- if c != nil {
- var key string
- if len(pod.OwnerReferences) != 0 {
- key = pod.OwnerReferences[0].Name
- } else {
- key = pod.Name
- }
- schedulerPodName, err := c.Get(key)
- if err != nil {
- klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
- }
- if schedulerPodName != mySchedulerPodName {
- return false
- }
- }
-
-	klog.V(4).Infof("schedulerPodName %v is responsible for Pod %v/%v", mySchedulerPodName, pod.Namespace, pod.Name)
- return true
-}
-
-// responsibleForNode returns true if the Node is assigned to the current scheduler in the multi-scheduler scenario
-func responsibleForNode(nodeName string, mySchedulerPodName string, c *consistent.Consistent) bool {
- if c != nil {
- schedulerPodName, err := c.Get(nodeName)
- if err != nil {
- klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
- }
- if schedulerPodName != mySchedulerPodName {
- return false
- }
- }
-
-	klog.V(4).Infof("schedulerPodName %v is responsible for Node %v", mySchedulerPodName, nodeName)
- return true
-}
-
-// responsibleForPodGroup returns true if the Job to which the PodGroup belongs is assigned to the current scheduler in the multi-scheduler scenario
-func responsibleForPodGroup(pg *scheduling.PodGroup, mySchedulerPodName string, c *consistent.Consistent) bool {
- if c != nil {
- var key string
- if len(pg.OwnerReferences) != 0 {
- key = pg.OwnerReferences[0].Name
- } else {
- key = pg.Name
- }
- schedulerPodName, err := c.Get(key)
- if err != nil {
- klog.Errorf("Failed to get scheduler by hash algorithm, err: %v", err)
- }
- if schedulerPodName != mySchedulerPodName {
- return false
- }
- }
-
-	klog.V(4).Infof("schedulerPodName %v is responsible for PodGroup %v/%v", mySchedulerPodName, pg.Namespace, pg.Name)
- return true
-}
-
-// getMultiSchedulerInfo returns the Pod name of the current scheduler and the consistent hash ring for all schedulers
-func getMultiSchedulerInfo() (schedulerPodName string, c *consistent.Consistent) {
- multiSchedulerEnable := os.Getenv("MULTI_SCHEDULER_ENABLE")
- mySchedulerPodName := os.Getenv("SCHEDULER_POD_NAME")
- c = nil
- if multiSchedulerEnable == "true" {
- klog.V(3).Infof("multiSchedulerEnable true")
- schedulerNumStr := os.Getenv("SCHEDULER_NUM")
- schedulerNum, err := strconv.Atoi(schedulerNumStr)
- if err != nil {
- schedulerNum = 1
- }
- index := strings.LastIndex(mySchedulerPodName, "-")
- baseName := mySchedulerPodName[0:index]
- c = consistent.New()
- for i := 0; i < schedulerNum; i++ {
- name := fmt.Sprintf("%s-%d", baseName, i)
- c.Add(name)
- }
- }
- return mySchedulerPodName, c
-}
-
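-// The helpers above rely on consistent hashing so that every scheduler
-// replica derives the same owner for a given key. A minimal illustrative
-// sketch (the replica and node names below are assumptions for the example,
-// not values used by the scheduler):
-func exampleConsistentOwnership() {
-	c := consistent.New()
-	c.Add("volcano-scheduler-0")
-	c.Add("volcano-scheduler-1")
-
-	// Every replica computes the same owner for "node-1", so exactly one
-	// scheduler ends up responsible for that node.
-	if owner, err := c.Get("node-1"); err == nil {
-		klog.V(4).Infof("node-1 is handled by %v", owner)
-	}
-}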
-
-
-/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "strconv"
-
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/conf"
-)
-
-// Arguments map
-type Arguments map[string]string
-
-// GetInt gets the integer value from the string argument
-func (a Arguments) GetInt(ptr *int, key string) {
- if ptr == nil {
- return
- }
-
- argv, ok := a[key]
- if !ok || argv == "" {
- return
- }
-
- value, err := strconv.Atoi(argv)
- if err != nil {
- klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
- return
- }
-
- *ptr = value
-}
-
-// GetFloat64 gets the float64 value from the string argument
-func (a Arguments) GetFloat64(ptr *float64, key string) {
- if ptr == nil {
- return
- }
-
- argv, ok := a[key]
- if !ok || len(argv) == 0 {
- return
- }
-
- value, err := strconv.ParseFloat(argv, 64)
- if err != nil {
- klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
- return
- }
-
- *ptr = value
-}
-
-// GetBool gets the bool value from the string argument
-func (a Arguments) GetBool(ptr *bool, key string) {
- if ptr == nil {
- return
- }
-
- argv, ok := a[key]
- if !ok || argv == "" {
- return
- }
-
- value, err := strconv.ParseBool(argv)
- if err != nil {
- klog.Warningf("Could not parse argument: %s for key %s, with err %v", argv, key, err)
- return
- }
-
- *ptr = value
-}
-
-// GetArgOfActionFromConf returns the arguments of the given action from the scheduler configuration
-func GetArgOfActionFromConf(configurations []conf.Configuration, actionName string) Arguments {
- for _, c := range configurations {
- if c.Name == actionName {
- return c.Arguments
- }
- }
-
- return nil
-}
-
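-// A minimal sketch of how a plugin might consume Arguments with the helpers
-// above; the key names and defaults are illustrative assumptions only.
-func exampleParseArguments(args Arguments) (weight int, enablePreempt bool) {
-	weight = 1           // kept when the key is absent or malformed
-	enablePreempt = true // kept when the key is absent or malformed
-	args.GetInt(&weight, "example.weight")
-	args.GetBool(&enablePreempt, "example.enablePreemptable")
-	return weight, enablePreempt
-}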
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "time"
-
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/cache"
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-// OpenSession starts the session
-func OpenSession(cache cache.Cache, tiers []conf.Tier, configurations []conf.Configuration) *Session {
- ssn := openSession(cache)
- ssn.Tiers = tiers
- ssn.Configurations = configurations
-
- for _, tier := range tiers {
- for _, plugin := range tier.Plugins {
- if pb, found := GetPluginBuilder(plugin.Name); !found {
- klog.Errorf("Failed to get plugin %s.", plugin.Name)
- } else {
- plugin := pb(plugin.Arguments)
- ssn.plugins[plugin.Name()] = plugin
- onSessionOpenStart := time.Now()
- plugin.OnSessionOpen(ssn)
- metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionOpen, metrics.Duration(onSessionOpenStart))
- }
- }
- }
- return ssn
-}
-
-// CloseSession closes the session
-func CloseSession(ssn *Session) {
- for _, plugin := range ssn.plugins {
- onSessionCloseStart := time.Now()
- plugin.OnSessionClose(ssn)
- metrics.UpdatePluginDuration(plugin.Name(), metrics.OnSessionClose, metrics.Duration(onSessionCloseStart))
- }
-
- closeSession(ssn)
-}
-
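-// Typical per-cycle usage of the pair above (an illustrative sketch of the
-// caller, not code from this file; the action loop is an assumption):
-//
-//	ssn := framework.OpenSession(schedCache, tiers, configurations)
-//	defer framework.CloseSession(ssn)
-//	for _, action := range actions {
-//		action.Execute(ssn)
-//	}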
-
-
-package framework
-
-import (
-	"context"
-	"math/rand"
-	"time"
-
-	"k8s.io/apimachinery/pkg/api/equality"
-	"k8s.io/client-go/util/workqueue"
-	"k8s.io/klog"
-
-	"volcano.sh/apis/pkg/apis/scheduling"
-	"volcano.sh/volcano/pkg/scheduler/api"
-)
-
-const (
- jobUpdaterWorker = 16
-
- jobConditionUpdateTime = time.Minute
- jobConditionUpdateTimeJitter = 30 * time.Second
-)
-
-// TimeJitterAfter reports whether new is after old + duration + jitter, where jitter is a random value in [0, maxJitter).
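-// For example, with duration = jobConditionUpdateTime (1m) and
-// maxJitter = jobConditionUpdateTimeJitter (30s), a new condition triggers an
-// update only when its LastTransitionTime is more than roughly 1m to 1m30s
-// after the old one's.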
-func TimeJitterAfter(new, old time.Time, duration, maxJitter time.Duration) bool {
- var jitter int64
- if maxJitter > 0 {
- jitter = rand.Int63n(int64(maxJitter))
- }
- return new.After(old.Add(duration + time.Duration(jitter)))
-}
-
-type jobUpdater struct {
- ssn *Session
- jobQueue []*api.JobInfo
-}
-
-func newJobUpdater(ssn *Session) *jobUpdater {
- queue := make([]*api.JobInfo, 0, len(ssn.Jobs))
- for _, job := range ssn.Jobs {
- queue = append(queue, job)
- }
-
- ju := &jobUpdater{
- ssn: ssn,
- jobQueue: queue,
- }
- return ju
-}
-
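-// UpdateAll pushes the collected job status updates back through the cache,
-// processing the job queue with jobUpdaterWorker parallel workers.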
-func (ju *jobUpdater) UpdateAll() {
- workqueue.ParallelizeUntil(context.TODO(), jobUpdaterWorker, len(ju.jobQueue), ju.updateJob)
-}
-
-func isPodGroupConditionsUpdated(newCondition, oldCondition []scheduling.PodGroupCondition) bool {
- if len(newCondition) != len(oldCondition) {
- return true
- }
-
- for index, newCond := range newCondition {
- oldCond := oldCondition[index]
-
- newTime := newCond.LastTransitionTime
- oldTime := oldCond.LastTransitionTime
- if TimeJitterAfter(newTime.Time, oldTime.Time, jobConditionUpdateTime, jobConditionUpdateTimeJitter) {
- return true
- }
-
- // if newCond is not new enough, we treat it the same as the old one
- newCond.LastTransitionTime = oldTime
-
- // comparing should ignore the TransitionID
- newTransitionID := newCond.TransitionID
- newCond.TransitionID = oldCond.TransitionID
-
- shouldUpdate := !equality.Semantic.DeepEqual(&newCond, &oldCond)
-
- newCond.LastTransitionTime = newTime
- newCond.TransitionID = newTransitionID
- if shouldUpdate {
- return true
- }
- }
-
- return false
-}
-
-func isPodGroupStatusUpdated(newStatus, oldStatus scheduling.PodGroupStatus) bool {
- newCondition := newStatus.Conditions
- newStatus.Conditions = nil
- oldCondition := oldStatus.Conditions
- oldStatus.Conditions = nil
-
- return !equality.Semantic.DeepEqual(newStatus, oldStatus) || isPodGroupConditionsUpdated(newCondition, oldCondition)
-}
-
-// updateJob updates the specified job
-func (ju *jobUpdater) updateJob(index int) {
- job := ju.jobQueue[index]
- ssn := ju.ssn
-
- job.PodGroup.Status = jobStatus(ssn, job)
- oldStatus, found := ssn.podGroupStatus[job.UID]
- updatePG := !found || isPodGroupStatusUpdated(job.PodGroup.Status, oldStatus)
- if _, err := ssn.cache.UpdateJobStatus(job, updatePG); err != nil {
- klog.Errorf("Failed to update job <%s/%s>: %v",
- job.Namespace, job.Name, err)
- }
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "fmt"
- "path/filepath"
- "plugin"
- "strings"
- "sync"
-
- "k8s.io/klog"
-)
-
-var pluginMutex sync.Mutex
-
-// PluginBuilder is the builder function type used to construct a Plugin from its Arguments
-type PluginBuilder = func(Arguments) Plugin
-
-// Plugin management
-var pluginBuilders = map[string]PluginBuilder{}
-
-// RegisterPluginBuilder registers the plugin builder
-func RegisterPluginBuilder(name string, pc PluginBuilder) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- pluginBuilders[name] = pc
-}
-
-// CleanupPluginBuilders cleans up all the plugin builders
-func CleanupPluginBuilders() {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- pluginBuilders = map[string]PluginBuilder{}
-}
-
-// GetPluginBuilder gets the plugin builder by name
-func GetPluginBuilder(name string) (PluginBuilder, bool) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- pb, found := pluginBuilders[name]
- return pb, found
-}
-
-// LoadCustomPlugins loads custom plugin implementations from .so files in pluginsDir
-func LoadCustomPlugins(pluginsDir string) error {
- pluginPaths, _ := filepath.Glob(fmt.Sprintf("%s/*.so", pluginsDir))
- for _, pluginPath := range pluginPaths {
- pluginBuilder, err := loadPluginBuilder(pluginPath)
- if err != nil {
- return err
- }
- pluginName := getPluginName(pluginPath)
- RegisterPluginBuilder(pluginName, pluginBuilder)
- klog.V(4).Infof("Custom plugin %s loaded", pluginName)
- }
-
- return nil
-}
-
-func getPluginName(pluginPath string) string {
- return strings.TrimSuffix(filepath.Base(pluginPath), filepath.Ext(pluginPath))
-}
-
-func loadPluginBuilder(pluginPath string) (PluginBuilder, error) {
- plug, err := plugin.Open(pluginPath)
- if err != nil {
- return nil, err
- }
-
- symBuilder, err := plug.Lookup("New")
- if err != nil {
- return nil, err
- }
-
- builder, ok := symBuilder.(PluginBuilder)
- if !ok {
- return nil, fmt.Errorf("unexpected plugin: %s, failed to convert PluginBuilder `New`", pluginPath)
- }
-
- return builder, nil
-}
-
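-// A minimal sketch of what a custom plugin compiled into a .so file could
-// look like; LoadCustomPlugins resolves its exported New symbol. The plugin
-// name and no-op behaviour below are illustrative assumptions only.
-//
-//	type noopPlugin struct{}
-//
-//	func (p *noopPlugin) Name() string                { return "noop" }
-//	func (p *noopPlugin) OnSessionOpen(ssn *Session)  {}
-//	func (p *noopPlugin) OnSessionClose(ssn *Session) {}
-//
-//	// New is the symbol that loadPluginBuilder looks up in the shared object.
-//	func New(arguments Arguments) Plugin { return &noopPlugin{} }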
-// Action management
-var actionMap = map[string]Action{}
-
-// RegisterAction registers the action
-func RegisterAction(act Action) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- actionMap[act.Name()] = act
-}
-
-// GetAction gets the action by name
-func GetAction(name string) (Action, bool) {
- pluginMutex.Lock()
- defer pluginMutex.Unlock()
-
- act, found := actionMap[name]
- return act, found
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/apimachinery/pkg/util/uuid"
- "k8s.io/client-go/informers"
- "k8s.io/client-go/kubernetes"
- "k8s.io/klog"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/cache"
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-// Session information for the current session
-type Session struct {
- UID types.UID
-
- kubeClient kubernetes.Interface
- cache cache.Cache
- informerFactory informers.SharedInformerFactory
-
- TotalResource *api.Resource
-	// podGroupStatus caches podgroup status during scheduling.
-	// It should not be mutated after being initialized.
- podGroupStatus map[api.JobID]scheduling.PodGroupStatus
-
- Jobs map[api.JobID]*api.JobInfo
- Nodes map[string]*api.NodeInfo
- RevocableNodes map[string]*api.NodeInfo
- Queues map[api.QueueID]*api.QueueInfo
- NamespaceInfo map[api.NamespaceName]*api.NamespaceInfo
-
- Tiers []conf.Tier
- Configurations []conf.Configuration
- NodeList []*api.NodeInfo
-
- plugins map[string]Plugin
- eventHandlers []*EventHandler
- jobOrderFns map[string]api.CompareFn
- queueOrderFns map[string]api.CompareFn
- taskOrderFns map[string]api.CompareFn
- namespaceOrderFns map[string]api.CompareFn
- clusterOrderFns map[string]api.CompareFn
- predicateFns map[string]api.PredicateFn
- bestNodeFns map[string]api.BestNodeFn
- nodeOrderFns map[string]api.NodeOrderFn
- batchNodeOrderFns map[string]api.BatchNodeOrderFn
- nodeMapFns map[string]api.NodeMapFn
- nodeReduceFns map[string]api.NodeReduceFn
- preemptableFns map[string]api.EvictableFn
- reclaimableFns map[string]api.EvictableFn
- overusedFns map[string]api.ValidateFn
- underUsedFns map[string]api.UnderUsedResourceFn
- jobReadyFns map[string]api.ValidateFn
- jobPipelinedFns map[string]api.VoteFn
- jobValidFns map[string]api.ValidateExFn
- jobEnqueueableFns map[string]api.VoteFn
- jobEnqueuedFns map[string]api.JobEnqueuedFn
- targetJobFns map[string]api.TargetJobFn
- reservedNodesFns map[string]api.ReservedNodesFn
- victimTasksFns map[string]api.VictimTasksFn
- jobStarvingFns map[string]api.ValidateFn
-}
-
-func openSession(cache cache.Cache) *Session {
- ssn := &Session{
- UID: uuid.NewUUID(),
- kubeClient: cache.Client(),
- cache: cache,
- informerFactory: cache.SharedInformerFactory(),
-
- TotalResource: api.EmptyResource(),
- podGroupStatus: map[api.JobID]scheduling.PodGroupStatus{},
-
- Jobs: map[api.JobID]*api.JobInfo{},
- Nodes: map[string]*api.NodeInfo{},
- RevocableNodes: map[string]*api.NodeInfo{},
- Queues: map[api.QueueID]*api.QueueInfo{},
-
- plugins: map[string]Plugin{},
- jobOrderFns: map[string]api.CompareFn{},
- queueOrderFns: map[string]api.CompareFn{},
- taskOrderFns: map[string]api.CompareFn{},
- namespaceOrderFns: map[string]api.CompareFn{},
- clusterOrderFns: map[string]api.CompareFn{},
- predicateFns: map[string]api.PredicateFn{},
- bestNodeFns: map[string]api.BestNodeFn{},
- nodeOrderFns: map[string]api.NodeOrderFn{},
- batchNodeOrderFns: map[string]api.BatchNodeOrderFn{},
- nodeMapFns: map[string]api.NodeMapFn{},
- nodeReduceFns: map[string]api.NodeReduceFn{},
- preemptableFns: map[string]api.EvictableFn{},
- reclaimableFns: map[string]api.EvictableFn{},
- overusedFns: map[string]api.ValidateFn{},
- underUsedFns: map[string]api.UnderUsedResourceFn{},
- jobReadyFns: map[string]api.ValidateFn{},
- jobPipelinedFns: map[string]api.VoteFn{},
- jobValidFns: map[string]api.ValidateExFn{},
- jobEnqueueableFns: map[string]api.VoteFn{},
- jobEnqueuedFns: map[string]api.JobEnqueuedFn{},
- targetJobFns: map[string]api.TargetJobFn{},
- reservedNodesFns: map[string]api.ReservedNodesFn{},
- victimTasksFns: map[string]api.VictimTasksFn{},
- jobStarvingFns: map[string]api.ValidateFn{},
- }
-
- snapshot := cache.Snapshot()
-
- ssn.Jobs = snapshot.Jobs
- for _, job := range ssn.Jobs {
- // only conditions will be updated periodically
- if job.PodGroup != nil && job.PodGroup.Status.Conditions != nil {
- ssn.podGroupStatus[job.UID] = *job.PodGroup.Status.DeepCopy()
- }
-
- if vjr := ssn.JobValid(job); vjr != nil {
- if !vjr.Pass {
- jc := &scheduling.PodGroupCondition{
- Type: scheduling.PodGroupUnschedulableType,
- Status: v1.ConditionTrue,
- LastTransitionTime: metav1.Now(),
- TransitionID: string(ssn.UID),
- Reason: vjr.Reason,
- Message: vjr.Message,
- }
-
- if err := ssn.UpdatePodGroupCondition(job, jc); err != nil {
- klog.Errorf("Failed to update job condition: %v", err)
- }
- }
-
- delete(ssn.Jobs, job.UID)
- }
- }
- ssn.NodeList = util.GetNodeList(snapshot.Nodes, snapshot.NodeList)
- ssn.Nodes = snapshot.Nodes
- ssn.RevocableNodes = snapshot.RevocableNodes
- ssn.Queues = snapshot.Queues
- ssn.NamespaceInfo = snapshot.NamespaceInfo
-	// calculate all nodes' resources only once in each scheduling cycle; other plugins can clone it when needed
- for _, n := range ssn.Nodes {
- ssn.TotalResource.Add(n.Allocatable)
- }
-
-	klog.V(3).Infof("Open Session %v with <%d> Jobs and <%d> Queues",
- ssn.UID, len(ssn.Jobs), len(ssn.Queues))
-
- return ssn
-}
-
-func closeSession(ssn *Session) {
- ju := newJobUpdater(ssn)
- ju.UpdateAll()
-
- ssn.Jobs = nil
- ssn.Nodes = nil
- ssn.RevocableNodes = nil
- ssn.plugins = nil
- ssn.eventHandlers = nil
- ssn.jobOrderFns = nil
- ssn.namespaceOrderFns = nil
- ssn.queueOrderFns = nil
- ssn.clusterOrderFns = nil
- ssn.NodeList = nil
- ssn.TotalResource = nil
-
- klog.V(3).Infof("Close Session %v", ssn.UID)
-}
-
-func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
- status := jobInfo.PodGroup.Status
-
- unschedulable := false
- for _, c := range status.Conditions {
- if c.Type == scheduling.PodGroupUnschedulableType &&
- c.Status == v1.ConditionTrue &&
- c.TransitionID == string(ssn.UID) {
- unschedulable = true
- break
- }
- }
-
-	// If there are running tasks and the job is unschedulable, the phase is Unknown
- if len(jobInfo.TaskStatusIndex[api.Running]) != 0 && unschedulable {
- status.Phase = scheduling.PodGroupUnknown
- } else {
- allocated := 0
- for status, tasks := range jobInfo.TaskStatusIndex {
- if api.AllocatedStatus(status) || status == api.Succeeded {
- allocated += len(tasks)
- }
- }
-
-		// If enough tasks have been allocated, the job is running
- if int32(allocated) >= jobInfo.PodGroup.Spec.MinMember {
- status.Phase = scheduling.PodGroupRunning
- } else if jobInfo.PodGroup.Status.Phase != scheduling.PodGroupInqueue {
- status.Phase = scheduling.PodGroupPending
- }
- }
-
- status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]))
- status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed]))
- status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded]))
-
- return status
-}
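-
-// Illustrative example (not part of the original source): for a PodGroup with
-// spec.minMember = 3, three tasks in Allocated/Succeeded status yield
-// PodGroupRunning; only two yield PodGroupPending unless the group was already
-// Inqueue; running tasks combined with an Unschedulable condition written by
-// this session yield PodGroupUnknown.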
-
-// Statement returns new statement object
-func (ssn *Session) Statement() *Statement {
- return &Statement{
- ssn: ssn,
- }
-}
-
-// Pipeline the task to the node in the session
-func (ssn *Session) Pipeline(task *api.TaskInfo, hostname string) error {
- // Only update status in session
- job, found := ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pipelined, ssn.UID, err)
- return err
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s when binding", task.Job)
- }
-
- task.NodeName = hostname
-
- if node, found := ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, ssn.UID, err)
- return err
- }
- klog.V(3).Infof("After added Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
- klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, ssn.UID)
- return fmt.Errorf("failed to find node %s", hostname)
- }
-
- for _, eh := range ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- return nil
-}
-
-// Allocate the task to the node in the session
-func (ssn *Session) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) error {
- podVolumes, err := ssn.cache.GetPodVolumes(task, nodeInfo.Node)
- if err != nil {
- return err
- }
-
- hostname := nodeInfo.Name
- if err := ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
- return err
- }
-
- task.Pod.Spec.NodeName = hostname
- task.PodVolumes = podVolumes
-
- // Only update status in session
- job, found := ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Allocated, ssn.UID, err)
- return err
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- task.NodeName = hostname
-
- if node, found := ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, ssn.UID, err)
- return err
- }
- klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
- klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, ssn.UID)
- return fmt.Errorf("failed to find node %s", hostname)
- }
-
- // Callbacks
- for _, eh := range ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- if ssn.JobReady(job) {
- for _, task := range job.TaskStatusIndex[api.Allocated] {
- if err := ssn.dispatch(task, podVolumes); err != nil {
- klog.Errorf("Failed to dispatch task <%v/%v>: %v",
- task.Namespace, task.Name, err)
- return err
- }
- }
- }
-
- return nil
-}
-
-func (ssn *Session) dispatch(task *api.TaskInfo, volumes *volumescheduling.PodVolumes) error {
- if err := ssn.cache.AddBindTask(task); err != nil {
- return err
- }
-
- // Update status in session
- if job, found := ssn.Jobs[task.Job]; found {
- if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Binding, ssn.UID, err)
- return err
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
- return nil
-}
-
-// Evict the task in the session
-func (ssn *Session) Evict(reclaimee *api.TaskInfo, reason string) error {
- if err := ssn.cache.Evict(reclaimee, reason); err != nil {
- return err
- }
-
- // Update status in session
- job, found := ssn.Jobs[reclaimee.Job]
- if found {
- if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, api.Releasing, ssn.UID, err)
- return err
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- reclaimee.Job, ssn.UID)
- return fmt.Errorf("failed to find job %s", reclaimee.Job)
- }
-
- // Update task in node.
- if node, found := ssn.Nodes[reclaimee.NodeName]; found {
- if err := node.UpdateTask(reclaimee); err != nil {
- klog.Errorf("Failed to update task <%v/%v> in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, ssn.UID, err)
- return err
- }
- }
-
- for _, eh := range ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: reclaimee,
- })
- }
- }
-
- return nil
-}
-
-// BindPodGroup bind PodGroup to specified cluster
-func (ssn *Session) BindPodGroup(job *api.JobInfo, cluster string) error {
- return ssn.cache.BindPodGroup(job, cluster)
-}
-
-// UpdatePodGroupCondition update job condition accordingly.
-func (ssn *Session) UpdatePodGroupCondition(jobInfo *api.JobInfo, cond *scheduling.PodGroupCondition) error {
- job, ok := ssn.Jobs[jobInfo.UID]
- if !ok {
- return fmt.Errorf("failed to find job <%s/%s>", jobInfo.Namespace, jobInfo.Name)
- }
-
- index := -1
- for i, c := range job.PodGroup.Status.Conditions {
- if c.Type == cond.Type {
- index = i
- break
- }
- }
-
- // Update condition to the new condition.
- if index < 0 {
- job.PodGroup.Status.Conditions = append(job.PodGroup.Status.Conditions, *cond)
- } else {
- job.PodGroup.Status.Conditions[index] = *cond
- }
-
- return nil
-}
-
-// AddEventHandler add event handlers
-func (ssn *Session) AddEventHandler(eh *EventHandler) {
- ssn.eventHandlers = append(ssn.eventHandlers, eh)
-}
-
-// UpdateSchedulerNumaInfo update SchedulerNumaInfo
-func (ssn *Session) UpdateSchedulerNumaInfo(AllocatedSets map[string]api.ResNumaSets) {
- ssn.cache.UpdateSchedulerNumaInfo(AllocatedSets)
-}
-
-// KubeClient returns the kubernetes client
-func (ssn Session) KubeClient() kubernetes.Interface {
- return ssn.kubeClient
-}
-
-// InformerFactory returns the scheduler SharedInformerFactory
-func (ssn Session) InformerFactory() informers.SharedInformerFactory {
- return ssn.informerFactory
-}
-
-// String returns nodes and jobs information in the session
-func (ssn Session) String() string {
- msg := fmt.Sprintf("Session %v: \n", ssn.UID)
-
- for _, job := range ssn.Jobs {
- msg = fmt.Sprintf("%s%v\n", msg, job)
- }
-
- for _, node := range ssn.Nodes {
- msg = fmt.Sprintf("%s%v\n", msg, node)
- }
-
- return msg
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/apis/pkg/apis/scheduling"
- "volcano.sh/volcano/pkg/controllers/job/helpers"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// AddJobOrderFn add job order function
-func (ssn *Session) AddJobOrderFn(name string, cf api.CompareFn) {
- ssn.jobOrderFns[name] = cf
-}
-
-// AddQueueOrderFn add queue order function
-func (ssn *Session) AddQueueOrderFn(name string, qf api.CompareFn) {
- ssn.queueOrderFns[name] = qf
-}
-
-// AddClusterOrderFn add cluster order function
-func (ssn *Session) AddClusterOrderFn(name string, qf api.CompareFn) {
- ssn.clusterOrderFns[name] = qf
-}
-
-// AddTaskOrderFn add task order function
-func (ssn *Session) AddTaskOrderFn(name string, cf api.CompareFn) {
- ssn.taskOrderFns[name] = cf
-}
-
-// AddNamespaceOrderFn add namespace order function
-func (ssn *Session) AddNamespaceOrderFn(name string, cf api.CompareFn) {
- ssn.namespaceOrderFns[name] = cf
-}
-
-// AddPreemptableFn add preemptable function
-func (ssn *Session) AddPreemptableFn(name string, cf api.EvictableFn) {
- ssn.preemptableFns[name] = cf
-}
-
-// AddReclaimableFn add Reclaimable function
-func (ssn *Session) AddReclaimableFn(name string, rf api.EvictableFn) {
- ssn.reclaimableFns[name] = rf
-}
-
-// AddJobReadyFn add JobReady function
-func (ssn *Session) AddJobReadyFn(name string, vf api.ValidateFn) {
- ssn.jobReadyFns[name] = vf
-}
-
-// AddJobPipelinedFn add pipelined function
-func (ssn *Session) AddJobPipelinedFn(name string, vf api.VoteFn) {
- ssn.jobPipelinedFns[name] = vf
-}
-
-// AddPredicateFn add Predicate function
-func (ssn *Session) AddPredicateFn(name string, pf api.PredicateFn) {
- ssn.predicateFns[name] = pf
-}
-
-// AddBestNodeFn add BestNode function
-func (ssn *Session) AddBestNodeFn(name string, pf api.BestNodeFn) {
- ssn.bestNodeFns[name] = pf
-}
-
-// AddNodeOrderFn add Node order function
-func (ssn *Session) AddNodeOrderFn(name string, pf api.NodeOrderFn) {
- ssn.nodeOrderFns[name] = pf
-}
-
-// AddBatchNodeOrderFn add Batch Node order function
-func (ssn *Session) AddBatchNodeOrderFn(name string, pf api.BatchNodeOrderFn) {
- ssn.batchNodeOrderFns[name] = pf
-}
-
-// AddNodeMapFn add Node map function
-func (ssn *Session) AddNodeMapFn(name string, pf api.NodeMapFn) {
- ssn.nodeMapFns[name] = pf
-}
-
-// AddNodeReduceFn add Node reduce function
-func (ssn *Session) AddNodeReduceFn(name string, pf api.NodeReduceFn) {
- ssn.nodeReduceFns[name] = pf
-}
-
-// AddOverusedFn add overused function
-func (ssn *Session) AddOverusedFn(name string, fn api.ValidateFn) {
- ssn.overusedFns[name] = fn
-}
-
-// AddUnderusedResourceFn add underused function
-func (ssn *Session) AddUnderusedResourceFn(name string, fn api.UnderUsedResourceFn) {
- ssn.underUsedFns[name] = fn
-}
-
-// AddJobValidFn add jobvalid function
-func (ssn *Session) AddJobValidFn(name string, fn api.ValidateExFn) {
- ssn.jobValidFns[name] = fn
-}
-
-// AddJobEnqueueableFn add jobenqueueable function
-func (ssn *Session) AddJobEnqueueableFn(name string, fn api.VoteFn) {
- ssn.jobEnqueueableFns[name] = fn
-}
-
-// AddJobEnqueuedFn add jobEnqueued function
-func (ssn *Session) AddJobEnqueuedFn(name string, fn api.JobEnqueuedFn) {
- ssn.jobEnqueuedFns[name] = fn
-}
-
-// AddTargetJobFn add targetjob function
-func (ssn *Session) AddTargetJobFn(name string, fn api.TargetJobFn) {
- ssn.targetJobFns[name] = fn
-}
-
-// AddReservedNodesFn add reservedNodesFn function
-func (ssn *Session) AddReservedNodesFn(name string, fn api.ReservedNodesFn) {
- ssn.reservedNodesFns[name] = fn
-}
-
-// AddVictimTasksFns add victimTasksFns function
-func (ssn *Session) AddVictimTasksFns(name string, fn api.VictimTasksFn) {
- ssn.victimTasksFns[name] = fn
-}
-
-// AddJobStarvingFns add jobStarvingFns function
-func (ssn *Session) AddJobStarvingFns(name string, fn api.ValidateFn) {
- ssn.jobStarvingFns[name] = fn
-}
-
-// Reclaimable invoke reclaimable function of the plugins
-func (ssn *Session) Reclaimable(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) []*api.TaskInfo {
- var victims []*api.TaskInfo
- var init bool
-
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledReclaimable) {
- continue
- }
- rf, found := ssn.reclaimableFns[plugin.Name]
- if !found {
- continue
- }
-
- candidates, abstain := rf(reclaimer, reclaimees)
- if abstain == 0 {
- continue
- }
- if len(candidates) == 0 {
- victims = nil
- break
- }
- if !init {
- victims = candidates
- init = true
- } else {
- var intersection []*api.TaskInfo
- // Get intersection of victims and candidates.
- for _, v := range victims {
- for _, c := range candidates {
- if v.UID == c.UID {
- intersection = append(intersection, v)
- }
- }
- }
-
- // Update victims to intersection
- victims = intersection
- }
- }
-		// Plugins in this tier have made a decision if victims is not nil
- if victims != nil {
- return victims
- }
- }
-
- return victims
-}
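-
-// Illustrative example (not part of the original source): if, within one tier,
-// plugin A returns candidates {t1, t2} and plugin B returns {t2, t3}, only the
-// intersection {t2} is reported as victims; a plugin that abstains is skipped,
-// and an empty candidate list from any plugin clears the victims for that tier.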
-
-// Preemptable invoke preemptable function of the plugins
-func (ssn *Session) Preemptable(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) []*api.TaskInfo {
- var victims []*api.TaskInfo
- var init bool
-
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledPreemptable) {
- continue
- }
-
- pf, found := ssn.preemptableFns[plugin.Name]
- if !found {
- continue
- }
- candidates, abstain := pf(preemptor, preemptees)
- if abstain == 0 {
- continue
- }
-			// intersection will be nil if its length is 0; no further checks are needed
- if len(candidates) == 0 {
- victims = nil
- break
- }
-
- if !init {
- victims = candidates
- init = true
- } else {
- var intersection []*api.TaskInfo
- // Get intersection of victims and candidates.
- for _, v := range victims {
- for _, c := range candidates {
- if v.UID == c.UID {
- intersection = append(intersection, v)
- }
- }
- }
-
- // Update victims to intersection
- victims = intersection
- }
- }
-		// Plugins in this tier have made a decision if victims is not nil
- if victims != nil {
- return victims
- }
- }
-
- return victims
-}
-
-// Overused invoke overused function of the plugins
-func (ssn *Session) Overused(queue *api.QueueInfo) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- of, found := ssn.overusedFns[plugin.Name]
- if !found {
- continue
- }
- if of(queue) {
- return true
- }
- }
- }
-
- return false
-}
-
-// UnderusedResources invoke underused function of the plugins
-// Returns:
-// * nil if no `UnderUsedResourceFn` is registered
-// * [] if no under-used resources
-func (ssn *Session) UnderusedResources(queue *api.QueueInfo) api.ResourceNameList {
- if len(ssn.underUsedFns) == 0 {
- return nil
- }
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- of, found := ssn.underUsedFns[plugin.Name]
- if !found {
- continue
- }
- underUsedResourceList := of(queue)
- return underUsedResourceList
- }
- }
-
- return api.ResourceNameList{}
-}
-
-// JobReady invoke jobready function of the plugins
-func (ssn *Session) JobReady(obj interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobReady) {
- continue
- }
- jrf, found := ssn.jobReadyFns[plugin.Name]
- if !found {
- continue
- }
-
- if !jrf(obj) {
- return false
- }
- }
- }
-
- return true
-}
-
-// JobPipelined invoke pipelined function of the plugins
-// Check whether the job has been given enough resources to run
-func (ssn *Session) JobPipelined(obj interface{}) bool {
- var hasFound bool
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobPipelined) {
- continue
- }
- jrf, found := ssn.jobPipelinedFns[plugin.Name]
- if !found {
- continue
- }
-
- res := jrf(obj)
- if res < 0 {
- return false
- }
- if res > 0 {
- hasFound = true
- }
- }
-		// if any plugin votes permit while the other plugins abstain,
-		// permit the job to be pipelined and do not check the next tier
- if hasFound {
- return true
- }
- }
-
- return true
-}
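-
-// Illustrative example (not part of the original source): within a tier, a
-// negative vote rejects the job immediately, a positive vote permits it and
-// skips the remaining tiers, and a zero vote abstains; if every registered
-// plugin abstains, the job is permitted by default.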
-
-// JobStarving invoke jobStarving function of the plugins
-// Check whether the job still needs more resources
-func (ssn *Session) JobStarving(obj interface{}) bool {
- var hasFound bool
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobStarving) {
- continue
- }
- jrf, found := ssn.jobStarvingFns[plugin.Name]
- if !found {
- continue
- }
- hasFound = true
-
- if !jrf(obj) {
- return false
- }
- }
-		// a plugin in this tier registered the function
- if hasFound {
- return true
- }
- }
-
- return false
-}
-
-// JobValid invoke jobvalid function of the plugins
-func (ssn *Session) JobValid(obj interface{}) *api.ValidateResult {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- jrf, found := ssn.jobValidFns[plugin.Name]
- if !found {
- continue
- }
-
- if vr := jrf(obj); vr != nil && !vr.Pass {
- return vr
- }
- }
- }
-
- return nil
-}
-
-// JobEnqueueable invoke jobEnqueueableFns function of the plugins
-func (ssn *Session) JobEnqueueable(obj interface{}) bool {
- var hasFound bool
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobEnqueued) {
- continue
- }
- fn, found := ssn.jobEnqueueableFns[plugin.Name]
- if !found {
- continue
- }
-
- res := fn(obj)
- if res < 0 {
- return false
- }
- if res > 0 {
- hasFound = true
- }
- }
-		// if any plugin votes permit while the other plugins abstain,
-		// permit the job to be enqueued and do not check the next tier
- if hasFound {
- return true
- }
- }
-
- return true
-}
-
-// JobEnqueued invoke jobEnqueuedFns function of the plugins
-func (ssn *Session) JobEnqueued(obj interface{}) {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobEnqueued) {
- continue
- }
- fn, found := ssn.jobEnqueuedFns[plugin.Name]
- if !found {
- continue
- }
-
- fn(obj)
- }
- }
-}
-
-// TargetJob invoke targetJobFns function of the plugins
-func (ssn *Session) TargetJob(jobs []*api.JobInfo) *api.JobInfo {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledTargetJob) {
- continue
- }
- fn, found := ssn.targetJobFns[plugin.Name]
- if !found {
- continue
- }
- return fn(jobs)
- }
- }
- return nil
-}
-
-// VictimTasks invoke victimTasksFns function of the plugins
-func (ssn *Session) VictimTasks() []*api.TaskInfo {
- var victims []*api.TaskInfo
- var init bool
-
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledVictim) {
- continue
- }
-
- pf, found := ssn.victimTasksFns[plugin.Name]
- if !found {
- continue
- }
- candidates := pf()
- if !init {
- victims = candidates
- init = true
- } else {
- var intersection []*api.TaskInfo
- // Get intersection of victims and candidates.
- for _, v := range victims {
- for _, c := range candidates {
- if v.UID == c.UID {
- intersection = append(intersection, v)
- }
- }
- }
-
- // Update victims to intersection
- victims = intersection
- }
- }
-		// Plugins in this tier have made a decision if victims is not nil
- if victims != nil {
- return victims
- }
- }
-
- return victims
-}
-
-// ReservedNodes invoke ReservedNodes function of the plugins
-func (ssn *Session) ReservedNodes() {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledReservedNodes) {
- continue
- }
- fn, found := ssn.reservedNodesFns[plugin.Name]
- if !found {
- continue
- }
- fn()
- }
- }
-}
-
-// JobOrderFn invoke joborder function of the plugins
-func (ssn *Session) JobOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledJobOrder) {
- continue
- }
- jof, found := ssn.jobOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := jof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
- // If no job order funcs, order job by CreationTimestamp first, then by UID.
- lv := l.(*api.JobInfo)
- rv := r.(*api.JobInfo)
- if lv.CreationTimestamp.Equal(&rv.CreationTimestamp) {
- return lv.UID < rv.UID
- }
- return lv.CreationTimestamp.Before(&rv.CreationTimestamp)
-}
-
-// NamespaceOrderFn invoke namespaceorder function of the plugins
-func (ssn *Session) NamespaceOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNamespaceOrder) {
- continue
- }
- nof, found := ssn.namespaceOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := nof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
-	// TODO(lminzhw): if all NamespaceOrderFns treat these two namespaces as the same,
-	// we should make the job order take effect across namespaces,
-	// or just schedule namespaces one by one
- lv := l.(api.NamespaceName)
- rv := r.(api.NamespaceName)
- return lv < rv
-}
-
-// ClusterOrderFn invoke ClusterOrderFn function of the plugins
-func (ssn *Session) ClusterOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledClusterOrder) {
- continue
- }
- cof, found := ssn.clusterOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := cof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
-	// If no cluster order funcs, order clusters by name
- lv := l.(*scheduling.Cluster)
- rv := r.(*scheduling.Cluster)
- return lv.Name < rv.Name
-}
-
-// QueueOrderFn invoke queueorder function of the plugins
-func (ssn *Session) QueueOrderFn(l, r interface{}) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledQueueOrder) {
- continue
- }
- qof, found := ssn.queueOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := qof(l, r); j != 0 {
- return j < 0
- }
- }
- }
-
- // If no queue order funcs, order queue by CreationTimestamp first, then by UID.
- lv := l.(*api.QueueInfo)
- rv := r.(*api.QueueInfo)
- if lv.Queue.CreationTimestamp.Equal(&rv.Queue.CreationTimestamp) {
- return lv.UID < rv.UID
- }
- return lv.Queue.CreationTimestamp.Before(&rv.Queue.CreationTimestamp)
-}
-
-// TaskCompareFns invoke taskorder function of the plugins
-func (ssn *Session) TaskCompareFns(l, r interface{}) int {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledTaskOrder) {
- continue
- }
- tof, found := ssn.taskOrderFns[plugin.Name]
- if !found {
- continue
- }
- if j := tof(l, r); j != 0 {
- return j
- }
- }
- }
-
- return 0
-}
-
-// TaskOrderFn invoke taskorder function of the plugins
-func (ssn *Session) TaskOrderFn(l, r interface{}) bool {
- if res := ssn.TaskCompareFns(l, r); res != 0 {
- return res < 0
- }
-
- // If no task order funcs, order task by default func.
- lv := l.(*api.TaskInfo)
- rv := r.(*api.TaskInfo)
- return helpers.CompareTask(lv, rv)
-}
-
-// PredicateFn invoke predicate function of the plugins
-func (ssn *Session) PredicateFn(task *api.TaskInfo, node *api.NodeInfo) error {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledPredicate) {
- continue
- }
- pfn, found := ssn.predicateFns[plugin.Name]
- if !found {
- continue
- }
- err := pfn(task, node)
- if err != nil {
- return err
- }
- }
- }
- return nil
-}
-
-// BestNodeFn invoke bestNode function of the plugins
-func (ssn *Session) BestNodeFn(task *api.TaskInfo, nodeScores map[float64][]*api.NodeInfo) *api.NodeInfo {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledBestNode) {
- continue
- }
- pfn, found := ssn.bestNodeFns[plugin.Name]
- if !found {
- continue
- }
- // Only the first plugin that enables and realizes bestNodeFn is allowed to choose best node for task
- if bestNode := pfn(task, nodeScores); bestNode != nil {
- return bestNode
- }
- }
- }
- return nil
-}
-
-// NodeOrderFn invoke node order function of the plugins
-func (ssn *Session) NodeOrderFn(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- priorityScore := 0.0
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- pfn, found := ssn.nodeOrderFns[plugin.Name]
- if !found {
- continue
- }
- score, err := pfn(task, node)
- if err != nil {
- return 0, err
- }
- priorityScore += score
- }
- }
- return priorityScore, nil
-}
-
-// BatchNodeOrderFn invoke batch node order function of the plugins
-func (ssn *Session) BatchNodeOrderFn(task *api.TaskInfo, nodes []*api.NodeInfo) (map[string]float64, error) {
- priorityScore := make(map[string]float64, len(nodes))
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- pfn, found := ssn.batchNodeOrderFns[plugin.Name]
- if !found {
- continue
- }
- score, err := pfn(task, nodes)
- if err != nil {
- return nil, err
- }
- for nodeName, score := range score {
- priorityScore[nodeName] += score
- }
- }
- }
- return priorityScore, nil
-}
-
-func isEnabled(enabled *bool) bool {
- return enabled != nil && *enabled
-}
-
-// NodeOrderMapFn invoke node order function of the plugins
-func (ssn *Session) NodeOrderMapFn(task *api.TaskInfo, node *api.NodeInfo) (map[string]float64, float64, error) {
- nodeScoreMap := map[string]float64{}
- var priorityScore float64
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- if pfn, found := ssn.nodeOrderFns[plugin.Name]; found {
- score, err := pfn(task, node)
- if err != nil {
- return nodeScoreMap, priorityScore, err
- }
- priorityScore += score
- }
- if pfn, found := ssn.nodeMapFns[plugin.Name]; found {
- score, err := pfn(task, node)
- if err != nil {
- return nodeScoreMap, priorityScore, err
- }
- nodeScoreMap[plugin.Name] = score
- }
- }
- }
- return nodeScoreMap, priorityScore, nil
-}
-
-// NodeOrderReduceFn invoke node order function of the plugins
-func (ssn *Session) NodeOrderReduceFn(task *api.TaskInfo, pluginNodeScoreMap map[string]k8sframework.NodeScoreList) (map[string]float64, error) {
- nodeScoreMap := map[string]float64{}
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if !isEnabled(plugin.EnabledNodeOrder) {
- continue
- }
- pfn, found := ssn.nodeReduceFns[plugin.Name]
- if !found {
- continue
- }
- if err := pfn(task, pluginNodeScoreMap[plugin.Name]); err != nil {
- return nodeScoreMap, err
- }
- for _, hp := range pluginNodeScoreMap[plugin.Name] {
- nodeScoreMap[hp.Name] += float64(hp.Score)
- }
- }
- }
- return nodeScoreMap, nil
-}
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package framework
-
-import (
- "fmt"
-
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-// Operation type
-type Operation int8
-
-const (
- // Evict op
- Evict = iota
- // Pipeline op
- Pipeline
- // Allocate op
- Allocate
-)
-
-type operation struct {
- name Operation
- task *api.TaskInfo
- reason string
-}
-
-// Statement structure
-type Statement struct {
- operations []operation
- ssn *Session
-}
-
-// NewStatement returns new statement object
-func NewStatement(ssn *Session) *Statement {
- return &Statement{
- ssn: ssn,
- }
-}
-
-// Evict the pod
-func (s *Statement) Evict(reclaimee *api.TaskInfo, reason string) error {
- // Update status in session
- if job, found := s.ssn.Jobs[reclaimee.Job]; found {
- if err := job.UpdateTaskStatus(reclaimee, api.Releasing); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- reclaimee.Namespace, reclaimee.Name, api.Releasing, s.ssn.UID, err)
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- reclaimee.Job, s.ssn.UID)
- }
-
- // Update task in node.
- if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
- err := node.UpdateTask(reclaimee)
- if err != nil {
- klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
- reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
- return err
- }
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: reclaimee,
- })
- }
- }
-
- s.operations = append(s.operations, operation{
- name: Evict,
- task: reclaimee,
- reason: reason,
- })
-
- return nil
-}
-
-func (s *Statement) evict(reclaimee *api.TaskInfo, reason string) error {
- if err := s.ssn.cache.Evict(reclaimee, reason); err != nil {
- if e := s.unevict(reclaimee); e != nil {
-			klog.Errorf("Failed to unevict task <%v/%v>: %v.",
- reclaimee.Namespace, reclaimee.Name, e)
- }
- return err
- }
-
- return nil
-}
-
-func (s *Statement) unevict(reclaimee *api.TaskInfo) error {
- // Update status in session
- job, found := s.ssn.Jobs[reclaimee.Job]
- if found {
- if err := job.UpdateTaskStatus(reclaimee, api.Running); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
-				reclaimee.Namespace, reclaimee.Name, api.Running, s.ssn.UID, err)
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- reclaimee.Job, s.ssn.UID)
- }
-
- // Update task in node.
- if node, found := s.ssn.Nodes[reclaimee.NodeName]; found {
- err := node.UpdateTask(reclaimee)
- if err != nil {
- klog.Errorf("Failed to update task <%v/%v> in node %v for: %s",
- reclaimee.Namespace, reclaimee.Name, reclaimee.NodeName, err.Error())
- return err
- }
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: reclaimee,
- })
- }
- }
-
- return nil
-}
-
-// Pipeline the task for the node
-func (s *Statement) Pipeline(task *api.TaskInfo, hostname string) error {
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pipelined); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pipelined, s.ssn.UID, err)
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- }
-
- task.NodeName = hostname
-
- if node, found := s.ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to pipeline task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, s.ssn.UID, err)
- }
- klog.V(3).Infof("After pipelined Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
- klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, s.ssn.UID)
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- s.operations = append(s.operations, operation{
- name: Pipeline,
- task: task,
- })
-
- return nil
-}
-
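-// pipeline is a no-op on commit: the in-session state was already updated by Pipeline.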
-func (s *Statement) pipeline(task *api.TaskInfo) {
-}
-
-func (s *Statement) unpipeline(task *api.TaskInfo) error {
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
-				task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- }
-
- if node, found := s.ssn.Nodes[task.NodeName]; found {
- if err := node.RemoveTask(task); err != nil {
-			klog.Errorf("Failed to unpipeline task <%v/%v> from node <%v> in Session <%v>: %v",
-				task.Namespace, task.Name, task.NodeName, s.ssn.UID, err)
- }
-		klog.V(3).Infof("After unpipelined Task <%v/%v> from Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
- klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- task.NodeName, s.ssn.UID)
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: task,
- })
- }
- }
- task.NodeName = ""
-
- return nil
-}
-
-// Allocate the task to node
-func (s *Statement) Allocate(task *api.TaskInfo, nodeInfo *api.NodeInfo) error {
- podVolumes, err := s.ssn.cache.GetPodVolumes(task, nodeInfo.Node)
- if err != nil {
- return err
- }
-
- hostname := nodeInfo.Name
- if err := s.ssn.cache.AllocateVolumes(task, hostname, podVolumes); err != nil {
- return err
- }
-
- task.Pod.Spec.NodeName = hostname
- task.PodVolumes = podVolumes
-
- // Only update status in session
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Allocated); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Allocated, s.ssn.UID, err)
- return err
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- task.NodeName = hostname
- if node, found := s.ssn.Nodes[hostname]; found {
- if err := node.AddTask(task); err != nil {
- klog.Errorf("Failed to add task <%v/%v> to node <%v> in Session <%v>: %v",
- task.Namespace, task.Name, hostname, s.ssn.UID, err)
- return err
- }
- klog.V(3).Infof("After allocated Task <%v/%v> to Node <%v>: idle <%v>, used <%v>, releasing <%v>",
- task.Namespace, task.Name, node.Name, node.Idle, node.Used, node.Releasing)
- } else {
- klog.Errorf("Failed to find Node <%s> in Session <%s> index when binding.",
- hostname, s.ssn.UID)
- return fmt.Errorf("failed to find node %s", hostname)
- }
-
- // Callbacks
- for _, eh := range s.ssn.eventHandlers {
- if eh.AllocateFunc != nil {
- eh.AllocateFunc(&Event{
- Task: task,
- })
- }
- }
-
- // Update status in session
- klog.V(3).Info("Allocating operations ...")
- s.operations = append(s.operations, operation{
- name: Allocate,
- task: task,
- })
-
- return nil
-}
-
-func (s *Statement) allocate(task *api.TaskInfo) error {
- if err := s.ssn.cache.AddBindTask(task); err != nil {
- return err
- }
-
- if job, found := s.ssn.Jobs[task.Job]; found {
- if err := job.UpdateTaskStatus(task, api.Binding); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Binding, s.ssn.UID, err)
- return err
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when binding.",
- task.Job, s.ssn.UID)
- return fmt.Errorf("failed to find job %s", task.Job)
- }
-
- metrics.UpdateTaskScheduleDuration(metrics.Duration(task.Pod.CreationTimestamp.Time))
- return nil
-}
-
-// unallocate the pod for task
-func (s *Statement) unallocate(task *api.TaskInfo) error {
- // Update status in session
- job, found := s.ssn.Jobs[task.Job]
- if found {
- if err := job.UpdateTaskStatus(task, api.Pending); err != nil {
- klog.Errorf("Failed to update task <%v/%v> status to %v in Session <%v>: %v",
- task.Namespace, task.Name, api.Pending, s.ssn.UID, err)
- }
- } else {
- klog.Errorf("Failed to find Job <%s> in Session <%s> index when unallocating.",
- task.Job, s.ssn.UID)
- }
-
- if node, found := s.ssn.Nodes[task.NodeName]; found {
- klog.V(3).Infof("Remove Task <%v> on node <%v>", task.Name, task.NodeName)
- err := node.RemoveTask(task)
- if err != nil {
- klog.Errorf("Failed to remove Task <%v> on node <%v>: %s", task.Name, task.NodeName, err.Error())
- }
- }
-
- for _, eh := range s.ssn.eventHandlers {
- if eh.DeallocateFunc != nil {
- eh.DeallocateFunc(&Event{
- Task: task,
- })
- }
- }
- task.NodeName = ""
-
- return nil
-}
-
-// Discard operation for evict, pipeline and allocate
-func (s *Statement) Discard() {
- klog.V(3).Info("Discarding operations ...")
- for i := len(s.operations) - 1; i >= 0; i-- {
- op := s.operations[i]
- op.task.GenerateLastTxContext()
- switch op.name {
- case Evict:
- err := s.unevict(op.task)
- if err != nil {
- klog.Errorf("Failed to unevict task: %s", err.Error())
- }
- case Pipeline:
- err := s.unpipeline(op.task)
- if err != nil {
- klog.Errorf("Failed to unpipeline task: %s", err.Error())
- }
- case Allocate:
- err := s.unallocate(op.task)
- if err != nil {
- klog.Errorf("Failed to unallocate task: %s", err.Error())
- }
- }
- }
-}
-
-// Commit operation for evict, pipeline and allocate
-func (s *Statement) Commit() {
- klog.V(3).Info("Committing operations ...")
- for _, op := range s.operations {
- op.task.ClearLastTxContext()
- switch op.name {
- case Evict:
- err := s.evict(op.task, op.reason)
- if err != nil {
- klog.Errorf("Failed to evict task: %s", err.Error())
- }
- case Pipeline:
- s.pipeline(op.task)
- case Allocate:
- err := s.allocate(op.task)
- if err != nil {
-				klog.Errorf("Failed to allocate task: %s", err.Error())
- }
- }
- }
-}
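-
-// Illustrative usage sketch (not part of the original source): an action would
-// typically record tentative operations on a Statement and then either commit
-// them or roll them back, e.g.
-//
-//	stmt := framework.NewStatement(ssn)
-//	if err := stmt.Allocate(task, node); err == nil && ssn.JobReady(job) {
-//		stmt.Commit()
-//	} else {
-//		stmt.Discard()
-//	}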
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package binpack
-
-import (
- "fmt"
- "strings"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin.
- PluginName = "binpack"
-)
-
-const (
- // BinpackWeight is the key for providing Binpack Priority Weight in YAML
- BinpackWeight = "binpack.weight"
- // BinpackCPU is the key for weight of cpu
- BinpackCPU = "binpack.cpu"
- // BinpackMemory is the key for weight of memory
- BinpackMemory = "binpack.memory"
-
- // BinpackResources is the key for additional resource key name
- BinpackResources = "binpack.resources"
- // BinpackResourcesPrefix is the key prefix for additional resource key name
- BinpackResourcesPrefix = BinpackResources + "."
-
- resourceFmt = "%s[%d]"
-)
-
-type priorityWeight struct {
- BinPackingWeight int
- BinPackingCPU int
- BinPackingMemory int
- BinPackingResources map[v1.ResourceName]int
-}
-
-func (w *priorityWeight) String() string {
- length := 3
- if extendLength := len(w.BinPackingResources); extendLength == 0 {
- length++
- } else {
- length += extendLength
- }
- msg := make([]string, 0, length)
- msg = append(msg,
- fmt.Sprintf(resourceFmt, BinpackWeight, w.BinPackingWeight),
- fmt.Sprintf(resourceFmt, BinpackCPU, w.BinPackingCPU),
- fmt.Sprintf(resourceFmt, BinpackMemory, w.BinPackingMemory),
- )
-
- if len(w.BinPackingResources) == 0 {
- msg = append(msg, "no extend resources.")
- } else {
- for name, weight := range w.BinPackingResources {
- msg = append(msg, fmt.Sprintf(resourceFmt, name, weight))
- }
- }
- return strings.Join(msg, ", ")
-}
-
-type binpackPlugin struct {
- // Arguments given for the plugin
- weight priorityWeight
-}
-
-// New returns a binpackPlugin object
-func New(arguments framework.Arguments) framework.Plugin {
-	weight := calculateWeight(arguments)
- return &binpackPlugin{weight: weight}
-}
-
-func calculateWeight(args framework.Arguments) priorityWeight {
- /*
-	   Users should provide priorityWeight in this format (binpack.weight, binpack.cpu, binpack.memory).
-	   The weights of cpu, memory and additional resources can be changed via arguments.
-
- actions: "enqueue, reclaim, allocate, backfill, preempt"
- tiers:
- - plugins:
- - name: binpack
- arguments:
- binpack.weight: 10
- binpack.cpu: 5
- binpack.memory: 1
- binpack.resources: nvidia.com/gpu, example.com/foo
- binpack.resources.nvidia.com/gpu: 2
- binpack.resources.example.com/foo: 3
- */
- // Values are initialized to 1.
- weight := priorityWeight{
- BinPackingWeight: 1,
- BinPackingCPU: 1,
- BinPackingMemory: 1,
- BinPackingResources: make(map[v1.ResourceName]int),
- }
-
- // Checks whether binpack.weight is provided or not, if given, modifies the value in weight struct.
- args.GetInt(&weight.BinPackingWeight, BinpackWeight)
- // Checks whether binpack.cpu is provided or not, if given, modifies the value in weight struct.
- args.GetInt(&weight.BinPackingCPU, BinpackCPU)
- if weight.BinPackingCPU < 0 {
- weight.BinPackingCPU = 1
- }
- // Checks whether binpack.memory is provided or not, if given, modifies the value in weight struct.
- args.GetInt(&weight.BinPackingMemory, BinpackMemory)
- if weight.BinPackingMemory < 0 {
- weight.BinPackingMemory = 1
- }
-
- resourcesStr := args[BinpackResources]
- resources := strings.Split(resourcesStr, ",")
- for _, resource := range resources {
- resource = strings.TrimSpace(resource)
- if resource == "" {
- continue
- }
-
- // binpack.resources.[ResourceName]
- resourceKey := BinpackResourcesPrefix + resource
- resourceWeight := 1
- args.GetInt(&resourceWeight, resourceKey)
- if resourceWeight < 0 {
- resourceWeight = 1
- }
- weight.BinPackingResources[v1.ResourceName(resource)] = resourceWeight
- }
-
- return weight
-}
-
-func (bp *binpackPlugin) Name() string {
- return PluginName
-}
-
-func (bp *binpackPlugin) OnSessionOpen(ssn *framework.Session) {
- klog.V(4).Infof("Enter binpack plugin ...")
- if klog.V(4) {
- defer func() {
- klog.V(4).Infof("Leaving binpack plugin. %s ...", bp.weight.String())
- }()
-
- notFoundResource := []string{}
- for resource := range bp.weight.BinPackingResources {
- found := false
- for _, nodeInfo := range ssn.Nodes {
- if nodeInfo.Allocatable.Get(resource) > 0 {
- found = true
- break
- }
- }
- if !found {
- notFoundResource = append(notFoundResource, string(resource))
- }
- }
- klog.V(4).Infof("resources [%s] record in weight but not found on any node", strings.Join(notFoundResource, ", "))
- }
-
- nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- binPackingScore := BinPackingScore(task, node, bp.weight)
-
- klog.V(4).Infof("Binpack score for Task %s/%s on node %s is: %v", task.Namespace, task.Name, node.Name, binPackingScore)
- return binPackingScore, nil
- }
- if bp.weight.BinPackingWeight != 0 {
- ssn.AddNodeOrderFn(bp.Name(), nodeOrderFn)
- } else {
- klog.Infof("binpack weight is zero, skip node order function")
- }
-}
-
-func (bp *binpackPlugin) OnSessionClose(ssn *framework.Session) {
-}
-
-// BinPackingScore uses the best-fit policies during scheduling.
-// Goals:
-// - Schedule Jobs using BestFit Policy using Resource Bin Packing Priority Function
-// - Reduce Fragmentation of scarce resources on the Cluster
-func BinPackingScore(task *api.TaskInfo, node *api.NodeInfo, weight priorityWeight) float64 {
- score := 0.0
- weightSum := 0
- requested := task.Resreq
- allocatable := node.Allocatable
- used := node.Used
-
- for _, resource := range requested.ResourceNames() {
- request := requested.Get(resource)
- if request == 0 {
- continue
- }
- allocate := allocatable.Get(resource)
- nodeUsed := used.Get(resource)
-
- resourceWeight := 0
- found := false
- switch resource {
- case v1.ResourceCPU:
- resourceWeight = weight.BinPackingCPU
- found = true
- case v1.ResourceMemory:
- resourceWeight = weight.BinPackingMemory
- found = true
- default:
- resourceWeight, found = weight.BinPackingResources[resource]
- }
- if !found {
- continue
- }
-
- resourceScore := ResourceBinPackingScore(request, allocate, nodeUsed, resourceWeight)
- klog.V(5).Infof("task %s/%s on node %s resource %s, need %f, used %f, allocatable %f, weight %d, score %f", task.Namespace, task.Name, node.Name, resource, request, nodeUsed, allocate, resourceWeight, resourceScore)
-
- score += resourceScore
- weightSum += resourceWeight
- }
-
-	// map the result from [0, weightSum] to [0, MaxNodeScore * BinPackingWeight]
- if weightSum > 0 {
- score /= float64(weightSum)
- }
- score *= float64(v1alpha1.MaxNodeScore * int64(weight.BinPackingWeight))
-
- return score
-}
-
-// ResourceBinPackingScore calculates the binpack score for a resource with the provided info
-func ResourceBinPackingScore(requested, capacity, used float64, weight int) float64 {
- if capacity == 0 || weight == 0 {
- return 0
- }
-
- usedFinally := requested + used
- if usedFinally > capacity {
- return 0
- }
-
- score := usedFinally * float64(weight) / capacity
- return score
-}
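-
-// Illustrative example (not part of the original source): with requested = 2,
-// used = 4, capacity = 8 and weight = 5, the score is (2+4)*5/8 = 3.75; a
-// request that would push usage past capacity, or a zero capacity/weight,
-// scores 0. BinPackingScore then divides the summed per-resource scores by the
-// weight sum and scales by MaxNodeScore * binpack.weight.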
-
-
-
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package drf
-
-import (
- "fmt"
- "math"
- "strconv"
- "strings"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/api/helpers"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
- "volcano.sh/volcano/pkg/scheduler/plugins/util"
-)
-
-// PluginName indicates name of volcano scheduler plugin.
-const PluginName = "drf"
-
-var shareDelta = 0.000001
-
-// hierarchicalNode represents the node hierarchy
-// and the corresponding weight and drf attribute
-type hierarchicalNode struct {
- parent *hierarchicalNode
- attr *drfAttr
- // If the node is a leaf node,
- // request represents the request of the job.
- request *api.Resource
- weight float64
- saturated bool
- hierarchy string
- children map[string]*hierarchicalNode
-}
-
-func (node *hierarchicalNode) Clone(parent *hierarchicalNode) *hierarchicalNode {
- newNode := &hierarchicalNode{
- parent: parent,
- attr: &drfAttr{
- share: node.attr.share,
- dominantResource: node.attr.dominantResource,
- allocated: node.attr.allocated.Clone(),
- },
- request: node.request.Clone(),
- weight: node.weight,
- saturated: node.saturated,
- hierarchy: node.hierarchy,
- children: nil,
- }
- if node.children != nil {
- newNode.children = map[string]*hierarchicalNode{}
- for _, child := range node.children {
- newNode.children[child.hierarchy] = child.Clone(newNode)
- }
- }
- return newNode
-}
-
-// resourceSaturated returns true if any resource requested by the job is already fully allocated,
-// or the job requests a resource that is not in the demanding set
-func resourceSaturated(allocated *api.Resource,
- jobRequest *api.Resource, demandingResources map[v1.ResourceName]bool) bool {
- for _, rn := range allocated.ResourceNames() {
- if allocated.Get(rn) != 0 && jobRequest.Get(rn) != 0 &&
- allocated.Get(rn) >= jobRequest.Get(rn) {
- return true
- }
- if !demandingResources[rn] && jobRequest.Get(rn) != 0 {
- return true
- }
- }
- return false
-}
-
-type drfAttr struct {
- share float64
- dominantResource string
- allocated *api.Resource
-}
-
-func (attr *drfAttr) String() string {
- return fmt.Sprintf("dominant resource <%s>, dominant share %f, allocated %s",
- attr.dominantResource, attr.share, attr.allocated)
-}
-
-type drfPlugin struct {
- totalResource *api.Resource
- totalAllocated *api.Resource
-
- // Key is Job ID
- jobAttrs map[api.JobID]*drfAttr
-
- // map[namespaceName]->attr
- namespaceOpts map[string]*drfAttr
-
- // hierarchical tree root
- hierarchicalRoot *hierarchicalNode
-
- // Arguments given for the plugin
- pluginArguments framework.Arguments
-}
-
-// New return drf plugin
-func New(arguments framework.Arguments) framework.Plugin {
- return &drfPlugin{
- totalResource: api.EmptyResource(),
- totalAllocated: api.EmptyResource(),
- jobAttrs: map[api.JobID]*drfAttr{},
- namespaceOpts: map[string]*drfAttr{},
- hierarchicalRoot: &hierarchicalNode{
- attr: &drfAttr{allocated: api.EmptyResource()},
- request: api.EmptyResource(),
- hierarchy: "root",
- weight: 1,
- children: map[string]*hierarchicalNode{},
- },
- pluginArguments: arguments,
- }
-}
-
-func (drf *drfPlugin) Name() string {
- return PluginName
-}
-
-// HierarchyEnabled returns whether hierarchy is enabled
-func (drf *drfPlugin) HierarchyEnabled(ssn *framework.Session) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if plugin.Name != PluginName {
- continue
- }
- return plugin.EnabledHierarchy != nil && *plugin.EnabledHierarchy
- }
- }
- return false
-}
-
-// NamespaceOrderEnabled returns whether NamespaceOrder is enabled for this plugin in this session
-func (drf *drfPlugin) NamespaceOrderEnabled(ssn *framework.Session) bool {
- for _, tier := range ssn.Tiers {
- for _, plugin := range tier.Plugins {
- if plugin.Name != PluginName {
- continue
- }
- return plugin.EnabledNamespaceOrder != nil && *plugin.EnabledNamespaceOrder
- }
- }
- return false
-}
-
-func (drf *drfPlugin) compareQueues(root *hierarchicalNode, lqueue *api.QueueInfo, rqueue *api.QueueInfo) float64 {
- lnode := root
- lpaths := strings.Split(lqueue.Hierarchy, "/")
- rnode := root
- rpaths := strings.Split(rqueue.Hierarchy, "/")
- depth := 0
- if len(lpaths) < len(rpaths) {
- depth = len(lpaths)
- } else {
- depth = len(rpaths)
- }
- for i := 0; i < depth; i++ {
-		// Saturated nodes have minimum priority,
-		// so that demanding nodes will be popped first.
- if !lnode.saturated && rnode.saturated {
- return -1
- }
- if lnode.saturated && !rnode.saturated {
- return 1
- }
- if lnode.attr.share/lnode.weight == rnode.attr.share/rnode.weight {
- if i < depth-1 {
- lnode = lnode.children[lpaths[i+1]]
- rnode = rnode.children[rpaths[i+1]]
- }
- } else {
- return lnode.attr.share/lnode.weight - rnode.attr.share/rnode.weight
- }
- }
- return 0
-}
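-
-// Illustrative example (not part of the original source): when comparing queues
-// with hierarchies "root/eng" and "root/sci", the walk starts at the shared
-// root; at each level a saturated node sorts after an unsaturated one so that
-// demanding queues are popped first, and if the weighted shares (share/weight)
-// are equal the comparison descends one level deeper.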
-
-func (drf *drfPlugin) OnSessionOpen(ssn *framework.Session) {
- // Prepare scheduling data for this session.
- drf.totalResource.Add(ssn.TotalResource)
-
- klog.V(4).Infof("Total Allocatable %s", drf.totalResource)
-
- namespaceOrderEnabled := drf.NamespaceOrderEnabled(ssn)
- hierarchyEnabled := drf.HierarchyEnabled(ssn)
-
- for _, job := range ssn.Jobs {
- attr := &drfAttr{
- allocated: api.EmptyResource(),
- }
-
- for status, tasks := range job.TaskStatusIndex {
- if api.AllocatedStatus(status) {
- for _, t := range tasks {
- attr.allocated.Add(t.Resreq)
- }
- }
- }
-
-		// Calculate the initial share of the job
- drf.updateJobShare(job.Namespace, job.Name, attr)
-
- drf.jobAttrs[job.UID] = attr
-
- if namespaceOrderEnabled {
- nsOpts, found := drf.namespaceOpts[job.Namespace]
- if !found {
- nsOpts = &drfAttr{
- allocated: api.EmptyResource(),
- }
- drf.namespaceOpts[job.Namespace] = nsOpts
- }
-			// all tasks in a job should have the same namespace as the job
- nsOpts.allocated.Add(attr.allocated)
- drf.updateNamespaceShare(job.Namespace, nsOpts)
- }
- if hierarchyEnabled {
- queue := ssn.Queues[job.Queue]
- drf.totalAllocated.Add(attr.allocated)
- drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
- }
- }
-
- preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
- var victims []*api.TaskInfo
-
- addVictim := func(candidate *api.TaskInfo) {
- victims = append(victims, candidate)
- }
-
- if namespaceOrderEnabled {
-			// apply the namespace share policy on preemptees first
-
- lWeight := ssn.NamespaceInfo[api.NamespaceName(preemptor.Namespace)].GetWeight()
- lNsAtt := drf.namespaceOpts[preemptor.Namespace]
- lNsAlloc := lNsAtt.allocated.Clone().Add(preemptor.Resreq)
- _, lNsShare := drf.calculateShare(lNsAlloc, drf.totalResource)
- lNsShareWeighted := lNsShare / float64(lWeight)
-
- namespaceAllocation := map[string]*api.Resource{}
-
-			// undecidedPreemptees holds preemptees this policy could not judge as preemptable or not,
-			// leaving them to the next policy
- undecidedPreemptees := []*api.TaskInfo{}
-
- for _, preemptee := range preemptees {
- if preemptor.Namespace == preemptee.Namespace {
- // policy is disabled when they are in the same namespace
- undecidedPreemptees = append(undecidedPreemptees, preemptee)
- continue
- }
-
- // compute the preemptee namespace weighted share after preemption
- nsAllocation, found := namespaceAllocation[preemptee.Namespace]
- if !found {
- rNsAtt := drf.namespaceOpts[preemptee.Namespace]
- nsAllocation = rNsAtt.allocated.Clone()
- namespaceAllocation[preemptee.Namespace] = nsAllocation
- }
- rWeight := ssn.NamespaceInfo[api.NamespaceName(preemptee.Namespace)].GetWeight()
- rNsAlloc := nsAllocation.Sub(preemptee.Resreq)
- _, rNsShare := drf.calculateShare(rNsAlloc, drf.totalResource)
- rNsShareWeighted := rNsShare / float64(rWeight)
-
- // to avoid ping pong actions, the preemptee namespace should
- // have the higher weighted share after preemption.
- if lNsShareWeighted < rNsShareWeighted {
- addVictim(preemptee)
- continue
- }
- if lNsShareWeighted-rNsShareWeighted > shareDelta {
- continue
- }
-
-				// equal namespace order falls through to the job order judgement
- undecidedPreemptees = append(undecidedPreemptees, preemptee)
- }
-
- preemptees = undecidedPreemptees
- }
-
- latt := drf.jobAttrs[preemptor.Job]
- lalloc := latt.allocated.Clone().Add(preemptor.Resreq)
- _, ls := drf.calculateShare(lalloc, drf.totalResource)
-
- allocations := map[api.JobID]*api.Resource{}
-
- for _, preemptee := range preemptees {
- if _, found := allocations[preemptee.Job]; !found {
- ratt := drf.jobAttrs[preemptee.Job]
- allocations[preemptee.Job] = ratt.allocated.Clone()
- }
- ralloc := allocations[preemptee.Job].Sub(preemptee.Resreq)
- _, rs := drf.calculateShare(ralloc, drf.totalResource)
-
- if ls < rs || math.Abs(ls-rs) <= shareDelta {
- addVictim(preemptee)
- }
- }
-
- klog.V(4).Infof("Victims from DRF plugins are %+v", victims)
-
- return victims, util.Permit
- }
-
- ssn.AddPreemptableFn(drf.Name(), preemptableFn)
-
- if hierarchyEnabled {
- queueOrderFn := func(l interface{}, r interface{}) int {
- lv := l.(*api.QueueInfo)
- rv := r.(*api.QueueInfo)
- ret := drf.compareQueues(drf.hierarchicalRoot, lv, rv)
- if ret < 0 {
- return -1
- }
- if ret > 0 {
- return 1
- }
- return 0
- }
- ssn.AddQueueOrderFn(drf.Name(), queueOrderFn)
-
- reclaimFn := func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) {
- var victims []*api.TaskInfo
- // clone hdrf tree
- totalAllocated := drf.totalAllocated.Clone()
- root := drf.hierarchicalRoot.Clone(nil)
-
- // update reclaimer hdrf
- ljob := ssn.Jobs[reclaimer.Job]
- lqueue := ssn.Queues[ljob.Queue]
- ljob = ljob.Clone()
- attr := drf.jobAttrs[ljob.UID]
- lattr := &drfAttr{
- allocated: attr.allocated.Clone(),
- }
- lattr.allocated.Add(reclaimer.Resreq)
- totalAllocated.Add(reclaimer.Resreq)
- drf.updateShare(lattr)
- drf.UpdateHierarchicalShare(root, totalAllocated, ljob, lattr, lqueue.Hierarchy, lqueue.Weights)
-
- for _, preemptee := range reclaimees {
- rjob := ssn.Jobs[preemptee.Job]
- rqueue := ssn.Queues[rjob.Queue]
-
- // update hdrf of reclaimee job
- totalAllocated.Sub(preemptee.Resreq)
- rjob = rjob.Clone()
- attr := drf.jobAttrs[rjob.UID]
- rattr := &drfAttr{
- allocated: attr.allocated.Clone(),
- }
- rattr.allocated.Sub(preemptee.Resreq)
- drf.updateShare(rattr)
- drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
-
- // compare hdrf of queues
- ret := drf.compareQueues(root, lqueue, rqueue)
-
- // resume hdrf of reclaimee job
- totalAllocated.Add(preemptee.Resreq)
- rattr.allocated.Add(preemptee.Resreq)
- drf.updateShare(rattr)
- drf.UpdateHierarchicalShare(root, totalAllocated, rjob, rattr, rqueue.Hierarchy, rqueue.Weights)
-
- if ret < 0 {
- victims = append(victims, preemptee)
- }
-
- if ret > shareDelta {
- continue
- }
- }
-
-			klog.V(4).Infof("Victims from HDRF plugin are %+v", victims)
-
- return victims, util.Permit
- }
- ssn.AddReclaimableFn(drf.Name(), reclaimFn)
- }
-
- jobOrderFn := func(l interface{}, r interface{}) int {
- lv := l.(*api.JobInfo)
- rv := r.(*api.JobInfo)
-
- klog.V(4).Infof("DRF JobOrderFn: <%v/%v> share state: %v, <%v/%v> share state: %v",
- lv.Namespace, lv.Name, drf.jobAttrs[lv.UID].share, rv.Namespace, rv.Name, drf.jobAttrs[rv.UID].share)
-
- if drf.jobAttrs[lv.UID].share == drf.jobAttrs[rv.UID].share {
- return 0
- }
-
- if drf.jobAttrs[lv.UID].share < drf.jobAttrs[rv.UID].share {
- return -1
- }
-
- return 1
- }
-
- ssn.AddJobOrderFn(drf.Name(), jobOrderFn)
-
- namespaceOrderFn := func(l interface{}, r interface{}) int {
- lv := l.(api.NamespaceName)
- rv := r.(api.NamespaceName)
-
- lOpt := drf.namespaceOpts[string(lv)]
- rOpt := drf.namespaceOpts[string(rv)]
-
- lWeight := ssn.NamespaceInfo[lv].GetWeight()
- rWeight := ssn.NamespaceInfo[rv].GetWeight()
-
- klog.V(4).Infof("DRF NamespaceOrderFn: <%v> share state: %f, weight %v, <%v> share state: %f, weight %v",
- lv, lOpt.share, lWeight, rv, rOpt.share, rWeight)
-
- lWeightedShare := lOpt.share / float64(lWeight)
- rWeightedShare := rOpt.share / float64(rWeight)
-
- metrics.UpdateNamespaceWeight(string(lv), lWeight)
- metrics.UpdateNamespaceWeight(string(rv), rWeight)
- metrics.UpdateNamespaceWeightedShare(string(lv), lWeightedShare)
- metrics.UpdateNamespaceWeightedShare(string(rv), rWeightedShare)
-
- if lWeightedShare == rWeightedShare {
- return 0
- }
-
- if lWeightedShare < rWeightedShare {
- return -1
- }
-
- return 1
- }
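-
-	// Worked example of the weighted-share comparison above (illustrative sketch
-	// only): a namespace with share 0.6 and weight 2 has weighted share 0.3,
-	// while a namespace with share 0.4 and weight 1 has weighted share 0.4, so
-	// the first namespace is ordered ahead of the second.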
-
- if namespaceOrderEnabled {
- ssn.AddNamespaceOrderFn(drf.Name(), namespaceOrderFn)
- }
-
- // Register event handlers.
- ssn.AddEventHandler(&framework.EventHandler{
- AllocateFunc: func(event *framework.Event) {
- attr := drf.jobAttrs[event.Task.Job]
- attr.allocated.Add(event.Task.Resreq)
-
- job := ssn.Jobs[event.Task.Job]
- drf.updateJobShare(job.Namespace, job.Name, attr)
-
- nsShare := -1.0
- if namespaceOrderEnabled {
- nsOpt := drf.namespaceOpts[event.Task.Namespace]
- nsOpt.allocated.Add(event.Task.Resreq)
-
- drf.updateNamespaceShare(event.Task.Namespace, nsOpt)
- nsShare = nsOpt.share
- }
- if hierarchyEnabled {
- queue := ssn.Queues[job.Queue]
-
- drf.totalAllocated.Add(event.Task.Resreq)
- drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
- }
-
- klog.V(4).Infof("DRF AllocateFunc: task <%v/%v>, resreq <%v>, share <%v>, namespace share <%v>",
- event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
- },
- DeallocateFunc: func(event *framework.Event) {
- attr := drf.jobAttrs[event.Task.Job]
- attr.allocated.Sub(event.Task.Resreq)
-
- job := ssn.Jobs[event.Task.Job]
- drf.updateJobShare(job.Namespace, job.Name, attr)
-
- nsShare := -1.0
- if namespaceOrderEnabled {
- nsOpt := drf.namespaceOpts[event.Task.Namespace]
- nsOpt.allocated.Sub(event.Task.Resreq)
-
- drf.updateNamespaceShare(event.Task.Namespace, nsOpt)
- nsShare = nsOpt.share
- }
-
- if hierarchyEnabled {
- queue := ssn.Queues[job.Queue]
- drf.totalAllocated.Sub(event.Task.Resreq)
- drf.UpdateHierarchicalShare(drf.hierarchicalRoot, drf.totalAllocated, job, attr, queue.Hierarchy, queue.Weights)
- }
-
- klog.V(4).Infof("DRF EvictFunc: task <%v/%v>, resreq <%v>, share <%v>, namespace share <%v>",
- event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share, nsShare)
- },
- })
-}
-
-func (drf *drfPlugin) updateNamespaceShare(namespaceName string, attr *drfAttr) {
- drf.updateShare(attr)
- metrics.UpdateNamespaceShare(namespaceName, attr.share)
-}
-
-// buildHierarchy adds the job to the hierarchy, creating any missing intermediate nodes
-func (drf *drfPlugin) buildHierarchy(root *hierarchicalNode, job *api.JobInfo, attr *drfAttr,
- hierarchy, hierarchicalWeights string) {
- inode := root
- paths := strings.Split(hierarchy, "/")
- weights := strings.Split(hierarchicalWeights, "/")
-
- for i := 1; i < len(paths); i++ {
- if child, ok := inode.children[paths[i]]; ok {
- inode = child
- } else {
- fweight, _ := strconv.ParseFloat(weights[i], 64)
- if fweight < 1 {
- fweight = 1
- }
- child = &hierarchicalNode{
- weight: fweight,
- hierarchy: paths[i],
- request: api.EmptyResource(),
- attr: &drfAttr{
- allocated: api.EmptyResource(),
- },
- children: make(map[string]*hierarchicalNode),
- }
- klog.V(4).Infof("Node %s added to %s, weight %f",
- child.hierarchy, inode.hierarchy, fweight)
- inode.children[paths[i]] = child
- child.parent = inode
- inode = child
- }
- }
-
- child := &hierarchicalNode{
- weight: 1,
- attr: attr,
- hierarchy: string(job.UID),
- request: job.TotalRequest.Clone(),
- children: nil,
- }
- inode.children[string(job.UID)] = child
- // update drf attribute bottom up
- klog.V(4).Infof("Job <%s/%s> added to %s, weights %s, attr %v, total request: %s",
- job.Namespace, job.Name, inode.hierarchy, hierarchicalWeights, child.attr, job.TotalRequest)
-}
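-
-// Illustrative sketch of buildHierarchy (assumed values, not taken from the
-// original source): for a queue with hierarchy "root/sci/dev" and hierarchical
-// weights "1/2/1", the loop above skips "root" (the tree root), creates or
-// reuses the intermediate nodes "sci" (weight 2) and "dev" (weight 1), and then
-// attaches a leaf keyed by the job UID that carries the job's drfAttr and total request.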
-
-// updateHierarchicalShare updates the node attributes recursively
-func (drf *drfPlugin) updateHierarchicalShare(node *hierarchicalNode,
- demandingResources map[v1.ResourceName]bool) {
- if node.children == nil {
- node.saturated = resourceSaturated(node.attr.allocated,
- node.request, demandingResources)
- klog.V(4).Infof("Update hierarchical node %s, share %f, dominant %s, resource %v, saturated: %t",
- node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
- } else {
- var mdr float64 = 1
-		// get the minimum dominant resource share
- for _, child := range node.children {
- drf.updateHierarchicalShare(child, demandingResources)
-			// skip empty and saturated children
- if child.attr.share != 0 && !child.saturated {
- _, resShare := drf.calculateShare(child.attr.allocated, drf.totalResource)
- if resShare < mdr {
- mdr = resShare
- }
- }
- }
-
- node.attr.allocated = api.EmptyResource()
- saturated := true
- for _, child := range node.children {
- if !child.saturated {
- saturated = false
- }
- // only consider non-empty children
- if child.attr.share != 0 {
- // saturated child is not scaled
- if child.saturated {
- t := child.attr.allocated
- node.attr.allocated.Add(t)
- } else {
- t := child.attr.allocated.Clone().Multi(mdr / child.attr.share)
- node.attr.allocated.Add(t)
- }
- }
- }
- node.attr.dominantResource, node.attr.share = drf.calculateShare(
- node.attr.allocated, drf.totalResource)
- node.saturated = saturated
- klog.V(4).Infof("Update hierarchical node %s, share %f, dominant resource %s, resource %v, saturated: %t",
- node.hierarchy, node.attr.share, node.attr.dominantResource, node.attr.allocated, node.saturated)
- }
-}
-
-func (drf *drfPlugin) UpdateHierarchicalShare(root *hierarchicalNode, totalAllocated *api.Resource, job *api.JobInfo, attr *drfAttr, hierarchy, hierarchicalWeights string) {
- // filter out demanding resources
- demandingResources := map[v1.ResourceName]bool{}
- for _, rn := range drf.totalResource.ResourceNames() {
- if totalAllocated.Get(rn) < drf.totalResource.Get(rn) {
- demandingResources[rn] = true
- }
- }
- drf.buildHierarchy(root, job, attr, hierarchy, hierarchicalWeights)
- drf.updateHierarchicalShare(root, demandingResources)
-}
-
-func (drf *drfPlugin) updateJobShare(jobNs, jobName string, attr *drfAttr) {
- drf.updateShare(attr)
- metrics.UpdateJobShare(jobNs, jobName, attr.share)
-}
-
-func (drf *drfPlugin) updateShare(attr *drfAttr) {
- attr.dominantResource, attr.share = drf.calculateShare(attr.allocated, drf.totalResource)
-}
-
-func (drf *drfPlugin) calculateShare(allocated, totalResource *api.Resource) (string, float64) {
- res := float64(0)
- dominantResource := ""
- for _, rn := range totalResource.ResourceNames() {
- share := helpers.Share(allocated.Get(rn), totalResource.Get(rn))
- if share > res {
- res = share
- dominantResource = string(rn)
- }
- }
-
- return dominantResource, res
-}
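-
-// exampleDominantShare is an illustrative sketch only (it is not called by the
-// plugin): it shows how calculateShare picks the dominant resource. With 2 CPUs
-// and 4Gi allocated out of a 10-CPU / 8Gi total, the CPU share is 2000/10000 = 0.2
-// and the memory share is 4Gi/8Gi = 0.5, so ("memory", 0.5) is returned.
-func (drf *drfPlugin) exampleDominantShare() (string, float64) {
-	allocated := &api.Resource{MilliCPU: 2000, Memory: 4 * 1024 * 1024 * 1024}
-	total := &api.Resource{MilliCPU: 10000, Memory: 8 * 1024 * 1024 * 1024}
-	return drf.calculateShare(allocated, total)
-}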
-
-func (drf *drfPlugin) OnSessionClose(session *framework.Session) {
- // Clean schedule data.
- drf.totalResource = api.EmptyResource()
- drf.totalAllocated = api.EmptyResource()
- drf.jobAttrs = map[api.JobID]*drfAttr{}
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import (
- v1 "k8s.io/api/core/v1"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
- "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// TopologyHint is a struct containing the NUMANodeAffinity for a Container
-type TopologyHint struct {
- NUMANodeAffinity bitmask.BitMask
- // Preferred is set to true when the NUMANodeAffinity encodes a preferred
- // allocation for the Container. It is set to false otherwise.
- Preferred bool
-}
-
-// Policy is an interface for topology manager policy
-type Policy interface {
-	// Predicate returns the best-fit hint and whether it can be admitted.
- Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
-}
-
-// HintProvider is an interface for components that want to collaborate to
-// achieve globally optimal concrete resource alignment with respect to
-// NUMA locality.
-type HintProvider interface {
- // Name returns provider name used for register and logging.
- Name() string
-	// GetTopologyHints returns the topology hints if this hint provider has a preference.
-	GetTopologyHints(container *v1.Container, topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string][]TopologyHint
-	// Allocate returns the resource assignment for the container based on the best hint.
-	Allocate(container *v1.Container, bestHit *TopologyHint, topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string]cpuset.CPUSet
-}
-
-// GetPolicy returns the policy that matches the node's topology manager configuration
-func GetPolicy(node *api.NodeInfo, numaNodes []int) Policy {
- switch batch.NumaPolicy(node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy]) {
- case batch.None:
- return NewPolicyNone(numaNodes)
- case batch.BestEffort:
- return NewPolicyBestEffort(numaNodes)
- case batch.Restricted:
- return NewPolicyRestricted(numaNodes)
- case batch.SingleNumaNode:
- return NewPolicySingleNumaNode(numaNodes)
- }
-
- return &policyNone{}
-}
-
-// AccumulateProvidersHints collects the TopologyHint slices from all hint providers
-func AccumulateProvidersHints(container *v1.Container,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets,
- hintProviders []HintProvider) (providersHints []map[string][]TopologyHint) {
- for _, provider := range hintProviders {
- hints := provider.GetTopologyHints(container, topoInfo, resNumaSets)
- providersHints = append(providersHints, hints)
- }
-
- return providersHints
-}
-
-// Allocate merges the resource assignments from all hint providers
-func Allocate(container *v1.Container, bestHit *TopologyHint,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets, hintProviders []HintProvider) map[string]cpuset.CPUSet {
- allResAlloc := make(map[string]cpuset.CPUSet)
- for _, provider := range hintProviders {
- resAlloc := provider.Allocate(container, bestHit, topoInfo, resNumaSets)
- for resName, assign := range resAlloc {
- allResAlloc[resName] = assign
- }
- }
-
- return allResAlloc
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import (
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
-)
-
-func filterProvidersHints(providersHints []map[string][]TopologyHint) [][]TopologyHint {
- var allProviderHints [][]TopologyHint
- for _, hints := range providersHints {
- // If hints is nil, insert a single, preferred any-numa hint into allProviderHints.
- if len(hints) == 0 {
- klog.Infof("[numatopo] Hint Provider has no preference for NUMA affinity with any resource")
- allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
- continue
- }
-
- // Otherwise, accumulate the hints for each resource type into allProviderHints.
- for resource := range hints {
- if hints[resource] == nil {
- klog.Infof("[numatopo] Hint Provider has no preference for NUMA affinity with resource '%s'", resource)
- allProviderHints = append(allProviderHints, []TopologyHint{{nil, true}})
- continue
- }
-
- if len(hints[resource]) == 0 {
- klog.Infof("[numatopo] Hint Provider has no possible NUMA affinities for resource '%s'", resource)
- allProviderHints = append(allProviderHints, []TopologyHint{{nil, false}})
- continue
- }
-
- allProviderHints = append(allProviderHints, hints[resource])
- }
- }
- return allProviderHints
-}
-
-func mergeFilteredHints(numaNodes []int, filteredHints [][]TopologyHint) TopologyHint {
- // Set the default affinity as an any-numa affinity containing the list
- // of NUMA Nodes available on this machine.
- defaultAffinity, _ := bitmask.NewBitMask(numaNodes...)
-
- // Set the bestHint to return from this function as {nil false}.
- // This will only be returned if no better hint can be found when
- // merging hints from each hint provider.
- bestHint := TopologyHint{defaultAffinity, false}
- iterateAllProviderTopologyHints(filteredHints, func(permutation []TopologyHint) {
- // Get the NUMANodeAffinity from each hint in the permutation and see if any
- // of them encode unpreferred allocations.
- mergedHint := mergePermutation(numaNodes, permutation)
- // Only consider mergedHints that result in a NUMANodeAffinity > 0 to
- // replace the current bestHint.
- if mergedHint.NUMANodeAffinity.Count() == 0 {
- return
- }
-
- // If the current bestHint is non-preferred and the new mergedHint is
- // preferred, always choose the preferred hint over the non-preferred one.
- if mergedHint.Preferred && !bestHint.Preferred {
- bestHint = mergedHint
- return
- }
-
-		// If the current bestHint is preferred and the new mergedHint is
-		// non-preferred, never update bestHint, regardless of mergedHint's
-		// narrowness.
- if !mergedHint.Preferred && bestHint.Preferred {
- return
- }
-
-		// If mergedHint and bestHint have the same preference, only consider
- // mergedHints that have a narrower NUMANodeAffinity than the
- // NUMANodeAffinity in the current bestHint.
- if !mergedHint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) {
- return
- }
-
- // In all other cases, update bestHint to the current mergedHint
- bestHint = mergedHint
- })
-
- return bestHint
-}
-
-// Iterate over all permutations of hints in 'allProviderHints [][]TopologyHint'.
-//
-// This procedure is implemented as a recursive function over the set of hints
-// in 'allproviderHints[i]'. It applies the function 'callback' to each
-// permutation as it is found. It is the equivalent of:
-//
-// for i := 0; i < len(providerHints[0]); i++
-// for j := 0; j < len(providerHints[1]); j++
-// for k := 0; k < len(providerHints[2]); k++
-// ...
-// for z := 0; z < len(providerHints[-1]); z++
-// permutation := []TopologyHint{
-// providerHints[0][i],
-// providerHints[1][j],
-// providerHints[2][k],
-// ...
-// providerHints[-1][z]
-// }
-// callback(permutation)
-func iterateAllProviderTopologyHints(allProviderHints [][]TopologyHint, callback func([]TopologyHint)) {
- // Internal helper function to accumulate the permutation before calling the callback.
- var iterate func(i int, accum []TopologyHint)
- iterate = func(i int, accum []TopologyHint) {
- // Base case: we have looped through all providers and have a full permutation.
- if i == len(allProviderHints) {
- callback(accum)
- return
- }
-
-		// Loop through all hints for provider 'i', and recurse to build the
-		// permutation of this hint with all hints from providers 'i+1' onward.
- for j := range allProviderHints[i] {
- iterate(i+1, append(accum, allProviderHints[i][j]))
- }
- }
- iterate(0, []TopologyHint{})
-}
-
-// Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
-// of their affinity masks. The hint is preferred only if all hints in the permutation
-// are preferred.
-func mergePermutation(numaNodes []int, permutation []TopologyHint) TopologyHint {
- // Get the NUMANodeAffinity from each hint in the permutation and see if any
- // of them encode unpreferred allocations.
- preferred := true
- defaultAffinity, _ := bitmask.NewBitMask(numaNodes...)
- var numaAffinities []bitmask.BitMask
- for _, hint := range permutation {
- // Only consider hints that have an actual NUMANodeAffinity set.
- if hint.NUMANodeAffinity == nil {
- numaAffinities = append(numaAffinities, defaultAffinity)
- } else {
- numaAffinities = append(numaAffinities, hint.NUMANodeAffinity)
- }
-
- if !hint.Preferred {
- preferred = false
- }
- }
-
- // Merge the affinities using a bitwise-and operation.
- mergedAffinity := bitmask.And(defaultAffinity, numaAffinities...)
-	// Build a mergedHint from the merged affinity mask, indicating whether a
-	// preferred allocation was used to generate the affinity mask or not.
- return TopologyHint{mergedAffinity, preferred}
-}
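-
-// exampleMergePermutation is an illustrative sketch only (it is not called
-// anywhere): merging a preferred hint on NUMA nodes {0,1} with a preferred hint
-// on {1} yields a preferred hint on {1}, because the affinity masks are AND-ed
-// and both inputs are preferred.
-func exampleMergePermutation() TopologyHint {
-	wide, _ := bitmask.NewBitMask(0, 1)
-	narrow, _ := bitmask.NewBitMask(1)
-	return mergePermutation([]int{0, 1}, []TopologyHint{{wide, true}, {narrow, true}})
-}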
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import "k8s.io/klog"
-
-type policyBestEffort struct {
- numaNodes []int
-}
-
-// NewPolicyBestEffort returns a new best-effort policy
-func NewPolicyBestEffort(numaNodes []int) Policy {
- return &policyBestEffort{numaNodes: numaNodes}
-}
-
-func (p *policyBestEffort) canAdmitPodResult(hint *TopologyHint) bool {
- return true
-}
-
-func (p *policyBestEffort) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- filteredProvidersHints := filterProvidersHints(providersHints)
- bestHint := mergeFilteredHints(p.numaNodes, filteredProvidersHints)
- admit := p.canAdmitPodResult(&bestHint)
-
- klog.V(4).Infof("bestHint: %v admit %v\n", bestHint, admit)
- return bestHint, admit
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-type policyNone struct {
- numaNodes []int
-}
-
-// NewPolicyNone returns a new none policy
-func NewPolicyNone(numaNodes []int) Policy {
- return &policyNone{numaNodes: numaNodes}
-}
-
-func (policy *policyNone) canAdmitPodResult(hint *TopologyHint) bool {
- return true
-}
-
-func (policy *policyNone) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- return TopologyHint{}, policy.canAdmitPodResult(nil)
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import "k8s.io/klog"
-
-type policyRestricted struct {
- numaNodes []int
-}
-
-// NewPolicyRestricted returns a new restricted policy
-func NewPolicyRestricted(numaNodes []int) Policy {
- return &policyRestricted{numaNodes: numaNodes}
-}
-
-func (p *policyRestricted) canAdmitPodResult(hint *TopologyHint) bool {
- return hint.Preferred
-}
-
-func (p *policyRestricted) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- filteredHints := filterProvidersHints(providersHints)
- bestHint := mergeFilteredHints(p.numaNodes, filteredHints)
- admit := p.canAdmitPodResult(&bestHint)
-
- klog.V(4).Infof("bestHint: %v admit %v\n", bestHint, admit)
- return bestHint, admit
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package policy
-
-import "k8s.io/klog"
-
-type policySingleNumaNode struct {
- numaNodes []int
-}
-
-// NewPolicySingleNumaNode returns a new single-numa-node policy
-func NewPolicySingleNumaNode(numaNodes []int) Policy {
- return &policySingleNumaNode{numaNodes: numaNodes}
-}
-
-func (policy *policySingleNumaNode) canAdmitPodResult(hint *TopologyHint) bool {
- return hint.Preferred
-}
-
-// filterSingleNumaHints keeps only preferred hints whose bitmask has exactly one bit set (or no mask at all).
-func filterSingleNumaHints(allResourcesHints [][]TopologyHint) [][]TopologyHint {
- var filteredResourcesHints [][]TopologyHint
- for _, oneResourceHints := range allResourcesHints {
- var filtered []TopologyHint
- for _, hint := range oneResourceHints {
- if hint.NUMANodeAffinity == nil && hint.Preferred {
- filtered = append(filtered, hint)
- }
- if hint.NUMANodeAffinity != nil && hint.NUMANodeAffinity.Count() == 1 && hint.Preferred {
- filtered = append(filtered, hint)
- }
- }
- filteredResourcesHints = append(filteredResourcesHints, filtered)
- }
- return filteredResourcesHints
-}
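-
-// Illustrative example (sketch only): given a preferred hint on {0}, a preferred
-// hint on {0,1}, and a preferred hint with no mask at all, the filter above keeps
-// the {0} hint and the nil-mask hint but drops the {0,1} hint, since only
-// single-NUMA (or any-NUMA) preferred placements are acceptable for this policy.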
-
-func (policy *policySingleNumaNode) Predicate(providersHints []map[string][]TopologyHint) (TopologyHint, bool) {
- filteredHints := filterProvidersHints(providersHints)
- singleNumaHints := filterSingleNumaHints(filteredHints)
- bestHint := mergeFilteredHints(policy.numaNodes, singleNumaHints)
- klog.V(4).Infof("bestHint: %v\n", bestHint)
- admit := policy.canAdmitPodResult(&bestHint)
- return bestHint, admit
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cpumanager
-
-import (
- "fmt"
- "sort"
-
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
-)
-
-type cpuAccumulator struct {
- topo *topology.CPUTopology
- details topology.CPUDetails
- numCPUsNeeded int
- result cpuset.CPUSet
-}
-
-func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
- return &cpuAccumulator{
- topo: topo,
- details: topo.CPUDetails.KeepOnly(availableCPUs),
- numCPUsNeeded: numCPUs,
- result: cpuset.NewCPUSet(),
- }
-}
-
-func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
- a.result = a.result.Union(cpus)
- a.details = a.details.KeepOnly(a.details.CPUs().Difference(a.result))
- a.numCPUsNeeded -= cpus.Size()
-}
-
-// isSocketFree returns true if the supplied socket is fully available in a.details.
-func (a *cpuAccumulator) isSocketFree(socketID int) bool {
- return a.details.CPUsInSockets(socketID).Size() == a.topo.CPUsPerSocket()
-}
-
-// isCoreFree returns true if the supplied core is fully available in a.details.
-func (a *cpuAccumulator) isCoreFree(coreID int) bool {
- return a.details.CPUsInCores(coreID).Size() == a.topo.CPUsPerCore()
-}
-
-// freeSockets Returns free socket IDs as a slice sorted by:
-// - socket ID, ascending.
-func (a *cpuAccumulator) freeSockets() []int {
- return a.details.Sockets().Filter(a.isSocketFree).ToSlice()
-}
-
-// freeCores returns the IDs of fully free cores as a slice sorted by:
-// - the number of whole available cores on the socket, ascending
-// - socket ID, ascending
-// - core ID, ascending
-func (a *cpuAccumulator) freeCores() []int {
- socketIDs := a.details.Sockets().ToSliceNoSort()
- sort.Slice(socketIDs,
- func(i, j int) bool {
- iCores := a.details.CoresInSockets(socketIDs[i]).Filter(a.isCoreFree)
- jCores := a.details.CoresInSockets(socketIDs[j]).Filter(a.isCoreFree)
- return iCores.Size() < jCores.Size() || socketIDs[i] < socketIDs[j]
- })
-
- coreIDs := []int{}
- for _, s := range socketIDs {
- coreIDs = append(coreIDs, a.details.CoresInSockets(s).Filter(a.isCoreFree).ToSlice()...)
- }
- return coreIDs
-}
-
-// freeCPUs Returns CPU IDs as a slice sorted by:
-// - socket affinity with result
-// - number of CPUs available on the same socket
-// - number of CPUs available on the same core
-// - socket ID.
-// - core ID.
-func (a *cpuAccumulator) freeCPUs() []int {
- result := []int{}
- cores := a.details.Cores().ToSlice()
-
- sort.Slice(
- cores,
- func(i, j int) bool {
- iCore := cores[i]
- jCore := cores[j]
-
- iCPUs := a.topo.CPUDetails.CPUsInCores(iCore).ToSlice()
- jCPUs := a.topo.CPUDetails.CPUsInCores(jCore).ToSlice()
-
- iSocket := a.topo.CPUDetails[iCPUs[0]].SocketID
- jSocket := a.topo.CPUDetails[jCPUs[0]].SocketID
-
-			// Compute the number of CPUs in the result that reside on the same socket
-			// as each core.
- iSocketColoScore := a.topo.CPUDetails.CPUsInSockets(iSocket).Intersection(a.result).Size()
- jSocketColoScore := a.topo.CPUDetails.CPUsInSockets(jSocket).Intersection(a.result).Size()
-
-			// Compute the number of CPUs still available on the same socket
-			// as each core.
- iSocketFreeScore := a.details.CPUsInSockets(iSocket).Size()
- jSocketFreeScore := a.details.CPUsInSockets(jSocket).Size()
-
- // Compute the number of available CPUs on each core.
- iCoreFreeScore := a.details.CPUsInCores(iCore).Size()
- jCoreFreeScore := a.details.CPUsInCores(jCore).Size()
-
- return iSocketColoScore > jSocketColoScore ||
- iSocketFreeScore < jSocketFreeScore ||
- iCoreFreeScore < jCoreFreeScore ||
- iSocket < jSocket ||
- iCore < jCore
- })
-
- // For each core, append sorted CPU IDs to result.
- for _, core := range cores {
- result = append(result, a.details.CPUsInCores(core).ToSlice()...)
- }
- return result
-}
-
-func (a *cpuAccumulator) needs(n int) bool {
- return a.numCPUsNeeded >= n
-}
-
-func (a *cpuAccumulator) isSatisfied() bool {
- return a.numCPUsNeeded < 1
-}
-
-func (a *cpuAccumulator) isFailed() bool {
- return a.numCPUsNeeded > a.details.CPUs().Size()
-}
-
-// takeByTopology returns the assigned cpuset using a topology-aware best-fit algorithm
-func takeByTopology(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
- acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
- if acc.isSatisfied() {
- return acc.result, nil
- }
- if acc.isFailed() {
- return cpuset.NewCPUSet(), fmt.Errorf("not enough cpus available to satisfy request")
- }
-
- // Algorithm: topology-aware best-fit
- // 1. Acquire whole sockets, if available and the container requires at
- // least a socket's-worth of CPUs.
- if acc.needs(acc.topo.CPUsPerSocket()) {
- for _, s := range acc.freeSockets() {
- klog.V(4).Infof("[cpumanager] takeByTopology: claiming socket [%d]", s)
- acc.take(acc.details.CPUsInSockets(s))
- if acc.isSatisfied() {
- return acc.result, nil
- }
- if !acc.needs(acc.topo.CPUsPerSocket()) {
- break
- }
- }
- }
-
- // 2. Acquire whole cores, if available and the container requires at least
- // a core's-worth of CPUs.
- if acc.needs(acc.topo.CPUsPerCore()) {
- for _, c := range acc.freeCores() {
- klog.V(4).Infof("[cpumanager] takeByTopology: claiming core [%d]", c)
- acc.take(acc.details.CPUsInCores(c))
- if acc.isSatisfied() {
- return acc.result, nil
- }
- if !acc.needs(acc.topo.CPUsPerCore()) {
- break
- }
- }
- }
-
- // 3. Acquire single threads, preferring to fill partially-allocated cores
- // on the same sockets as the whole cores we have already taken in this
- // allocation.
- for _, c := range acc.freeCPUs() {
- klog.V(4).Infof("[cpumanager] takeByTopology: claiming CPU [%d]", c)
- if acc.needs(1) {
- acc.take(cpuset.NewCPUSet(c))
- }
- if acc.isSatisfied() {
- return acc.result, nil
- }
- }
-
- return cpuset.NewCPUSet(), fmt.Errorf("failed to allocate cpus")
-}
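-
-// Worked example of the best-fit order above (illustrative sketch only): on a
-// 2-socket machine with 4 cores x 2 threads per socket (16 CPUs, all free), a
-// request for 10 CPUs first claims a whole socket (8 CPUs) in step 1, then a
-// whole core (2 CPUs) on the other socket in step 2, and returns without ever
-// reaching step 3.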
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package cpumanager
-
-import (
- "math"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
- "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
- "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/plugins/numaaware/policy"
-)
-
-type cpuMng struct {
-}
-
-// NewProvider returns a new CPU hint provider
-func NewProvider() policy.HintProvider {
- return &cpuMng{}
-}
-
-// Name returns the cpu manager name
-func (mng *cpuMng) Name() string {
- return "cpuMng"
-}
-
-// guaranteedCPUs returns the integer number of CPUs requested; fractional requests return 0
-func guaranteedCPUs(container *v1.Container) int {
- cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
- if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
- return 0
- }
-
- return int(cpuQuantity.Value())
-}
-
-// generateCPUTopologyHints returns the NUMA topology hints for the given CPU request,
-// based on the available CPUs.
-func generateCPUTopologyHints(availableCPUs cpuset.CPUSet, CPUDetails topology.CPUDetails, request int) []policy.TopologyHint {
- minAffinitySize := CPUDetails.NUMANodes().Size()
- hints := []policy.TopologyHint{}
- bitmask.IterateBitMasks(CPUDetails.NUMANodes().ToSlice(), func(mask bitmask.BitMask) {
- // First, update minAffinitySize for the current request size.
- cpusInMask := CPUDetails.CPUsInNUMANodes(mask.GetBits()...).Size()
- if cpusInMask >= request && mask.Count() < minAffinitySize {
- minAffinitySize = mask.Count()
- }
-
-		// Then check whether enough available CPUs remain on the current
-		// NUMA node combination to satisfy the CPU request.
-		numMatching := 0
- for _, c := range availableCPUs.ToSlice() {
- if mask.IsSet(CPUDetails[c].NUMANodeID) {
- numMatching++
- }
- }
-
- // If they don't, then move onto the next combination.
- if numMatching < request {
- return
- }
-
- // Otherwise, create a new hint from the numa node bitmask and add it to the
- // list of hints. We set all hint preferences to 'false' on the first
- // pass through.
- hints = append(hints, policy.TopologyHint{
- NUMANodeAffinity: mask,
- Preferred: false,
- })
- })
-
- // Loop back through all hints and update the 'Preferred' field based on
- // counting the number of bits sets in the affinity mask and comparing it
- // to the minAffinitySize. Only those with an equal number of bits set (and
- // with a minimal set of numa nodes) will be considered preferred.
- for i := range hints {
- if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
- hints[i].Preferred = true
- }
- }
-
- return hints
-}
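-
-// Worked example (illustrative sketch only): with 2 NUMA nodes of 4 CPUs each
-// (all available) and a request for 4 CPUs, the masks {0} and {1} both fit and
-// use the minimal affinity size of one node, so they are marked preferred; the
-// mask {0,1} also fits but stays non-preferred because it spans two nodes.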
-
-func (mng *cpuMng) GetTopologyHints(container *v1.Container,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string][]policy.TopologyHint {
- if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
- klog.Warningf("container %s has no cpu request", container.Name)
- return nil
- }
-
- requestNum := guaranteedCPUs(container)
- if requestNum == 0 {
-		klog.Warningf("the cpu request of container %s is not an integer", container.Name)
- return nil
- }
-
- cputopo := &topology.CPUTopology{
- NumCPUs: topoInfo.CPUDetail.CPUs().Size(),
- NumCores: topoInfo.CPUDetail.Cores().Size() * topoInfo.CPUDetail.Sockets().Size(),
- NumSockets: topoInfo.CPUDetail.Sockets().Size(),
- CPUDetails: topoInfo.CPUDetail,
- }
-
- reserved := cpuset.NewCPUSet()
- reservedCPUs, ok := topoInfo.ResReserved[v1.ResourceCPU]
- if ok {
- // Take the ceiling of the reservation, since fractional CPUs cannot be
- // exclusively allocated.
- reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
- numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- reserved, _ = takeByTopology(cputopo, cputopo.CPUDetails.CPUs(), numReservedCPUs)
- klog.V(4).Infof("[cpumanager] reserve cpuset :%v", reserved)
- }
-
- availableCPUSet, ok := resNumaSets[string(v1.ResourceCPU)]
- if !ok {
- klog.Warningf("no cpu resource")
- return nil
- }
-
- availableCPUSet = availableCPUSet.Difference(reserved)
- klog.V(4).Infof("requested: %d, availableCPUSet: %v", requestNum, availableCPUSet)
- return map[string][]policy.TopologyHint{
- string(v1.ResourceCPU): generateCPUTopologyHints(availableCPUSet, topoInfo.CPUDetail, requestNum),
- }
-}
-
-func (mng *cpuMng) Allocate(container *v1.Container, bestHit *policy.TopologyHint,
- topoInfo *api.NumatopoInfo, resNumaSets api.ResNumaSets) map[string]cpuset.CPUSet {
- cputopo := &topology.CPUTopology{
- NumCPUs: topoInfo.CPUDetail.CPUs().Size(),
- NumCores: topoInfo.CPUDetail.Cores().Size() * topoInfo.CPUDetail.Sockets().Size(),
- NumSockets: topoInfo.CPUDetail.Sockets().Size(),
- CPUDetails: topoInfo.CPUDetail,
- }
-
- reserved := cpuset.NewCPUSet()
- reservedCPUs, ok := topoInfo.ResReserved[v1.ResourceCPU]
- if ok {
- // Take the ceiling of the reservation, since fractional CPUs cannot be
- // exclusively allocated.
- reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
- numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
- reserved, _ = takeByTopology(cputopo, cputopo.CPUDetails.CPUs(), numReservedCPUs)
- klog.V(3).Infof("[cpumanager] reserve cpuset :%v", reserved)
- }
-
- requestNum := guaranteedCPUs(container)
- availableCPUSet := resNumaSets[string(v1.ResourceCPU)]
- availableCPUSet = availableCPUSet.Difference(reserved)
-
- klog.V(4).Infof("alignedCPUs: %v requestNum: %v bestHit %v", availableCPUSet, requestNum, bestHit)
-
- result := cpuset.NewCPUSet()
- if bestHit.NUMANodeAffinity != nil {
- alignedCPUs := cpuset.NewCPUSet()
- for _, numaNodeID := range bestHit.NUMANodeAffinity.GetBits() {
- alignedCPUs = alignedCPUs.Union(availableCPUSet.Intersection(cputopo.CPUDetails.CPUsInNUMANodes(numaNodeID)))
- }
-
- numAlignedToAlloc := alignedCPUs.Size()
- if requestNum < numAlignedToAlloc {
- numAlignedToAlloc = requestNum
- }
-
- alignedCPUs, err := takeByTopology(cputopo, alignedCPUs, numAlignedToAlloc)
- if err != nil {
- return map[string]cpuset.CPUSet{
- string(v1.ResourceCPU): cpuset.NewCPUSet(),
- }
- }
-
- result = result.Union(alignedCPUs)
- }
-
- // Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
- remainingCPUs, err := takeByTopology(cputopo, availableCPUSet.Difference(result), requestNum-result.Size())
- if err != nil {
- return map[string]cpuset.CPUSet{
- string(v1.ResourceCPU): cpuset.NewCPUSet(),
- }
- }
-
- result = result.Union(remainingCPUs)
-
- return map[string]cpuset.CPUSet{
- string(v1.ResourceCPU): result,
- }
-}
-
-
-
/*
-Copyright 2020 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "fmt"
- "sync"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- batch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
-)
-
-type predicateCache struct {
- sync.RWMutex
-	cache map[string]map[string]bool // key_1: node name, key_2: pod template UID
-}
-
-// predicateCacheNew returns an empty predicate cache
-func predicateCacheNew() *predicateCache {
- return &predicateCache{
- cache: make(map[string]map[string]bool),
- }
-}
-
-// getPodTemplateUID returns the pod template UID from the pod's annotations
-func getPodTemplateUID(pod *v1.Pod) string {
- uid, found := pod.Annotations[batch.PodTemplateKey]
- if !found {
- return ""
- }
-
- return uid
-}
-
-// PredicateWithCache checks whether a cached predicate result exists for the given node and pod template
-func (pc *predicateCache) PredicateWithCache(nodeName string, pod *v1.Pod) (bool, error) {
- podTemplateUID := getPodTemplateUID(pod)
- if podTemplateUID == "" {
-		return false, fmt.Errorf("no annotation of volcano.sh/template-uid in pod %s", pod.Name)
- }
-
- pc.RLock()
- defer pc.RUnlock()
- if nodeCache, exist := pc.cache[nodeName]; exist {
- if result, exist := nodeCache[podTemplateUID]; exist {
- klog.V(4).Infof("Predicate node %s and pod %s result %v", nodeName, pod.Name, result)
- return result, nil
- }
- }
-
- return false, fmt.Errorf("no information of node %s and pod %s in predicate cache", nodeName, pod.Name)
-}
-
-// UpdateCache updates the cached predicate result for the given node and pod template
-func (pc *predicateCache) UpdateCache(nodeName string, pod *v1.Pod, fit bool) {
- podTemplateUID := getPodTemplateUID(pod)
- if podTemplateUID == "" {
-		klog.V(3).Infof("Cannot find the template uid of pod %s", pod.Name)
- return
- }
-
- pc.Lock()
- defer pc.Unlock()
-
- if _, exist := pc.cache[nodeName]; !exist {
- podCache := make(map[string]bool)
- podCache[podTemplateUID] = fit
- pc.cache[nodeName] = podCache
- } else {
- pc.cache[nodeName][podTemplateUID] = fit
- }
-}
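-
-// Illustrative usage sketch only (not part of the plugin): pods stamped from the
-// same template share one cached predicate result per node, keyed by the
-// volcano.sh/template-uid annotation carried by the pod, e.g.:
-//
-//	pc := predicateCacheNew()
-//	pc.UpdateCache("node-1", pod, true)              // record that the pod template fits node-1
-//	fit, err := pc.PredicateWithCache("node-1", pod) // later lookups reuse the cached result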
-
-
-
/*
-Copyright 2020 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// checkNodeGPUSharingPredicate checks if a gpu sharing pod can be scheduled on a node.
-func checkNodeGPUSharingPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
- // no gpu sharing request
- if api.GetGPUResourceOfPod(pod) <= 0 {
- return true, nil
- }
-
- id := predicateGPU(pod, nodeInfo)
- if id < 0 {
-		return false, fmt.Errorf("not enough gpu memory on any single device of node %s", nodeInfo.Name)
- }
- return true, nil
-}
-
-// predicateGPU returns the ID of the first GPU device with enough idle memory for the pod, or -1 if none fits
-func predicateGPU(pod *v1.Pod, node *api.NodeInfo) int {
- gpuRequest := api.GetGPUResourceOfPod(pod)
- allocatableGPUs := node.GetDevicesIdleGPUMemory()
-
- for devID := 0; devID < len(allocatableGPUs); devID++ {
- availableGPU, ok := allocatableGPUs[devID]
- if ok {
- if availableGPU >= gpuRequest {
- return devID
- }
- }
- }
-
- return -1
-}
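-
-// Illustrative example (sketch only, with assumed numbers): for a pod requesting
-// 2000 units of GPU memory on a node whose devices have 1024 and 4096 units of
-// idle GPU memory, device 0 is skipped and device 1 is returned; if no single
-// device has enough idle memory, -1 is returned and the predicate above rejects
-// the node.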
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "context"
- "fmt"
- "strings"
-
- v1 "k8s.io/api/core/v1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/apis/config"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
- "k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/plugins/util"
- "volcano.sh/volcano/pkg/scheduler/plugins/util/k8s"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin.
- PluginName = "predicates"
-
- // GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML
- GPUSharingPredicate = "predicate.GPUSharingEnable"
-
- // CachePredicate control cache predicate feature
- CachePredicate = "predicate.CacheEnable"
-
- // ProportionalPredicate is the key for enabling Proportional Predicate in YAML
- ProportionalPredicate = "predicate.ProportionalEnable"
- // ProportionalResource is the key for additional resource key name
- ProportionalResource = "predicate.resources"
- // ProportionalResourcesPrefix is the key prefix for additional resource key name
- ProportionalResourcesPrefix = ProportionalResource + "."
-)
-
-type predicatesPlugin struct {
- // Arguments given for the plugin
- pluginArguments framework.Arguments
-}
-
-// New returns a predicates plugin
-func New(arguments framework.Arguments) framework.Plugin {
- return &predicatesPlugin{pluginArguments: arguments}
-}
-
-func (pp *predicatesPlugin) Name() string {
- return PluginName
-}
-
-type baseResource struct {
- CPU float64
- Memory float64
-}
-
-type predicateEnable struct {
- gpuSharingEnable bool
- cacheEnable bool
- proportionalEnable bool
- proportional map[v1.ResourceName]baseResource
-}
-
-func enablePredicate(args framework.Arguments) predicateEnable {
- /*
-	   Users should enable predicates using keys of the form predicate.<Name>Enable,
-	   e.g. predicate.GPUSharingEnable. GPU sharing, predicate caching and proportional
-	   resource checks are currently supported, as in the example below.
-
- actions: "reclaim, allocate, backfill, preempt"
- tiers:
- - plugins:
- - name: priority
- - name: gang
- - name: conformance
- - plugins:
- - name: drf
- - name: predicates
- arguments:
- predicate.GPUSharingEnable: true
- predicate.CacheEnable: true
- predicate.ProportionalEnable: true
- predicate.resources: nvidia.com/gpu
- predicate.resources.nvidia.com/gpu.cpu: 4
- predicate.resources.nvidia.com/gpu.memory: 8
- - name: proportion
- - name: nodeorder
- */
-
- predicate := predicateEnable{
- gpuSharingEnable: false,
- cacheEnable: false,
- proportionalEnable: false,
- }
-
- // Checks whether predicate.GPUSharingEnable is provided or not, if given, modifies the value in predicateEnable struct.
- args.GetBool(&predicate.gpuSharingEnable, GPUSharingPredicate)
- args.GetBool(&predicate.cacheEnable, CachePredicate)
- // Checks whether predicate.ProportionalEnable is provided or not, if given, modifies the value in predicateEnable struct.
- args.GetBool(&predicate.proportionalEnable, ProportionalPredicate)
- resourcesProportional := make(map[v1.ResourceName]baseResource)
- resourcesStr := args[ProportionalResource]
- resources := strings.Split(resourcesStr, ",")
- for _, resource := range resources {
- resource = strings.TrimSpace(resource)
- if resource == "" {
- continue
- }
-		// predicate.resources.[ResourceName].cpu and predicate.resources.[ResourceName].memory
- cpuResourceKey := ProportionalResourcesPrefix + resource + ".cpu"
- cpuResourceRate := 1.0
- args.GetFloat64(&cpuResourceRate, cpuResourceKey)
- if cpuResourceRate < 0 {
- cpuResourceRate = 1.0
- }
- memoryResourceKey := ProportionalResourcesPrefix + resource + ".memory"
- memoryResourceRate := 1.0
- args.GetFloat64(&memoryResourceRate, memoryResourceKey)
- if memoryResourceRate < 0 {
- memoryResourceRate = 1.0
- }
- r := baseResource{
- CPU: cpuResourceRate,
- Memory: memoryResourceRate,
- }
- resourcesProportional[v1.ResourceName(resource)] = r
- }
- predicate.proportional = resourcesProportional
-
- return predicate
-}
-
-func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
- pl := util.NewPodListerFromNode(ssn)
- nodeMap := util.GenerateNodeMapAndSlice(ssn.Nodes)
-
- pCache := predicateCacheNew()
- predicate := enablePredicate(pp.pluginArguments)
-
- kubeClient := ssn.KubeClient()
- // Register event handlers to update task info in PodLister & nodeMap
- ssn.AddEventHandler(&framework.EventHandler{
- AllocateFunc: func(event *framework.Event) {
- pod := pl.UpdateTask(event.Task, event.Task.NodeName)
-
- nodeName := event.Task.NodeName
- node, found := nodeMap[nodeName]
- if !found {
- klog.Errorf("predicates, update pod %s/%s allocate to NOT EXIST node [%s]", pod.Namespace, pod.Name, nodeName)
- return
- }
-
- if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
- nodeInfo, ok := ssn.Nodes[nodeName]
- if !ok {
- klog.Errorf("Failed to get node %s info from cache", nodeName)
- return
- }
-
- id := predicateGPU(pod, nodeInfo)
- if id < 0 {
- klog.Errorf("The node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace)
- return
- }
- dev, ok := nodeInfo.GPUDevices[id]
- if !ok {
- klog.Errorf("Failed to get GPU %d from node %s", id, nodeName)
- return
- }
- patch := api.AddGPUIndexPatch(id)
- pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
- if err != nil {
- klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
- return
- }
- dev.PodMap[string(pod.UID)] = pod
- klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
- }
-
- node.AddPod(pod)
- klog.V(4).Infof("predicates, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
- },
- DeallocateFunc: func(event *framework.Event) {
- pod := pl.UpdateTask(event.Task, "")
- nodeName := event.Task.NodeName
- node, found := nodeMap[nodeName]
- if !found {
- klog.Errorf("predicates, update pod %s/%s allocate from NOT EXIST node [%s]", pod.Namespace, pod.Name, nodeName)
- return
- }
-
- if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
- // deallocate pod gpu id
- id := api.GetGPUIndex(pod)
- patch := api.RemoveGPUIndexPatch()
- _, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
- if err != nil {
- klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
- return
- }
-
- nodeInfo, ok := ssn.Nodes[nodeName]
- if !ok {
- klog.Errorf("Failed to get node %s info from cache", nodeName)
- return
- }
- if dev, ok := nodeInfo.GPUDevices[id]; ok {
- delete(dev.PodMap, string(pod.UID))
- }
-
- klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, nodeName)
- }
-
- err := node.RemovePod(pod)
- if err != nil {
- klog.Errorf("predicates, remove pod %s/%s from node [%s] error: %v", pod.Namespace, pod.Name, nodeName, err)
- return
- }
- klog.V(4).Infof("predicates, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, nodeName)
- },
- })
-
- // Initialize k8s plugins
- // TODO: Add more predicates, k8s.io/kubernetes/pkg/scheduler/framework/plugins/legacy_registry.go
- handle := k8s.NewFrameworkHandle(nodeMap, ssn.KubeClient(), ssn.InformerFactory())
- // 1. NodeUnschedulable
- plugin, _ := nodeunschedulable.New(nil, handle)
- nodeUnscheduleFilter := plugin.(*nodeunschedulable.NodeUnschedulable)
- // 2. NodeAffinity
- plugin, _ = nodeaffinity.New(nil, handle)
- nodeAffinityFilter := plugin.(*nodeaffinity.NodeAffinity)
- // 3. NodePorts
- plugin, _ = nodeports.New(nil, handle)
- nodePortFilter := plugin.(*nodeports.NodePorts)
- // 4. TaintToleration
- plugin, _ = tainttoleration.New(nil, handle)
- tolerationFilter := plugin.(*tainttoleration.TaintToleration)
- // 5. InterPodAffinity
- plArgs := &config.InterPodAffinityArgs{}
- plugin, _ = interpodaffinity.New(plArgs, handle)
- podAffinityFilter := plugin.(*interpodaffinity.InterPodAffinity)
-
- ssn.AddPredicateFn(pp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) error {
- nodeInfo, found := nodeMap[node.Name]
- if !found {
-			return fmt.Errorf("failed to run predicates: node info for %s not found", node.Name)
- }
-
- if node.Allocatable.MaxTaskNum <= len(nodeInfo.Pods) {
- klog.V(4).Infof("NodePodNumber predicates Task <%s/%s> on Node <%s> failed",
- task.Namespace, task.Name, node.Name)
- return api.NewFitError(task, node, api.NodePodNumberExceeded)
- }
-
- state := k8sframework.NewCycleState()
- predicateByStablefilter := func(pod *v1.Pod, nodeInfo *k8sframework.NodeInfo) (bool, error) {
- // CheckNodeUnschedulable
- status := nodeUnscheduleFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return false, fmt.Errorf("plugin %s predicates failed %s", nodeunschedulable.Name, status.Message())
- }
-
- // Check NodeAffinity
- status = nodeAffinityFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return false, fmt.Errorf("plugin %s predicates failed %s", nodeaffinity.Name, status.Message())
- }
-
- // PodToleratesNodeTaints: TaintToleration
- status = tolerationFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return false, fmt.Errorf("plugin %s predicates failed %s", tainttoleration.Name, status.Message())
- }
-
- return true, nil
- }
-
- // Check PredicateWithCache
- {
- var err error
- var fit bool
- if predicate.cacheEnable {
- fit, err = pCache.PredicateWithCache(node.Name, task.Pod)
- if err != nil {
- fit, err = predicateByStablefilter(task.Pod, nodeInfo)
- pCache.UpdateCache(node.Name, task.Pod, fit)
- } else {
- if !fit {
- err = fmt.Errorf("plugin equivalence cache predicates failed")
- }
- }
- } else {
- fit, err = predicateByStablefilter(task.Pod, nodeInfo)
- }
-
- if !fit {
- return err
- }
- }
-
- // Check NodePorts
- nodePortFilter.PreFilter(context.TODO(), state, task.Pod)
- status := nodePortFilter.Filter(context.TODO(), state, nil, nodeInfo)
- if !status.IsSuccess() {
-			return fmt.Errorf("plugin %s predicates failed %s", nodeports.Name, status.Message())
- }
-
- // InterPodAffinity Predicate
- status = podAffinityFilter.PreFilter(context.TODO(), state, task.Pod)
- if !status.IsSuccess() {
- return fmt.Errorf("plugin %s pre-predicates failed %s", interpodaffinity.Name, status.Message())
- }
-
- status = podAffinityFilter.Filter(context.TODO(), state, task.Pod, nodeInfo)
- if !status.IsSuccess() {
- return fmt.Errorf("plugin %s predicates failed %s", interpodaffinity.Name, status.Message())
- }
-
- if predicate.gpuSharingEnable {
- // CheckGPUSharingPredicate
- fit, err := checkNodeGPUSharingPredicate(task.Pod, node)
- if err != nil {
- return err
- }
-
- klog.V(4).Infof("checkNodeGPUSharingPredicate predicates Task <%s/%s> on Node <%s>: fit %v",
- task.Namespace, task.Name, node.Name, fit)
- }
- if predicate.proportionalEnable {
- // Check ProportionalPredicate
- fit, err := checkNodeResourceIsProportional(task, node, predicate.proportional)
- if err != nil {
- return err
- }
- klog.V(4).Infof("checkNodeResourceIsProportional predicates Task <%s/%s> on Node <%s>: fit %v",
- task.Namespace, task.Name, node.Name, fit)
- }
- return nil
- })
-}
-
-func (pp *predicatesPlugin) OnSessionClose(ssn *framework.Session) {}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package predicates
-
-import (
- "fmt"
-
- v1 "k8s.io/api/core/v1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// checkNodeResourceIsProportional checks whether the node retains enough CPU and memory in proportion to its idle scalar resources (e.g. GPUs)
-func checkNodeResourceIsProportional(task *api.TaskInfo, node *api.NodeInfo, proportional map[v1.ResourceName]baseResource) (bool, error) {
-	// if the task itself requests one of the proportional resources, skip the check
-	for resourceName := range proportional {
- if value, found := task.Resreq.ScalarResources[resourceName]; found && value > 0 {
- return true, nil
- }
- }
- for resourceName, resourceRate := range proportional {
- if value, found := node.Idle.ScalarResources[resourceName]; found {
- cpuReserved := value * resourceRate.CPU
- memoryReserved := value * resourceRate.Memory * 1000 * 1000
- r := node.Idle.Clone()
- r = r.Sub(task.Resreq)
- if r.MilliCPU < cpuReserved || r.Memory < memoryReserved {
- return false, fmt.Errorf("proportional of resource %s check failed", resourceName)
- }
- }
- }
- return true, nil
-}
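-
-// Worked example (illustrative sketch, using the rates from the plugin's example
-// configuration where each unit of the resource reserves 4 of CPU and 8 of
-// memory): while the node still has idle units of the proportional resource,
-// the corresponding CPU (value * rate.CPU) and memory (value * rate.Memory *
-// 1000 * 1000 bytes) are treated as reserved, so a CPU- or memory-only task that
-// would eat into that reservation is rejected and the remaining accelerator
-// capacity stays schedulable.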
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-type reqAction int
-
-const (
- reqSub reqAction = iota
- reqAdd
-)
-
-// Bucket is a struct used to classify tasks by affinity and anti-affinity
-type Bucket struct {
- index int
- tasks map[types.UID]*api.TaskInfo
- taskNameSet map[string]int
-
-	// reqScore is the score of the requested resources
-	// currently, 1 CPU, 1 GPU and 1Gi of memory are regarded as the same score.
- reqScore float64
- request *api.Resource
-
- boundTask int
- node map[string]int
-}
-
-// NewBucket creates a new empty bucket
-func NewBucket() *Bucket {
- return &Bucket{
- index: 0,
- tasks: make(map[types.UID]*api.TaskInfo),
- taskNameSet: make(map[string]int),
-
- reqScore: 0,
- request: api.EmptyResource(),
-
- boundTask: 0,
- node: make(map[string]int),
- }
-}
-
-// CalcResReq adds or subtracts the given resource request from the bucket's request and score
-func (b *Bucket) CalcResReq(req *api.Resource, action reqAction) {
- if req == nil {
- return
- }
-
- cpu := req.MilliCPU
-	// treat 1Mi of memory the same as 1m of CPU and 1m of GPU
- mem := req.Memory / 1024 / 1024
- score := cpu + mem
- for _, request := range req.ScalarResources {
- score += request
- }
-
- switch action {
- case reqSub:
- b.reqScore -= score
- b.request.Sub(req)
- case reqAdd:
- b.reqScore += score
- b.request.Add(req)
- default:
- klog.V(3).Infof("Invalid action <%v> for resource <%v>", action, req)
- }
-}
-
-// AddTask adds task into bucket
-func (b *Bucket) AddTask(taskName string, task *api.TaskInfo) {
- b.taskNameSet[taskName]++
- if task.NodeName != "" {
- b.node[task.NodeName]++
- b.boundTask++
- return
- }
-
- b.tasks[task.Pod.UID] = task
- b.CalcResReq(task.Resreq, reqAdd)
-}
-
-// TaskBound binds task to bucket
-func (b *Bucket) TaskBound(task *api.TaskInfo) {
- b.node[task.NodeName]++
- b.boundTask++
-
- delete(b.tasks, task.Pod.UID)
- b.CalcResReq(task.Resreq, reqSub)
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "fmt"
- "math"
- "sort"
- "strings"
-
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-type topologyType int
-
-const (
- selfAntiAffinity topologyType = iota
- interAntiAffinity
- selfAffinity
- interAffinity
-)
-
-// map[topologyType]priority; the larger the number, the higher the priority
-var affinityPriority = map[topologyType]int{
- selfAntiAffinity: 4,
- interAffinity: 3,
- selfAffinity: 2,
- interAntiAffinity: 1,
-}
-
-// JobManager is the struct used to save affinity info and buckets of a job
-type JobManager struct {
- jobID api.JobID
-
- buckets []*Bucket
- podInBucket map[types.UID]int
- podInTask map[types.UID]string
- taskOverPod map[string]map[types.UID]struct{}
-
- taskAffinityPriority map[string]int // [taskName] -> priority
- taskExistOrder map[string]int
- interAffinity map[string]map[string]struct{} // [taskName]->[taskName]
- selfAffinity map[string]struct{}
- interAntiAffinity map[string]map[string]struct{} // [taskName]->[taskName]
- selfAntiAffinity map[string]struct{}
-
- bucketMaxSize int
- nodeTaskSet map[string]map[string]int // [nodeName]->[taskName]
-}
-
-// NewJobManager creates a new job manager for job
-func NewJobManager(jobID api.JobID) *JobManager {
- return &JobManager{
- jobID: jobID,
-
- buckets: make([]*Bucket, 0),
- podInBucket: make(map[types.UID]int),
- podInTask: make(map[types.UID]string),
- taskOverPod: make(map[string]map[types.UID]struct{}),
-
- taskAffinityPriority: make(map[string]int),
- taskExistOrder: make(map[string]int),
- interAffinity: make(map[string]map[string]struct{}),
- interAntiAffinity: make(map[string]map[string]struct{}),
- selfAffinity: make(map[string]struct{}),
- selfAntiAffinity: make(map[string]struct{}),
-
- bucketMaxSize: 0,
- nodeTaskSet: make(map[string]map[string]int),
- }
-}
-
-// MarkOutOfBucket indicates task is outside of any bucket
-func (jm *JobManager) MarkOutOfBucket(uid types.UID) {
- jm.podInBucket[uid] = OutOfBucket
-}
-
-// MarkTaskHasTopology indicates task has topology settings
-func (jm *JobManager) MarkTaskHasTopology(taskName string, topoType topologyType) {
- priority := affinityPriority[topoType]
- if priority > jm.taskAffinityPriority[taskName] {
- jm.taskAffinityPriority[taskName] = priority
- }
-}
-
-// ApplyTaskTopology transforms taskTopology to matrix
-// affinity: [[a, b], [c]]
-// interAffinity:
-// a b c
-// a - x -
-// b x - -
-// c - - -
-// selfAffinity:
-// a b c
-// - - x
-func (jm *JobManager) ApplyTaskTopology(topo *TaskTopology) {
- for _, aff := range topo.Affinity {
- if len(aff) == 1 {
- taskName := aff[0]
- jm.selfAffinity[taskName] = struct{}{}
- jm.MarkTaskHasTopology(taskName, selfAffinity)
- continue
- }
- for index, src := range aff {
- for _, dst := range aff[:index] {
- addAffinity(jm.interAffinity, src, dst)
- addAffinity(jm.interAffinity, dst, src)
- }
- jm.MarkTaskHasTopology(src, interAffinity)
- }
- }
-
- for _, aff := range topo.AntiAffinity {
- if len(aff) == 1 {
- taskName := aff[0]
- jm.selfAntiAffinity[taskName] = struct{}{}
- jm.MarkTaskHasTopology(taskName, selfAntiAffinity)
- continue
- }
- for index, src := range aff {
- for _, dst := range aff[:index] {
- addAffinity(jm.interAntiAffinity, src, dst)
- addAffinity(jm.interAntiAffinity, dst, src)
- }
- jm.MarkTaskHasTopology(src, interAntiAffinity)
- }
- }
-
- length := len(topo.TaskOrder)
- for index, taskName := range topo.TaskOrder {
- jm.taskExistOrder[taskName] = length - index
- }
-}
-
-// NewBucket creates a new bucket
-func (jm *JobManager) NewBucket() *Bucket {
- bucket := NewBucket()
- bucket.index = len(jm.buckets)
- jm.buckets = append(jm.buckets, bucket)
- return bucket
-}
-
-// AddTaskToBucket adds task into bucket
-func (jm *JobManager) AddTaskToBucket(bucketIndex int, taskName string, task *api.TaskInfo) {
- bucket := jm.buckets[bucketIndex]
- jm.podInBucket[task.Pod.UID] = bucketIndex
- bucket.AddTask(taskName, task)
- if size := len(bucket.tasks) + bucket.boundTask; size > jm.bucketMaxSize {
- jm.bucketMaxSize = size
- }
-}
-
-// taskAffinityOrder compares L with R: -1 for L < R, 0 for L == R, 1 for L > R
-func (jm *JobManager) taskAffinityOrder(L, R *api.TaskInfo) int {
- LTaskName := jm.podInTask[L.Pod.UID]
- RTaskName := jm.podInTask[R.Pod.UID]
-
-	// pods in the same volcano task are equal
- if LTaskName == RTaskName {
- return 0
- }
-
-	// use the user-defined order first
- LOrder := jm.taskExistOrder[LTaskName]
- ROrder := jm.taskExistOrder[RTaskName]
- if LOrder != ROrder {
- if LOrder > ROrder {
- return 1
- }
- return -1
- }
-
- LPriority := jm.taskAffinityPriority[LTaskName]
- RPriority := jm.taskAffinityPriority[RTaskName]
- if LPriority != RPriority {
- if LPriority > RPriority {
- return 1
- }
- return -1
- }
-
-	// all affinity settings of L and R are the same, so they are equal
- return 0
-}
-
-func (jm *JobManager) buildTaskInfo(tasks map[api.TaskID]*api.TaskInfo) []*api.TaskInfo {
- taskWithoutBucket := make([]*api.TaskInfo, 0, len(tasks))
- for _, task := range tasks {
- pod := task.Pod
-
- taskName := getTaskName(task)
- if taskName == "" {
- jm.MarkOutOfBucket(pod.UID)
- continue
- }
- if _, hasTopology := jm.taskAffinityPriority[taskName]; !hasTopology {
- jm.MarkOutOfBucket(pod.UID)
- continue
- }
-
- jm.podInTask[pod.UID] = taskName
- taskSet, ok := jm.taskOverPod[taskName]
- if !ok {
- taskSet = make(map[types.UID]struct{})
- jm.taskOverPod[taskName] = taskSet
- }
- taskSet[pod.UID] = struct{}{}
- taskWithoutBucket = append(taskWithoutBucket, task)
- }
- return taskWithoutBucket
-}
-
-func (jm *JobManager) checkTaskSetAffinity(taskName string, taskNameSet map[string]int, onlyAnti bool) int {
- bucketPodAff := 0
-
- if taskName == "" {
- return bucketPodAff
- }
-
- for taskNameInBucket, count := range taskNameSet {
- theSameTask := taskNameInBucket == taskName
-
- if !onlyAnti {
- affinity := false
- if theSameTask {
- _, affinity = jm.selfAffinity[taskName]
- } else {
- _, affinity = jm.interAffinity[taskName][taskNameInBucket]
- }
- if affinity {
- bucketPodAff += count
- }
- }
-
- antiAffinity := false
- if theSameTask {
- _, antiAffinity = jm.selfAntiAffinity[taskName]
- } else {
- _, antiAffinity = jm.interAntiAffinity[taskName][taskNameInBucket]
- }
- if antiAffinity {
- bucketPodAff -= count
- }
- }
-
- return bucketPodAff
-}
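-
-// For illustration (the task names are hypothetical): if taskNameSet is
-// {"ps": 1, "worker": 2}, the incoming task is "worker", interAffinity maps
-// "worker" to "ps", and selfAntiAffinity contains "worker", then the result is
-// +1 (affinity with the "ps" pod) - 2 (anti-affinity with the two "worker"
-// pods) = -1.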
-
-func (jm *JobManager) buildBucket(taskWithOrder []*api.TaskInfo) {
- nodeBucketMapping := make(map[string]*Bucket)
-
- for _, task := range taskWithOrder {
- klog.V(5).Infof("jobID %s task with order task %s/%s", jm.jobID, task.Namespace, task.Name)
-
- var selectedBucket *Bucket
- maxAffinity := math.MinInt32
-
- taskName := getTaskName(task)
-
- if task.NodeName != "" {
- // generate bucket by node
- maxAffinity = 0
- selectedBucket = nodeBucketMapping[task.NodeName]
- } else {
- for _, bucket := range jm.buckets {
- bucketPodAff := jm.checkTaskSetAffinity(taskName, bucket.taskNameSet, false)
-
-				// choose the best-fit affinity, or balance resources between buckets
- if bucketPodAff > maxAffinity {
- maxAffinity = bucketPodAff
- selectedBucket = bucket
- } else if bucketPodAff == maxAffinity && selectedBucket != nil &&
- bucket.reqScore < selectedBucket.reqScore {
- selectedBucket = bucket
- }
- }
- }
-
- if maxAffinity < 0 || selectedBucket == nil {
- selectedBucket = jm.NewBucket()
- if task.NodeName != "" {
- nodeBucketMapping[task.NodeName] = selectedBucket
- }
- }
-
- jm.AddTaskToBucket(selectedBucket.index, taskName, task)
- }
-}
-
-// ConstructBucket builds buckets for the given tasks
-func (jm *JobManager) ConstructBucket(tasks map[api.TaskID]*api.TaskInfo) {
- taskWithoutBucket := jm.buildTaskInfo(tasks)
-
- o := TaskOrder{
- tasks: taskWithoutBucket,
-
- manager: jm,
- }
- sort.Sort(sort.Reverse(&o))
-
- jm.buildBucket(o.tasks)
-}
-
-// TaskBound binds task to bucket
-func (jm *JobManager) TaskBound(task *api.TaskInfo) {
- if taskName := getTaskName(task); taskName != "" {
- set, ok := jm.nodeTaskSet[task.NodeName]
- if !ok {
- set = make(map[string]int)
- jm.nodeTaskSet[task.NodeName] = set
- }
- set[taskName]++
- }
-
- bucket := jm.GetBucket(task)
- if bucket != nil {
- bucket.TaskBound(task)
- }
-}
-
-// GetBucket returns the bucket that the task has been placed in, or nil if it is out of any bucket
-func (jm *JobManager) GetBucket(task *api.TaskInfo) *Bucket {
- index, ok := jm.podInBucket[task.Pod.UID]
- if !ok || index == OutOfBucket {
- return nil
- }
-
- bucket := jm.buckets[index]
- return bucket
-}
-
-func (jm *JobManager) String() string {
- // saa: selfAntiAffinity
- // iaa: interAntiAffinity
- // sa: selfAffinity
- // ia: interAffinity
- msg := []string{
- fmt.Sprintf("%s - job %s max %d || saa: %v - iaa: %v - sa: %v - ia: %v || priority: %v - order: %v || ",
- PluginName, jm.jobID, jm.bucketMaxSize,
- jm.selfAntiAffinity, jm.interAntiAffinity,
- jm.selfAffinity, jm.interAffinity,
- jm.taskAffinityPriority, jm.taskExistOrder,
- ),
- }
-
- for _, bucket := range jm.buckets {
- bucketMsg := fmt.Sprintf("b:%d -- ", bucket.index)
- var info []string
- for _, task := range bucket.tasks {
- info = append(info, task.Pod.Name)
- }
- bucketMsg += strings.Join(info, ", ")
- bucketMsg += "|"
-
- info = nil
- for nodeName, count := range bucket.node {
- info = append(info, fmt.Sprintf("n%s-%d", nodeName, count))
- }
- bucketMsg += strings.Join(info, ", ")
-
- msg = append(msg, "["+bucketMsg+"]")
- }
- return strings.Join(msg, " ")
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "fmt"
- "strings"
- "time"
-
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
-)
-
-type taskTopologyPlugin struct {
- arguments framework.Arguments
-
- weight int
- managers map[api.JobID]*JobManager
-}
-
-// New function returns taskTopologyPlugin object
-func New(arguments framework.Arguments) framework.Plugin {
- return &taskTopologyPlugin{
- arguments: arguments,
-
- weight: calculateWeight(arguments),
- managers: make(map[api.JobID]*JobManager),
- }
-}
-
-func (p *taskTopologyPlugin) Name() string {
- return PluginName
-}
-
-// TaskOrderFn returns -1 to make l take priority over r.
-//
-// for example:
-// A:
-// | bucket1 | bucket2 | out of bucket
-// | a1 a3 | a2 | a4
-// B:
-// | bucket1 | out of bucket
-// | b1 b2 | b3
-// the right task order should be:
-// a1 a3 a2 b1 b2 a4 b3
-func (p *taskTopologyPlugin) TaskOrderFn(l interface{}, r interface{}) int {
- lv, ok := l.(*api.TaskInfo)
- if !ok {
- klog.Errorf("Object is not a taskinfo")
- }
- rv, ok := r.(*api.TaskInfo)
- if !ok {
- klog.Errorf("Object is not a taskinfo")
- }
-
- lvJobManager := p.managers[lv.Job]
- rvJobManager := p.managers[rv.Job]
-
- var lvBucket, rvBucket *Bucket
- if lvJobManager != nil {
- lvBucket = lvJobManager.GetBucket(lv)
- } else {
- klog.V(4).Infof("No job manager for job <ID: %s>, do not return task order.", lv.Job)
- return 0
- }
- if rvJobManager != nil {
- rvBucket = rvJobManager.GetBucket(rv)
- } else {
- klog.V(4).Infof("No job manager for job <ID: %s>, do not return task order.", rv.Job)
- return 0
- }
-
-	// the task that has a bucket always takes priority over the one that does not
- lvInBucket := lvBucket != nil
- rvInBucket := rvBucket != nil
- if lvInBucket != rvInBucket {
- if lvInBucket {
- return -1
- }
- return 1
- }
-
-	// comparison between jobs is not the duty of this plugin
- if lv.Job != rv.Job {
- return 0
- }
-
-	// tasks out of any bucket have no order
- if !lvInBucket && !rvInBucket {
- return 0
- }
-
-	// the bigger bucket takes priority over the smaller one
- lvHasTask := len(lvBucket.tasks)
- rvHasTask := len(rvBucket.tasks)
- if lvHasTask != rvHasTask {
- if lvHasTask > rvHasTask {
- return -1
- }
- return 1
- }
-
- lvBucketIndex := lvBucket.index
- rvBucketIndex := rvBucket.index
- // in the same bucket, the affinityOrder is ok
- if lvBucketIndex == rvBucketIndex {
- affinityOrder := lvJobManager.taskAffinityOrder(lv, rv)
- return -affinityOrder
- }
-
-	// the older bucket takes priority over the younger one
- if lvBucketIndex < rvBucketIndex {
- return -1
- }
- return 1
-}
-
-func (p *taskTopologyPlugin) calcBucketScore(task *api.TaskInfo, node *api.NodeInfo) (int, *JobManager, error) {
-	// the task could never fit the node
- maxResource := node.Idle.Clone().Add(node.Releasing)
- if req := task.Resreq; req != nil && maxResource.LessPartly(req, api.Zero) {
- return 0, nil, nil
- }
-
- jobManager, hasManager := p.managers[task.Job]
- if !hasManager {
- return 0, nil, nil
- }
-
- bucket := jobManager.GetBucket(task)
- // task out of bucket
- if bucket == nil {
- return 0, jobManager, nil
- }
-
-	// 1. the number of the bucket's tasks already bound to this node is the base score
- score := bucket.node[node.Name]
-
- // 2. task inter/self anti-affinity should be calculated
- if nodeTaskSet := jobManager.nodeTaskSet[node.Name]; nodeTaskSet != nil {
- taskName := getTaskName(task)
- affinityScore := jobManager.checkTaskSetAffinity(taskName, nodeTaskSet, true)
- if affinityScore < 0 {
- score += affinityScore
- }
- }
- klog.V(4).Infof("task %s/%s, node %s, additional score %d, task %d",
- task.Namespace, task.Name, node.Name, score, len(bucket.tasks))
-
-	// 3. take the other tasks in the bucket into consideration
- score += len(bucket.tasks)
- if bucket.request == nil || bucket.request.LessEqual(maxResource, api.Zero) {
- return score, jobManager, nil
- }
-
- remains := bucket.request.Clone()
-	// randomly (by map iteration order) take out tasks until the bucket fits the node
- for bucketTaskID, bucketTask := range bucket.tasks {
-		// the current task should be kept in the bucket
- if bucketTaskID == task.Pod.UID || bucketTask.Resreq == nil {
- continue
- }
- remains.Sub(bucketTask.Resreq)
- score--
- if remains.LessEqual(maxResource, api.Zero) {
- break
- }
- }
-	// at this point, the bucket's remaining request always fits maxResource
- return score, jobManager, nil
-}
-
-func (p *taskTopologyPlugin) NodeOrderFn(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- score, jobManager, err := p.calcBucketScore(task, node)
- if err != nil {
- return 0, err
- }
- fScore := float64(score * p.weight)
- if jobManager != nil && jobManager.bucketMaxSize != 0 {
- fScore = fScore * float64(v1alpha1.MaxNodeScore) / float64(jobManager.bucketMaxSize)
- }
- klog.V(4).Infof("task %s/%s at node %s has bucket score %d, score %f",
- task.Namespace, task.Name, node.Name, score, fScore)
- return fScore, nil
-}
-
-func (p *taskTopologyPlugin) AllocateFunc(event *framework.Event) {
- task := event.Task
-
- jobManager, hasManager := p.managers[task.Job]
- if !hasManager {
- return
- }
- jobManager.TaskBound(task)
-}
-
-func (p *taskTopologyPlugin) initBucket(ssn *framework.Session) {
- for jobID, job := range ssn.Jobs {
- if noPendingTasks(job) {
- klog.V(4).Infof("No pending tasks in job <%s/%s> by plugin %s.",
- job.Namespace, job.Name, PluginName)
- continue
- }
-
- jobTopology, err := readTopologyFromPgAnnotations(job)
- if err != nil {
- klog.V(4).Infof("Failed to read task topology from job <%s/%s> annotations, error: %s.",
- job.Namespace, job.Name, err.Error())
- continue
- }
- if jobTopology == nil {
- continue
- }
-
- manager := NewJobManager(jobID)
- manager.ApplyTaskTopology(jobTopology)
- manager.ConstructBucket(job.Tasks)
-
- p.managers[job.UID] = manager
- }
-}
-
-func affinityCheck(job *api.JobInfo, affinity [][]string) error {
- if job == nil || affinity == nil {
- return fmt.Errorf("empty input, job: %v, affinity: %v", job, affinity)
- }
-
- var taskNumber = len(job.Tasks)
- var taskRef = make(map[string]bool, taskNumber)
- for _, task := range job.Tasks {
- tmpStrings := strings.Split(task.Name, "-")
- if _, exist := taskRef[tmpStrings[len(tmpStrings)-2]]; !exist {
- taskRef[tmpStrings[len(tmpStrings)-2]] = true
- }
- }
-
- for _, aff := range affinity {
- affTasks := make(map[string]bool, len(aff))
- for _, task := range aff {
- if len(task) == 0 {
- continue
- }
- if _, exist := taskRef[task]; !exist {
-				return fmt.Errorf("task %s does not exist in job <%s/%s>", task, job.Namespace, job.Name)
- }
- if _, exist := affTasks[task]; exist {
- return fmt.Errorf("task %s is duplicated in job <%s/%s>", task, job.Namespace, job.Name)
- }
- affTasks[task] = true
- }
- }
-
- return nil
-}
-
-func splitAnnotations(job *api.JobInfo, annotation string) ([][]string, error) {
- affinityStr := strings.Split(annotation, ";")
- if len(affinityStr) == 0 {
- return nil, nil
- }
- var affinity = make([][]string, len(affinityStr))
- for i, str := range affinityStr {
- affinity[i] = strings.Split(str, ",")
- }
- if err := affinityCheck(job, affinity); err != nil {
- klog.V(4).Infof("Job <%s/%s> affinity key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- return affinity, nil
-}
-
-func readTopologyFromPgAnnotations(job *api.JobInfo) (*TaskTopology, error) {
- jobAffinityStr, affinityExist := job.PodGroup.Annotations[JobAffinityAnnotations]
- jobAntiAffinityStr, antiAffinityExist := job.PodGroup.Annotations[JobAntiAffinityAnnotations]
- taskOrderStr, taskOrderExist := job.PodGroup.Annotations[TaskOrderAnnotations]
-
- if !(affinityExist || antiAffinityExist || taskOrderExist) {
- return nil, nil
- }
-
- var jobTopology = TaskTopology{
- Affinity: nil,
- AntiAffinity: nil,
- TaskOrder: nil,
- }
-
- if affinityExist {
- affinities, err := splitAnnotations(job, jobAffinityStr)
- if err != nil {
- klog.V(4).Infof("Job <%s/%s> affinity key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- jobTopology.Affinity = affinities
- }
-
- if antiAffinityExist {
- affinities, err := splitAnnotations(job, jobAntiAffinityStr)
- if err != nil {
- klog.V(4).Infof("Job <%s/%s> anti affinity key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- jobTopology.AntiAffinity = affinities
- }
-
- if taskOrderExist {
- jobTopology.TaskOrder = strings.Split(taskOrderStr, ",")
- if err := affinityCheck(job, [][]string{jobTopology.TaskOrder}); err != nil {
- klog.V(4).Infof("Job <%s/%s> task order key invalid: %s.",
- job.Namespace, job.Name, err.Error())
- return nil, err
- }
- }
-
- return &jobTopology, nil
-}
-
-func (p *taskTopologyPlugin) OnSessionOpen(ssn *framework.Session) {
- start := time.Now()
- klog.V(3).Infof("start to init task topology plugin, weight[%d], defined order %v", p.weight, affinityPriority)
-
- p.initBucket(ssn)
-
- ssn.AddTaskOrderFn(p.Name(), p.TaskOrderFn)
-
- ssn.AddNodeOrderFn(p.Name(), p.NodeOrderFn)
-
- ssn.AddEventHandler(&framework.EventHandler{
- AllocateFunc: p.AllocateFunc,
- })
-
- klog.V(3).Infof("finished to init task topology plugin, using time %v", time.Since(start))
-}
-
-func (p *taskTopologyPlugin) OnSessionClose(ssn *framework.Session) {
- p.managers = nil
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tasktopology
-
-import (
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin
- PluginName = "task-topology"
- // PluginWeight is task-topology plugin weight in nodeOrderFn
- PluginWeight = "task-topology.weight"
- // JobAffinityKey is the key to read in task-topology arguments from job annotations
- JobAffinityKey = "volcano.sh/task-topology"
- // OutOfBucket indicates task is outside of any bucket
- OutOfBucket = -1
-
- // JobAffinityAnnotations is the key to read in task-topology affinity arguments from podgroup annotations
- JobAffinityAnnotations = "volcano.sh/task-topology-affinity"
- // JobAntiAffinityAnnotations is the key to read in task-topology anti-affinity arguments from podgroup annotations
- JobAntiAffinityAnnotations = "volcano.sh/task-topology-anti-affinity"
- // TaskOrderAnnotations is the key to read in task-topology task order arguments from podgroup annotations
- TaskOrderAnnotations = "volcano.sh/task-topology-task-order"
-)
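-
-// Illustrative PodGroup annotations using the keys above (the task names "ps"
-// and "worker" are hypothetical; groups are separated by ";" and task names
-// within a group by ",", matching splitAnnotations):
-//
-//   annotations:
-//     volcano.sh/task-topology-affinity: "ps,worker"
-//     volcano.sh/task-topology-anti-affinity: "ps;worker"
-//     volcano.sh/task-topology-task-order: "ps,worker"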
-
-// TaskTopology is the struct used to save a job's affinity info, read from the job plugin or annotations
-type TaskTopology struct {
- Affinity [][]string `json:"affinity,omitempty"`
- AntiAffinity [][]string `json:"antiAffinity,omitempty"`
- TaskOrder []string `json:"taskOrder,omitempty"`
-}
-
-func calculateWeight(args framework.Arguments) int {
- /*
-	Users should set the task-topology weight in the following format (task-topology.weight).
-
- actions: "enqueue, reclaim, allocate, backfill, preempt"
- tiers:
- - plugins:
- - name: task-topology
- arguments:
- task-topology.weight: 10
- */
-	// The weight defaults to 1.
- weight := 1
-
- args.GetInt(&weight, PluginWeight)
-
- return weight
-}
-
-func getTaskName(task *api.TaskInfo) string {
- return task.Pod.Annotations[v1alpha1.TaskSpecKey]
-}
-
-func addAffinity(m map[string]map[string]struct{}, src, dst string) {
- srcMap, ok := m[src]
- if !ok {
- srcMap = make(map[string]struct{})
- m[src] = srcMap
- }
- srcMap[dst] = struct{}{}
-}
-
-func noPendingTasks(job *api.JobInfo) bool {
- return len(job.TaskStatusIndex[api.Pending]) == 0
-}
-
-// TaskOrder is the struct used to save task order
-type TaskOrder struct {
- tasks []*api.TaskInfo
- manager *JobManager
-}
-
-func (p *TaskOrder) Len() int { return len(p.tasks) }
-
-func (p *TaskOrder) Swap(l, r int) {
- p.tasks[l], p.tasks[r] = p.tasks[r], p.tasks[l]
-}
-
-func (p *TaskOrder) Less(l, r int) bool {
- L := p.tasks[l]
- R := p.tasks[r]
-
- LHasNode := L.NodeName != ""
- RHasNode := R.NodeName != ""
- if LHasNode || RHasNode {
-		// the bound task has higher priority
- if LHasNode != RHasNode {
- return !LHasNode
- }
- // all bound, any order is alright
- return L.NodeName > R.NodeName
- }
-
- result := p.manager.taskAffinityOrder(L, R)
- // they have the same taskAffinity order, any order is alright
- if result == 0 {
- return L.Name > R.Name
- }
- return result < 0
-}
-
-
-
/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package tdm
-
-import (
- "fmt"
- "strings"
- "time"
-
- "k8s.io/apimachinery/pkg/util/intstr"
- "k8s.io/klog"
- "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/pkg/scheduler/api"
- "volcano.sh/volcano/pkg/scheduler/framework"
- tutil "volcano.sh/volcano/pkg/scheduler/plugins/util"
- "volcano.sh/volcano/pkg/scheduler/util"
-)
-
-const (
- // PluginName indicates name of volcano scheduler plugin.
- PluginName = "tdm"
-	// revocableZoneLayout is the time layout used to parse revocable zones
- revocableZoneLayout = "15:04"
- revocableZoneLabelPrefix = "tdm.revocable-zone."
- evictPeriodLabel = "tdm.evict.period"
- defaultPodEvictNum = 1
-)
-
-var lastEvictAt time.Time
-
-/*
- actions: "enqueue, reclaim, allocate, preempt"
- tiers:
- - plugins:
- - name: tdm
- arguments:
- tdm.revocable-zone.rz1: 10:00-21:00
- tdm.revocable-zone.rz2: 12:00-14:00
- tdm.evict.period: 1m
-*/
-
-type tdmPlugin struct {
- revocableZone map[string]string
- // evictPeriod
- // default 1m
- evictPeriod time.Duration
-}
-
-// New function returns a tdmPlugin object
-func New(args framework.Arguments) framework.Plugin {
- revocableZone := make(map[string]string)
- evictPeriod := time.Minute
-
- for k, v := range args {
- if strings.Contains(k, revocableZoneLabelPrefix) {
- revocableZone[strings.Replace(k, revocableZoneLabelPrefix, "", 1)] = v
- }
- }
-
- if period, ok := args[evictPeriodLabel]; ok {
- if d, err := time.ParseDuration(period); err == nil {
- evictPeriod = d
- }
- }
-
- return &tdmPlugin{revocableZone, evictPeriod}
-}
-
-func (tp *tdmPlugin) Name() string {
- return PluginName
-}
-
-func parseRevocableZone(rzRaw string) (start, end time.Time, err error) {
- rzValues := strings.Split(strings.TrimSpace(rzRaw), "-")
-
- if len(rzValues) != 2 {
- err = fmt.Errorf("revocable zone %v format error", rzRaw)
- return
- }
-
- t1, err := time.Parse(revocableZoneLayout, rzValues[0])
- if err != nil {
- return
- }
-
- t2, err := time.Parse(revocableZoneLayout, rzValues[1])
- if err != nil {
- return
- }
-
- now := time.Now()
-
- start = time.Date(now.Year(), now.Month(), now.Day(), t1.Hour(), t1.Minute(), 0, 0, now.Location())
- if t1.After(t2) || t1.Equal(t2) {
- end = time.Date(now.Year(), now.Month(), now.Day()+1, t2.Hour(), t2.Minute(), 0, 0, now.Location())
- } else {
- end = time.Date(now.Year(), now.Month(), now.Day(), t2.Hour(), t2.Minute(), 0, 0, now.Location())
- }
-
- return
-}
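-
-// For illustration (the time ranges are hypothetical): with rzRaw "09:00-17:00"
-// both boundaries fall on the current day, while with "22:00-06:00" the start
-// time is after the end time, so the end rolls over to the next day and the
-// zone is treated as active from today 22:00 until tomorrow 06:00.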
-
-func (tp *tdmPlugin) availableRevocableZone(rz string) error {
- // rzRaw format 00:00-23:59
- rzRaw, ok := tp.revocableZone[rz]
- if !ok {
- return fmt.Errorf("revocable zone %v not support", rz)
- }
-
- now := time.Now()
-
- start, end, err := parseRevocableZone(rzRaw)
- if err != nil {
- return err
- }
-
- if now.Unix() < start.Unix() || now.Unix() > end.Unix() {
- return fmt.Errorf("current time beyond revocable zone %v:%v", rz, rzRaw)
- }
-
- return nil
-}
-
-func (tp *tdmPlugin) OnSessionOpen(ssn *framework.Session) {
- klog.V(4).Infof("Enter tdm plugin ...")
- if klog.V(4) {
- defer func() {
- klog.V(4).Infof("Leaving tdm plugin.")
- }()
- }
-
-	// the tdm plugin only handles revocable nodes
- predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) error {
- if node.RevocableZone == "" {
- return nil
- }
-
- if err := tp.availableRevocableZone(node.RevocableZone); err != nil {
- return fmt.Errorf("plugin %s predicates %w", tp.Name(), err)
- }
-
- klog.V(4).Infof("TDM node %v revocable zone %v:%v is active", node.Name, node.RevocableZone, tp.revocableZone[node.RevocableZone])
-
- if len(task.RevocableZone) == 0 {
-			msg := fmt.Sprintf("task %s/%s is not allowed to dispatch to revocable node %s", task.Namespace, task.Name, node.Name)
- return fmt.Errorf("plugin %s predicates %s", tp.Name(), msg)
- }
-
- klog.V(4).Infof("TDM filter for Task %s/%s on node %s pass.", task.Namespace, task.Name, node.Name)
- return nil
- }
-
-	// the tdm plugin only handles revocable nodes
- nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
- score := 0.0
-
- if node.RevocableZone == "" {
- return score, nil
- }
-
- if err := tp.availableRevocableZone(node.RevocableZone); err != nil {
- klog.V(4).Infof("TDM not available %s", err)
- return score, err
- }
-
- if len(task.RevocableZone) == 0 {
-			klog.V(4).Infof("TDM task %s/%s is not allowed to dispatch to revocable node %s", task.Namespace, task.Name, node.Name)
- return score, nil
- }
-
- score = float64(v1alpha1.MaxNodeScore)
-
- klog.V(4).Infof("TDM score for Task %s/%s on node %s is: %v", task.Namespace, task.Name, node.Name, score)
- return score, nil
- }
-
- preemptableFn := func(preemptor *api.TaskInfo, preemptees []*api.TaskInfo) ([]*api.TaskInfo, int) {
-		// preemptable workloads, or workloads that can use a revocable zone, cannot preempt other tasks
- if preemptor.Preemptable || len(preemptor.RevocableZone) > 0 {
- klog.V(4).Infof("TDM task %s/%s is preemptable, do nothing skip", preemptor.Namespace, preemptor.Name)
- return nil, tutil.Reject
- }
-
- var victims []*api.TaskInfo
- tasksMap := make(map[api.JobID][]*api.TaskInfo)
-
-		// find preemptable tasks that run on non-revocable nodes
- for _, task := range preemptees {
- if !task.Preemptable || task.Status != api.Running {
- continue
- }
-
- node, ok := ssn.Nodes[task.NodeName]
- if !ok {
- continue
- }
-
- if node.RevocableZone != "" {
- continue
- }
-
- tasksMap[task.Job] = append(tasksMap[task.Job], task)
- }
-
- for jobID, preemptableTasks := range tasksMap {
- if job, ok := ssn.Jobs[jobID]; ok {
- victims = append(victims, tp.maxVictims(job, preemptableTasks)...)
- }
- }
-
- klog.V(4).Infof("TDM victims are %+v", victims)
-
- return victims, tutil.Permit
- }
-
- victimsFn := func() []*api.TaskInfo {
- if lastEvictAt.Add(tp.evictPeriod).After(time.Now()) {
- klog.V(4).Infof("TDM next evict time at %v", lastEvictAt)
- return nil
- }
-
- klog.V(4).Infof("TDM start to find victims")
-
-		// find preemptable tasks on nodes whose revocable zone is no longer active
- victims := make([]*api.TaskInfo, 0)
- for rz := range tp.revocableZone {
- if err := tp.availableRevocableZone(rz); err != nil {
-				klog.V(4).Infof("TDM revocable zone %v inactive, %v", rz, err)
-				// the revocable zone is inactive, so evict preemptable tasks by job from its revocable nodes
- for jobID, preemtableTasks := range tp.revocableNodePreemptableTask(rz, ssn) {
- if job, ok := ssn.Jobs[jobID]; ok {
- victims = append(victims, tp.maxVictims(job, preemtableTasks)...)
- }
- }
- }
- }
-
- // need to consider concurrency?
- lastEvictAt = time.Now()
-
- klog.V(4).Infof("TDM got %v victims", len(victims))
-
- return victims
- }
-
- jobOrderFn := func(l, r interface{}) int {
- lv := l.(*api.JobInfo)
- rv := r.(*api.JobInfo)
-
- if lv.Preemptable == rv.Preemptable {
- return 0
- }
-
- if !lv.Preemptable {
- return -1
- }
-
- return 1
- }
-
- jobPipelinedFn := func(obj interface{}) int {
- jobInfo := obj.(*api.JobInfo)
- occupied := jobInfo.WaitingTaskNum() + jobInfo.ReadyTaskNum()
- if occupied >= jobInfo.MinAvailable {
- return tutil.Permit
- }
- return tutil.Reject
- }
-
- jobStarvingFn := func(obj interface{}) bool {
- jobInfo := obj.(*api.JobInfo)
-		// allow non-preemptable elastic jobs (e.g. deployments) to preempt tasks
- if jobInfo.Preemptable {
- return false
- }
- return len(jobInfo.TaskStatusIndex[api.Pending]) > 0
- }
-
- ssn.AddPredicateFn(tp.Name(), predicateFn)
- ssn.AddNodeOrderFn(tp.Name(), nodeOrderFn)
- ssn.AddPreemptableFn(tp.Name(), preemptableFn)
- ssn.AddVictimTasksFns(tp.Name(), victimsFn)
- ssn.AddJobOrderFn(tp.Name(), jobOrderFn)
- ssn.AddJobPipelinedFn(tp.Name(), jobPipelinedFn)
- ssn.AddJobStarvingFns(tp.Name(), jobStarvingFn)
-}
-
-func (tp *tdmPlugin) maxVictims(job *api.JobInfo, victims []*api.TaskInfo) []*api.TaskInfo {
- maxPodEvictNum := tp.getMaxPodEvictNum(job)
- targetNum := util.GetMinInt(maxPodEvictNum, len(victims))
- klog.V(3).Infof("Job <%s/%s> max evict:%v, potential victims number:%v, max victims number:%v",
- job.Namespace, job.Name, maxPodEvictNum, len(victims), targetNum)
-
- return victims[:targetNum]
-}
-
-// getMaxPodEvictNum returns the max number of pods that can be evicted based on the job's budget configuration
-func (tp *tdmPlugin) getMaxPodEvictNum(job *api.JobInfo) int {
- jobRunningTaskNum := len(job.TaskStatusIndex[api.Running])
- if job.Budget.MaxUnavilable != "" {
- maxUnavilable := tp.parseIntStr(job.Budget.MaxUnavilable, len(job.Tasks))
- finalTaskNum := len(job.TaskStatusIndex[api.Succeeded]) + len(job.TaskStatusIndex[api.Failed])
- realUnavilable := len(job.Tasks) - finalTaskNum - jobRunningTaskNum
- if realUnavilable >= maxUnavilable {
- return 0
- }
- return maxUnavilable - realUnavilable
- }
-
- if job.Budget.MinAvailable != "" {
- minAvailable := tp.parseIntStr(job.Budget.MinAvailable, len(job.Tasks))
- if jobRunningTaskNum >= minAvailable {
- return jobRunningTaskNum - minAvailable
- }
- }
-
- return defaultPodEvictNum
-}
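-
-// A hypothetical example of the budget arithmetic above: for a job with 10
-// tasks and MaxUnavilable "20%", parseIntStr yields 2; if 1 task has already
-// finished and 8 are running, realUnavilable = 10 - 1 - 8 = 1, so at most
-// 2 - 1 = 1 more pod may be evicted. With MinAvailable "6" and 8 running tasks
-// instead, up to 8 - 6 = 2 pods may be evicted.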
-
-func (tp *tdmPlugin) parseIntStr(input string, taskNum int) int {
- resultValue := 0
- tmp := intstr.Parse(input)
- switch tmp.Type {
- case intstr.Int:
- resultValue = tmp.IntValue()
- case intstr.String:
- if v, err := intstr.GetValueFromIntOrPercent(&tmp, taskNum, true); err == nil {
- resultValue = v
- } else {
- klog.Warningf("TDM get percent value err: %v", err)
- }
- }
-
- return resultValue
-}
-
-func (tp *tdmPlugin) revocableNodePreemptableTask(rz string, ssn *framework.Session) map[api.JobID][]*api.TaskInfo {
- tasksMap := make(map[api.JobID][]*api.TaskInfo)
- for _, node := range ssn.RevocableNodes {
- if node.RevocableZone != rz {
- continue
- }
-
- for _, task := range node.Tasks {
- if task.Preemptable {
- if task.Status == api.Running {
- tasksMap[task.Job] = append(tasksMap[task.Job], task)
- }
- }
- }
- }
-
- return tasksMap
-}
-
-func (tp *tdmPlugin) OnSessionClose(ssn *framework.Session) {}
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package scheduler
-
-import (
- "fmt"
- "path/filepath"
- "sync"
- "time"
-
- "github.com/fsnotify/fsnotify"
- "k8s.io/apimachinery/pkg/util/wait"
- "k8s.io/client-go/rest"
- "k8s.io/klog"
-
- "volcano.sh/volcano/pkg/filewatcher"
- schedcache "volcano.sh/volcano/pkg/scheduler/cache"
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/metrics"
-)
-
-// Scheduler watches for new unscheduled pods for volcano. It attempts to find
-// nodes that they fit on and writes bindings back to the api server.
-type Scheduler struct {
- cache schedcache.Cache
- schedulerConf string
- fileWatcher filewatcher.FileWatcher
- schedulePeriod time.Duration
- once sync.Once
-
- mutex sync.Mutex
- actions []framework.Action
- plugins []conf.Tier
- configurations []conf.Configuration
-}
-
-// NewScheduler returns a scheduler
-func NewScheduler(
- config *rest.Config,
- schedulerName string,
- schedulerConf string,
- period time.Duration,
- defaultQueue string,
- nodeSelectors []string,
-) (*Scheduler, error) {
- var watcher filewatcher.FileWatcher
- if schedulerConf != "" {
- var err error
- path := filepath.Dir(schedulerConf)
- watcher, err = filewatcher.NewFileWatcher(path)
- if err != nil {
- return nil, fmt.Errorf("failed creating filewatcher for %s: %v", schedulerConf, err)
- }
- }
-
- scheduler := &Scheduler{
- schedulerConf: schedulerConf,
- fileWatcher: watcher,
- cache: schedcache.New(config, schedulerName, defaultQueue, nodeSelectors),
- schedulePeriod: period,
- }
-
- return scheduler, nil
-}
-
-// Run runs the Scheduler
-func (pc *Scheduler) Run(stopCh <-chan struct{}) {
- pc.loadSchedulerConf()
- go pc.watchSchedulerConf(stopCh)
- // Start cache for policy.
- pc.cache.Run(stopCh)
- pc.cache.WaitForCacheSync(stopCh)
-	klog.V(2).Infof("Scheduler completes initialization and starts to run")
- go wait.Until(pc.runOnce, pc.schedulePeriod, stopCh)
-}
-
-func (pc *Scheduler) runOnce() {
- klog.V(4).Infof("Start scheduling ...")
- scheduleStartTime := time.Now()
- defer klog.V(4).Infof("End scheduling ...")
-
- pc.mutex.Lock()
- actions := pc.actions
- plugins := pc.plugins
- configurations := pc.configurations
- pc.mutex.Unlock()
-
- ssn := framework.OpenSession(pc.cache, plugins, configurations)
- defer framework.CloseSession(ssn)
-
- for _, action := range actions {
- actionStartTime := time.Now()
- action.Execute(ssn)
- metrics.UpdateActionDuration(action.Name(), metrics.Duration(actionStartTime))
- }
- metrics.UpdateE2eDuration(metrics.Duration(scheduleStartTime))
-}
-
-func (pc *Scheduler) loadSchedulerConf() {
- var err error
- pc.once.Do(func() {
- pc.actions, pc.plugins, pc.configurations, err = unmarshalSchedulerConf(defaultSchedulerConf)
- if err != nil {
- klog.Errorf("unmarshal scheduler config %s failed: %v", defaultSchedulerConf, err)
- panic("invalid default configuration")
- }
- })
-
- var config string
- if len(pc.schedulerConf) != 0 {
- if config, err = readSchedulerConf(pc.schedulerConf); err != nil {
- klog.Errorf("Failed to read scheduler configuration '%s', using previous configuration: %v",
- pc.schedulerConf, err)
- return
- }
- }
-
- actions, plugins, configurations, err := unmarshalSchedulerConf(config)
- if err != nil {
- klog.Errorf("scheduler config %s is invalid: %v", config, err)
- return
- }
-
- pc.mutex.Lock()
- // If it is valid, use the new configuration
- pc.actions = actions
- pc.plugins = plugins
- pc.configurations = configurations
- pc.mutex.Unlock()
-}
-
-func (pc *Scheduler) watchSchedulerConf(stopCh <-chan struct{}) {
- if pc.fileWatcher == nil {
- return
- }
- eventCh := pc.fileWatcher.Events()
- errCh := pc.fileWatcher.Errors()
- for {
- select {
- case event, ok := <-eventCh:
- if !ok {
- return
- }
- klog.V(4).Infof("watch %s event: %v", pc.schedulerConf, event)
- if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create {
- pc.loadSchedulerConf()
- }
- case err, ok := <-errCh:
- if !ok {
- return
- }
- klog.Infof("watch %s error: %v", pc.schedulerConf, err)
- case <-stopCh:
- return
- }
- }
-}
-
-
-
/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package scheduler
-
-import (
- "fmt"
- "io/ioutil"
- "strings"
-
- "gopkg.in/yaml.v2"
-
- "volcano.sh/volcano/pkg/scheduler/conf"
- "volcano.sh/volcano/pkg/scheduler/framework"
- "volcano.sh/volcano/pkg/scheduler/plugins"
-)
-
-var defaultSchedulerConf = `
-actions: "enqueue, allocate, backfill"
-tiers:
-- plugins:
- - name: priority
- - name: gang
- - name: conformance
-- plugins:
- - name: overcommit
- - name: drf
- - name: predicates
- - name: proportion
- - name: nodeorder
-`
-
-func unmarshalSchedulerConf(confStr string) ([]framework.Action, []conf.Tier, []conf.Configuration, error) {
- var actions []framework.Action
-
- schedulerConf := &conf.SchedulerConfiguration{}
-
- if err := yaml.Unmarshal([]byte(confStr), schedulerConf); err != nil {
- return nil, nil, nil, err
- }
- // Set default settings for each plugin if not set
- for i, tier := range schedulerConf.Tiers {
- // drf with hierarchy enabled
- hdrf := false
- // proportion enabled
- proportion := false
- for j := range tier.Plugins {
- if tier.Plugins[j].Name == "drf" &&
- tier.Plugins[j].EnabledHierarchy != nil &&
- *tier.Plugins[j].EnabledHierarchy {
- hdrf = true
- }
- if tier.Plugins[j].Name == "proportion" {
- proportion = true
- }
- plugins.ApplyPluginConfDefaults(&schedulerConf.Tiers[i].Plugins[j])
- }
- if hdrf && proportion {
- return nil, nil, nil, fmt.Errorf("proportion and drf with hierarchy enabled conflicts")
- }
- }
-
- actionNames := strings.Split(schedulerConf.Actions, ",")
- for _, actionName := range actionNames {
- if action, found := framework.GetAction(strings.TrimSpace(actionName)); found {
- actions = append(actions, action)
- } else {
- return nil, nil, nil, fmt.Errorf("failed to find Action %s, ignore it", actionName)
- }
- }
-
- return actions, schedulerConf.Tiers, schedulerConf.Configurations, nil
-}
-
-func readSchedulerConf(confPath string) (string, error) {
- dat, err := ioutil.ReadFile(confPath)
- if err != nil {
- return "", err
- }
- return string(dat), nil
-}
-
-
-
package util
-
-import (
- "context"
- "fmt"
- "sync"
- "sync/atomic"
-
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-type PredicateHelper interface {
- PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn) ([]*api.NodeInfo, *api.FitErrors)
-}
-
-type predicateHelper struct {
- taskPredicateErrorCache map[string]map[string]error
-}
-
-// PredicateNodes returns the specified number of nodes that fit a task
-func (ph *predicateHelper) PredicateNodes(task *api.TaskInfo, nodes []*api.NodeInfo, fn api.PredicateFn) ([]*api.NodeInfo, *api.FitErrors) {
- var errorLock sync.RWMutex
- fe := api.NewFitErrors()
-
- allNodes := len(nodes)
- if allNodes == 0 {
- return make([]*api.NodeInfo, 0), fe
- }
- numNodesToFind := CalculateNumOfFeasibleNodesToFind(int32(allNodes))
-
- //allocate enough space to avoid growing it
- predicateNodes := make([]*api.NodeInfo, numNodesToFind)
-
- numFoundNodes := int32(0)
- processedNodes := int32(0)
-
- taskGroupid := taskGroupID(task)
- nodeErrorCache, taskFailedBefore := ph.taskPredicateErrorCache[taskGroupid]
- if nodeErrorCache == nil {
- nodeErrorCache = map[string]error{}
- }
-
- //create a context with cancellation
- ctx, cancel := context.WithCancel(context.Background())
-
- checkNode := func(index int) {
-		// Check the nodes starting from where it left off in the previous scheduling cycle,
- // to make sure all nodes have the same chance of being examined across pods.
- node := nodes[(lastProcessedNodeIndex+index)%allNodes]
- atomic.AddInt32(&processedNodes, 1)
- klog.V(4).Infof("Considering Task <%v/%v> on node <%v>: <%v> vs. <%v>",
- task.Namespace, task.Name, node.Name, task.Resreq, node.Idle)
-
- // Check if the task had "predicate" failure before.
-		// And then check if the task failed the predicates on this node before.
- if taskFailedBefore {
- errorLock.RLock()
- errC, ok := nodeErrorCache[node.Name]
- errorLock.RUnlock()
-
- if ok {
- errorLock.Lock()
- fe.SetNodeError(node.Name, errC)
- errorLock.Unlock()
- return
- }
- }
-
- // TODO (k82cn): Enable eCache for performance improvement.
- if err := fn(task, node); err != nil {
- klog.V(3).Infof("Predicates failed for task <%s/%s> on node <%s>: %v",
- task.Namespace, task.Name, node.Name, err)
- errorLock.Lock()
- nodeErrorCache[node.Name] = err
- ph.taskPredicateErrorCache[taskGroupid] = nodeErrorCache
- fe.SetNodeError(node.Name, err)
- errorLock.Unlock()
- return
- }
-
-		// check whether the number of found nodes exceeds numNodesToFind
- length := atomic.AddInt32(&numFoundNodes, 1)
- if length > numNodesToFind {
- cancel()
- atomic.AddInt32(&numFoundNodes, -1)
- } else {
- predicateNodes[length-1] = node
- }
- }
-
- //workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), checkNode)
- workqueue.ParallelizeUntil(ctx, 16, allNodes, checkNode)
-
- //processedNodes := int(numFoundNodes) + len(filteredNodesStatuses) + len(failedPredicateMap)
- lastProcessedNodeIndex = (lastProcessedNodeIndex + int(processedNodes)) % allNodes
- predicateNodes = predicateNodes[:numFoundNodes]
- return predicateNodes, fe
-}
-
-func taskGroupID(task *api.TaskInfo) string {
- return fmt.Sprintf("%s/%s", task.Job, task.GetTaskSpecKey())
-}
-
-func NewPredicateHelper() PredicateHelper {
- return &predicateHelper{taskPredicateErrorCache: map[string]map[string]error{}}
-}
-
-
-
/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "container/heap"
-
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// PriorityQueue implements a scheduling queue.
-type PriorityQueue struct {
- queue priorityQueue
-}
-
-type priorityQueue struct {
- items []interface{}
- lessFn api.LessFn
-}
-
-// NewPriorityQueue returns a PriorityQueue
-func NewPriorityQueue(lessFn api.LessFn) *PriorityQueue {
- return &PriorityQueue{
- queue: priorityQueue{
- items: make([]interface{}, 0),
- lessFn: lessFn,
- },
- }
-}
-
-// Push pushes element in the priority Queue
-func (q *PriorityQueue) Push(it interface{}) {
- heap.Push(&q.queue, it)
-}
-
-// Pop pops element in the priority Queue
-func (q *PriorityQueue) Pop() interface{} {
- if q.Len() == 0 {
- return nil
- }
-
- return heap.Pop(&q.queue)
-}
-
-// Empty checks if the queue is empty
-func (q *PriorityQueue) Empty() bool {
- return q.queue.Len() == 0
-}
-
-// Len returns the length of the priority queue
-func (q *PriorityQueue) Len() int {
- return q.queue.Len()
-}
-
-func (pq *priorityQueue) Len() int { return len(pq.items) }
-
-func (pq *priorityQueue) Less(i, j int) bool {
- if pq.lessFn == nil {
- return i < j
- }
-
- // We want Pop to give us the highest, not lowest, priority so we use greater than here.
- return pq.lessFn(pq.items[i], pq.items[j])
-}
-
-func (pq priorityQueue) Swap(i, j int) {
- pq.items[i], pq.items[j] = pq.items[j], pq.items[i]
-}
-
-func (pq *priorityQueue) Push(x interface{}) {
- (*pq).items = append((*pq).items, x)
-}
-
-func (pq *priorityQueue) Pop() interface{} {
- old := (*pq).items
- n := len(old)
- item := old[n-1]
- (*pq).items = old[0 : n-1]
- return item
-}
-
-
-
/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "context"
- "fmt"
- "math"
- "math/rand"
- "sort"
- "sync"
-
- "k8s.io/client-go/util/workqueue"
- "k8s.io/klog"
- k8sframework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
-
- "volcano.sh/volcano/cmd/scheduler/app/options"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-const baselinePercentageOfNodesToFind = 50
-
-var lastProcessedNodeIndex int
-
-// Reservation is used to record target job and locked nodes
-var Reservation *ResourceReservation
-
-func init() {
- Reservation = NewResourceReservation()
-}
-
-// CalculateNumOfFeasibleNodesToFind returns the number of feasible nodes that, once found,
-// causes the scheduler to stop searching for more feasible nodes.
-func CalculateNumOfFeasibleNodesToFind(numAllNodes int32) (numNodes int32) {
- opts := options.ServerOpts
- if numAllNodes <= opts.MinNodesToFind || opts.PercentageOfNodesToFind >= 100 {
- return numAllNodes
- }
-
- adaptivePercentage := opts.PercentageOfNodesToFind
- if adaptivePercentage <= 0 {
- adaptivePercentage = baselinePercentageOfNodesToFind - numAllNodes/125
- if adaptivePercentage < opts.MinPercentageOfNodesToFind {
- adaptivePercentage = opts.MinPercentageOfNodesToFind
- }
- }
-
- numNodes = numAllNodes * adaptivePercentage / 100
- if numNodes < opts.MinNodesToFind {
- numNodes = opts.MinNodesToFind
- }
- return numNodes
-}
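-
-// A worked example with hypothetical option values: with MinNodesToFind=100,
-// MinPercentageOfNodesToFind=5 and PercentageOfNodesToFind left unset (0),
-// a cluster of 5000 nodes yields adaptivePercentage = 50 - 5000/125 = 10,
-// so the search stops once 5000 * 10 / 100 = 500 feasible nodes are found.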
-
-// PrioritizeNodes returns a map whose keys are node scores and whose values are the corresponding nodes
-func PrioritizeNodes(task *api.TaskInfo, nodes []*api.NodeInfo, batchFn api.BatchNodeOrderFn, mapFn api.NodeOrderMapFn, reduceFn api.NodeOrderReduceFn) map[float64][]*api.NodeInfo {
- pluginNodeScoreMap := map[string]k8sframework.NodeScoreList{}
- nodeOrderScoreMap := map[string]float64{}
- nodeScores := map[float64][]*api.NodeInfo{}
- var workerLock sync.Mutex
- scoreNode := func(index int) {
- node := nodes[index]
- mapScores, orderScore, err := mapFn(task, node)
- if err != nil {
- klog.Errorf("Error in Calculating Priority for the node:%v", err)
- return
- }
-
- workerLock.Lock()
- for plugin, score := range mapScores {
- nodeScoreMap, ok := pluginNodeScoreMap[plugin]
- if !ok {
- nodeScoreMap = k8sframework.NodeScoreList{}
- }
- hp := k8sframework.NodeScore{}
- hp.Name = node.Name
- hp.Score = int64(math.Floor(score))
- pluginNodeScoreMap[plugin] = append(nodeScoreMap, hp)
- }
- nodeOrderScoreMap[node.Name] = orderScore
- workerLock.Unlock()
- }
- workqueue.ParallelizeUntil(context.TODO(), 16, len(nodes), scoreNode)
- reduceScores, err := reduceFn(task, pluginNodeScoreMap)
- if err != nil {
- klog.Errorf("Error in Calculating Priority for the node:%v", err)
- return nodeScores
- }
-
- batchNodeScore, err := batchFn(task, nodes)
- if err != nil {
- klog.Errorf("Error in Calculating batch Priority for the node, err %v", err)
- return nodeScores
- }
-
- for _, node := range nodes {
- if score, found := reduceScores[node.Name]; found {
- if orderScore, ok := nodeOrderScoreMap[node.Name]; ok {
- score += orderScore
- }
- if batchScore, ok := batchNodeScore[node.Name]; ok {
- score += batchScore
- }
- nodeScores[score] = append(nodeScores[score], node)
- } else {
- // If no plugin is applied to this node, the default is 0.0
- score = 0.0
- if orderScore, ok := nodeOrderScoreMap[node.Name]; ok {
- score += orderScore
- }
- if batchScore, ok := batchNodeScore[node.Name]; ok {
- score += batchScore
- }
- nodeScores[score] = append(nodeScores[score], node)
- }
- }
- return nodeScores
-}
-
-// SortNodes returns nodes in descending order of score
-func SortNodes(nodeScores map[float64][]*api.NodeInfo) []*api.NodeInfo {
- var nodesInorder []*api.NodeInfo
- var keys []float64
- for key := range nodeScores {
- keys = append(keys, key)
- }
- sort.Sort(sort.Reverse(sort.Float64Slice(keys)))
- for _, key := range keys {
- nodes := nodeScores[key]
- nodesInorder = append(nodesInorder, nodes...)
- }
- return nodesInorder
-}
-
-// SelectBestNode returns the node with the highest score, picking one randomly if several nodes share that score.
-func SelectBestNode(nodeScores map[float64][]*api.NodeInfo) *api.NodeInfo {
- var bestNodes []*api.NodeInfo
- maxScore := -1.0
- for score, nodes := range nodeScores {
- if score > maxScore {
- maxScore = score
- bestNodes = nodes
- }
- }
-
- if len(bestNodes) == 0 {
- return nil
- }
-
- return bestNodes[rand.Intn(len(bestNodes))]
-}
-
-// GetNodeList returns the NodeInfo values in the map 'nodes' for the given node names
-func GetNodeList(nodes map[string]*api.NodeInfo, nodeList []string) []*api.NodeInfo {
- result := make([]*api.NodeInfo, 0, len(nodeList))
- for _, nodename := range nodeList {
- if ni, ok := nodes[nodename]; ok {
- result = append(result, ni)
- }
- }
- return result
-}
-
-// ValidateVictims returns an error if the resources of the victims can't satisfy the preemptor
-func ValidateVictims(preemptor *api.TaskInfo, node *api.NodeInfo, victims []*api.TaskInfo) error {
- if len(victims) == 0 {
- return fmt.Errorf("no victims")
- }
- futureIdle := node.FutureIdle()
- for _, victim := range victims {
- futureIdle.Add(victim.Resreq)
- }
- // Every resource of the preemptor needs to be less or equal than corresponding
- // idle resource after preemption.
- if !preemptor.InitResreq.LessEqual(futureIdle, api.Zero) {
- return fmt.Errorf("not enough resources: requested <%v>, but future idle <%v>",
- preemptor.InitResreq, futureIdle)
- }
- return nil
-}
-
-// ResourceReservation is struct used for resource reservation
-type ResourceReservation struct {
- TargetJob *api.JobInfo
- LockedNodes map[string]*api.NodeInfo
-}
-
-// NewResourceReservation creates the global ResourceReservation instance
-func NewResourceReservation() *ResourceReservation {
- return &ResourceReservation{
- TargetJob: nil,
- LockedNodes: map[string]*api.NodeInfo{},
- }
-}
-
-// GetMinInt returns the minimum int from vals
-func GetMinInt(vals ...int) int {
- if len(vals) == 0 {
- return 0
- }
-
- min := vals[0]
- for _, val := range vals {
- if val <= min {
- min = val
- }
- }
- return min
-}
-
-
-
/*
-Copyright 2019 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package util
-
-import (
- "fmt"
- "sync"
-
- v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/resource"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/types"
- "k8s.io/client-go/kubernetes"
- volumescheduling "k8s.io/kubernetes/pkg/controller/volume/scheduling"
-
- schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/scheduler/api"
-)
-
-// BuildResourceList builds a resource list object
-func BuildResourceList(cpu string, memory string) v1.ResourceList {
- return v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- api.GPUResourceName: resource.MustParse("0"),
- }
-}
-
-// BuildResourceListWithGPU builds a resource list with GPU
-func BuildResourceListWithGPU(cpu string, memory string, GPU string) v1.ResourceList {
- return v1.ResourceList{
- v1.ResourceCPU: resource.MustParse(cpu),
- v1.ResourceMemory: resource.MustParse(memory),
- api.GPUResourceName: resource.MustParse(GPU),
- }
-}
-
-// BuildNode builds a node object
-func BuildNode(name string, alloc v1.ResourceList, labels map[string]string) *v1.Node {
- return &v1.Node{
- ObjectMeta: metav1.ObjectMeta{
- Name: name,
- Labels: labels,
- Annotations: map[string]string{},
- },
- Status: v1.NodeStatus{
- Capacity: alloc,
- Allocatable: alloc,
- },
- }
-}
-
-// BuildPod builds a Pod object
-func BuildPod(namespace, name, nodename string, p v1.PodPhase, req v1.ResourceList, groupName string, labels map[string]string, selector map[string]string) *v1.Pod {
- return &v1.Pod{
- ObjectMeta: metav1.ObjectMeta{
- UID: types.UID(fmt.Sprintf("%v-%v", namespace, name)),
- Name: name,
- Namespace: namespace,
- Labels: labels,
- Annotations: map[string]string{
- schedulingv2.KubeGroupNameAnnotationKey: groupName,
- },
- },
- Status: v1.PodStatus{
- Phase: p,
- },
- Spec: v1.PodSpec{
- NodeName: nodename,
- NodeSelector: selector,
- Containers: []v1.Container{
- {
- Resources: v1.ResourceRequirements{
- Requests: req,
- },
- },
- },
- },
- }
-}
-
-// FakeBinder is used as fake binder
-type FakeBinder struct {
- Binds map[string]string
- Channel chan string
-}
-
-// Bind used by fake binder struct to bind pods
-func (fb *FakeBinder) Bind(kubeClient *kubernetes.Clientset, tasks []*api.TaskInfo) (error, []*api.TaskInfo) {
- for _, p := range tasks {
- key := fmt.Sprintf("%v/%v", p.Namespace, p.Name)
- fb.Binds[key] = p.NodeName
- }
-
- return nil, nil
-}
-
-// FakeEvictor is used as fake evictor
-type FakeEvictor struct {
- sync.Mutex
- evicts []string
- Channel chan string
-}
-
-// Evicts returns copy of evicted pods.
-func (fe *FakeEvictor) Evicts() []string {
- fe.Lock()
- defer fe.Unlock()
- return append([]string{}, fe.evicts...)
-}
-
-// Evict is used by fake evictor to evict pods
-func (fe *FakeEvictor) Evict(p *v1.Pod, reason string) error {
- fe.Lock()
- defer fe.Unlock()
-
- fmt.Println("PodName: ", p.Name)
- key := fmt.Sprintf("%v/%v", p.Namespace, p.Name)
- fe.evicts = append(fe.evicts, key)
-
- fe.Channel <- key
-
- return nil
-}
-
-// FakeStatusUpdater is used for fake status update
-type FakeStatusUpdater struct {
-}
-
-// UpdatePodCondition is a empty function
-func (ftsu *FakeStatusUpdater) UpdatePodCondition(pod *v1.Pod, podCondition *v1.PodCondition) (*v1.Pod, error) {
- // do nothing here
- return nil, nil
-}
-
-// UpdatePodGroup is a empty function
-func (ftsu *FakeStatusUpdater) UpdatePodGroup(pg *api.PodGroup) (*api.PodGroup, error) {
- // do nothing here
- return nil, nil
-}
-
-// FakeVolumeBinder is used as fake volume binder
-type FakeVolumeBinder struct {
-}
-
-// AllocateVolumes is a empty function
-func (fvb *FakeVolumeBinder) AllocateVolumes(task *api.TaskInfo, hostname string, podVolumes *volumescheduling.PodVolumes) error {
- return nil
-}
-
-// BindVolumes is a empty function
-func (fvb *FakeVolumeBinder) BindVolumes(task *api.TaskInfo, podVolumes *volumescheduling.PodVolumes) error {
- return nil
-}
-
-// GetPodVolumes is a empty function
-func (fvb *FakeVolumeBinder) GetPodVolumes(task *api.TaskInfo, node *v1.Node) (*volumescheduling.PodVolumes, error) {
- return nil, nil
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "encoding/json"
- "fmt"
- "strconv"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-const (
- // DefaultQueue constant stores the name of the queue as "default"
- DefaultQueue = "default"
- // DefaultMaxRetry is the default number of retries.
- DefaultMaxRetry = 3
-
- defaultSchedulerName = "volcano"
-
- defaultMaxRetry int32 = 3
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/jobs/mutate",
- Func: Jobs,
-
- MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
- Webhooks: []whv1beta1.MutatingWebhook{{
- Name: "mutatejob.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{"batch.volcano.sh"},
- APIVersions: []string{"v1alpha1"},
- Resources: []string{"jobs"},
- },
- },
- },
- }},
- },
-}
-
-type patchOperation struct {
- Op string `json:"op"`
- Path string `json:"path"`
- Value interface{} `json:"value,omitempty"`
-}
-
-// Jobs mutate jobs.
-func Jobs(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("mutating jobs")
-
- job, err := schema.DecodeJob(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- var patchBytes []byte
- switch ar.Request.Operation {
- case v1beta1.Create:
- patchBytes, _ = createPatch(job)
- default:
- err = fmt.Errorf("expect operation to be 'CREATE' ")
- return util.ToAdmissionResponse(err)
- }
-
- klog.V(3).Infof("AdmissionResponse: patch=%v", string(patchBytes))
- reviewResponse := v1beta1.AdmissionResponse{
- Allowed: true,
- Patch: patchBytes,
- }
- pt := v1beta1.PatchTypeJSONPatch
- reviewResponse.PatchType = &pt
-
- return &reviewResponse
-}
-
-func createPatch(job *v1alpha1.Job) ([]byte, error) {
- var patch []patchOperation
- pathQueue := patchDefaultQueue(job)
- if pathQueue != nil {
- patch = append(patch, *pathQueue)
- }
- pathScheduler := patchDefaultScheduler(job)
- if pathScheduler != nil {
- patch = append(patch, *pathScheduler)
- }
- pathMaxRetry := patchDefaultMaxRetry(job)
- if pathMaxRetry != nil {
- patch = append(patch, *pathMaxRetry)
- }
- pathSpec := mutateSpec(job.Spec.Tasks, "/spec/tasks")
- if pathSpec != nil {
- patch = append(patch, *pathSpec)
- }
- pathMinAvailable := patchDefaultMinAvailable(job)
- if pathMinAvailable != nil {
- patch = append(patch, *pathMinAvailable)
- }
- // Add default plugins for some distributed-framework plugin cases
- patchPlugins := patchDefaultPlugins(job)
- if patchPlugins != nil {
- patch = append(patch, *patchPlugins)
- }
- return json.Marshal(patch)
-}
-
-func patchDefaultQueue(job *v1alpha1.Job) *patchOperation {
- //Add default queue if not specified.
- if job.Spec.Queue == "" {
- return &patchOperation{Op: "add", Path: "/spec/queue", Value: DefaultQueue}
- }
- return nil
-}
-
-func patchDefaultScheduler(job *v1alpha1.Job) *patchOperation {
- // Add default scheduler name if not specified.
- if job.Spec.SchedulerName == "" {
- return &patchOperation{Op: "add", Path: "/spec/schedulerName", Value: defaultSchedulerName}
- }
- return nil
-}
-
-func patchDefaultMaxRetry(job *v1alpha1.Job) *patchOperation {
- // Add default maxRetry if maxRetry is zero.
- if job.Spec.MaxRetry == 0 {
- return &patchOperation{Op: "add", Path: "/spec/maxRetry", Value: DefaultMaxRetry}
- }
- return nil
-}
-
-func patchDefaultMinAvailable(job *v1alpha1.Job) *patchOperation {
- // Add default minAvailable if minAvailable is zero.
- if job.Spec.MinAvailable == 0 {
- var jobMinAvailable int32
- for _, task := range job.Spec.Tasks {
- if task.MinAvailable != nil {
- jobMinAvailable += *task.MinAvailable
- } else {
- jobMinAvailable += task.Replicas
- }
- }
-
- return &patchOperation{Op: "add", Path: "/spec/minAvailable", Value: jobMinAvailable}
- }
- return nil
-}
-
-func mutateSpec(tasks []v1alpha1.TaskSpec, basePath string) *patchOperation {
- patched := false
- for index := range tasks {
- // add default task name
- taskName := tasks[index].Name
- if len(taskName) == 0 {
- patched = true
- tasks[index].Name = v1alpha1.DefaultTaskSpec + strconv.Itoa(index)
- }
-
- if tasks[index].Template.Spec.HostNetwork && tasks[index].Template.Spec.DNSPolicy == "" {
- patched = true
- tasks[index].Template.Spec.DNSPolicy = v1.DNSClusterFirstWithHostNet
- }
-
- if tasks[index].MinAvailable == nil {
- patched = true
- minAvailable := tasks[index].Replicas
- tasks[index].MinAvailable = &minAvailable
- }
-
- if tasks[index].MaxRetry == 0 {
- patched = true
- tasks[index].MaxRetry = defaultMaxRetry
- }
- }
- if !patched {
- return nil
- }
- return &patchOperation{
- Op: "replace",
- Path: basePath,
- Value: tasks,
- }
-}
-
-func patchDefaultPlugins(job *v1alpha1.Job) *patchOperation {
- if job.Spec.Plugins == nil {
- return nil
- }
- plugins := map[string][]string{}
- for k, v := range job.Spec.Plugins {
- plugins[k] = v
- }
-
- // Because the tensorflow-plugin depends on svc-plugin.
- // If the svc-plugin is not defined, we should add it.
- if _, ok := job.Spec.Plugins["tensorflow"]; ok {
- if _, ok := plugins["svc"]; !ok {
- plugins["svc"] = []string{}
- }
- }
-
- return &patchOperation{
- Op: "replace",
- Path: "/spec/plugins",
- Value: plugins,
- }
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "context"
- "fmt"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- apieequality "k8s.io/apimachinery/pkg/api/equality"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/validation"
- "k8s.io/apimachinery/pkg/util/validation/field"
- "k8s.io/klog"
- k8score "k8s.io/kubernetes/pkg/apis/core"
- k8scorev1 "k8s.io/kubernetes/pkg/apis/core/v1"
- v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
- k8scorevalid "k8s.io/kubernetes/pkg/apis/core/validation"
-
- "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- jobhelpers "volcano.sh/volcano/pkg/controllers/job/helpers"
- "volcano.sh/volcano/pkg/controllers/job/plugins"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/jobs/validate",
- Func: AdmitJobs,
-
- Config: config,
-
- ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
- Webhooks: []whv1beta1.ValidatingWebhook{{
- Name: "validatejob.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create, whv1beta1.Update},
- Rule: whv1beta1.Rule{
- APIGroups: []string{"batch.volcano.sh"},
- APIVersions: []string{"v1alpha1"},
- Resources: []string{"jobs"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// AdmitJobs is to admit jobs and return response.
-func AdmitJobs(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("admitting jobs -- %s", ar.Request.Operation)
-
- job, err := schema.DecodeJob(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
- var msg string
- reviewResponse := v1beta1.AdmissionResponse{}
- reviewResponse.Allowed = true
-
- switch ar.Request.Operation {
- case v1beta1.Create:
- msg = validateJobCreate(job, &reviewResponse)
- case v1beta1.Update:
- oldJob, err := schema.DecodeJob(ar.Request.OldObject, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
- err = validateJobUpdate(oldJob, job)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
- default:
- err := fmt.Errorf("expect operation to be 'CREATE' or 'UPDATE'")
- return util.ToAdmissionResponse(err)
- }
-
- if !reviewResponse.Allowed {
- reviewResponse.Result = &metav1.Status{Message: strings.TrimSpace(msg)}
- }
- return &reviewResponse
-}
-
-func validateJobCreate(job *v1alpha1.Job, reviewResponse *v1beta1.AdmissionResponse) string {
- var msg string
- taskNames := map[string]string{}
- var totalReplicas int32
-
- if job.Spec.MinAvailable < 0 {
- reviewResponse.Allowed = false
- return "job 'minAvailable' must be >= 0."
- }
-
- if job.Spec.MaxRetry < 0 {
- reviewResponse.Allowed = false
- return "'maxRetry' cannot be less than zero."
- }
-
- if job.Spec.TTLSecondsAfterFinished != nil && *job.Spec.TTLSecondsAfterFinished < 0 {
- reviewResponse.Allowed = false
- return "'ttlSecondsAfterFinished' cannot be less than zero."
- }
-
- if len(job.Spec.Tasks) == 0 {
- reviewResponse.Allowed = false
- return "No task specified in job spec"
- }
-
- hasDependenciesBetweenTasks := false
- for index, task := range job.Spec.Tasks {
- if task.DependsOn != nil {
- hasDependenciesBetweenTasks = true
- }
-
- if task.Replicas < 0 {
- msg += fmt.Sprintf(" 'replicas' < 0 in task: %s;", task.Name)
- }
-
- if task.MinAvailable != nil && *task.MinAvailable > task.Replicas {
- msg += fmt.Sprintf(" 'minAvailable' is greater than 'replicas' in task: %s, job: %s", task.Name, job.Name)
- }
-
- // count replicas
- totalReplicas += task.Replicas
-
- // validate task name
- if errMsgs := validation.IsDNS1123Label(task.Name); len(errMsgs) > 0 {
- msg += fmt.Sprintf(" %v;", errMsgs)
- }
-
- // duplicate task name
- if _, found := taskNames[task.Name]; found {
- msg += fmt.Sprintf(" duplicated task name %s;", task.Name)
- break
- } else {
- taskNames[task.Name] = task.Name
- }
-
- if err := validatePolicies(task.Policies, field.NewPath("spec.tasks.policies")); err != nil {
- msg += err.Error() + fmt.Sprintf(" valid events are %v, valid actions are %v",
- getValidEvents(), getValidActions())
- }
- podName := jobhelpers.MakePodName(job.Name, task.Name, index)
- msg += validateK8sPodNameLength(podName)
- msg += validateTaskTemplate(task, job, index)
- }
-
- msg += validateJobName(job)
-
- if totalReplicas < job.Spec.MinAvailable {
- msg += "job 'minAvailable' should not be greater than total replicas in tasks;"
- }
-
- if err := validatePolicies(job.Spec.Policies, field.NewPath("spec.policies")); err != nil {
- msg = msg + err.Error() + fmt.Sprintf(" valid events are %v, valid actions are %v;",
- getValidEvents(), getValidActions())
- }
-
- // invalid job plugins
- if len(job.Spec.Plugins) != 0 {
- for name := range job.Spec.Plugins {
- if _, found := plugins.GetPluginBuilder(name); !found {
- msg += fmt.Sprintf(" unable to find job plugin: %s", name)
- }
- }
- }
-
- if err := validateIO(job.Spec.Volumes); err != nil {
- msg += err.Error()
- }
-
- queue, err := config.VolcanoClient.SchedulingV1beta1().Queues().Get(context.TODO(), job.Spec.Queue, metav1.GetOptions{})
- if err != nil {
- msg += fmt.Sprintf(" unable to find job queue: %v", err)
- } else if queue.Status.State != schedulingv1beta1.QueueStateOpen {
- msg += fmt.Sprintf("can only submit job to queue with state `Open`, "+
- "queue `%s` status is `%s`", queue.Name, queue.Status.State)
- }
-
- if hasDependenciesBetweenTasks {
- _, isDag := topoSort(job)
- if !isDag {
- msg += fmt.Sprintf("job has dependencies between tasks, but doesn't form a directed acyclic graph(DAG)")
- }
- }
-
- if msg != "" {
- reviewResponse.Allowed = false
- }
-
- return msg
-}
-
-func validateJobUpdate(old, new *v1alpha1.Job) error {
- var totalReplicas int32
- for _, task := range new.Spec.Tasks {
- if task.Replicas < 0 {
- return fmt.Errorf("'replicas' must be >= 0 in task: %s", task.Name)
- }
-
- if task.MinAvailable != nil && *task.MinAvailable > task.Replicas {
- return fmt.Errorf("'minAvailable' must be <= 'replicas' in task: %s;", task.Name)
- }
- // count replicas
- totalReplicas += task.Replicas
- }
- if new.Spec.MinAvailable > totalReplicas {
- return fmt.Errorf("job 'minAvailable' must not be greater than total replicas")
- }
- if new.Spec.MinAvailable < 0 {
- return fmt.Errorf("job 'minAvailable' must be >= 0")
- }
-
- if len(old.Spec.Tasks) != len(new.Spec.Tasks) {
- return fmt.Errorf("job updates may not add or remove tasks")
- }
- // other fields under spec are not allowed to mutate
- new.Spec.MinAvailable = old.Spec.MinAvailable
- new.Spec.PriorityClassName = old.Spec.PriorityClassName
- for i := range new.Spec.Tasks {
- new.Spec.Tasks[i].Replicas = old.Spec.Tasks[i].Replicas
- new.Spec.Tasks[i].MinAvailable = old.Spec.Tasks[i].MinAvailable
- }
-
- // job controller will update the pvc name if not provided
- for i := range new.Spec.Volumes {
- if new.Spec.Volumes[i].VolumeClaim != nil {
- new.Spec.Volumes[i].VolumeClaimName = ""
- }
- }
- for i := range old.Spec.Volumes {
- if old.Spec.Volumes[i].VolumeClaim != nil {
- old.Spec.Volumes[i].VolumeClaimName = ""
- }
- }
-
- if !apieequality.Semantic.DeepEqual(new.Spec, old.Spec) {
- return fmt.Errorf("job updates may not change fields other than `minAvailable`, `tasks[*].replicas under spec`")
- }
-
- return nil
-}
-
-func validateTaskTemplate(task v1alpha1.TaskSpec, job *v1alpha1.Job, index int) string {
- var v1PodTemplate v1.PodTemplate
- v1PodTemplate.Template = *task.Template.DeepCopy()
- k8scorev1.SetObjectDefaults_PodTemplate(&v1PodTemplate)
-
- var coreTemplateSpec k8score.PodTemplateSpec
- k8scorev1.Convert_v1_PodTemplateSpec_To_core_PodTemplateSpec(&v1PodTemplate.Template, &coreTemplateSpec, nil)
-
- // Skip verify container SecurityContex.Privileged as it depends on
- // the kube-apiserver `allow-privileged` flag.
- for i, container := range coreTemplateSpec.Spec.Containers {
- if container.SecurityContext != nil && container.SecurityContext.Privileged != nil {
- coreTemplateSpec.Spec.Containers[i].SecurityContext.Privileged = nil
- }
- }
-
- corePodTemplate := k8score.PodTemplate{
- ObjectMeta: metav1.ObjectMeta{
- Name: task.Name,
- Namespace: job.Namespace,
- },
- Template: coreTemplateSpec,
- }
-
- if allErrs := k8scorevalid.ValidatePodTemplate(&corePodTemplate); len(allErrs) > 0 {
- msg := fmt.Sprintf("spec.task[%d].", index)
- for index := range allErrs {
- msg += allErrs[index].Error() + ". "
- }
- return msg
- }
-
- msg := validateTaskTopoPolicy(task, index)
- if msg != "" {
- return msg
- }
-
- return ""
-}
-
-func validateK8sPodNameLength(podName string) string {
- if errMsgs := validation.IsQualifiedName(podName); len(errMsgs) > 0 {
- return fmt.Sprintf("create pod with name %s validate failed %v;", podName, errMsgs)
- }
- return ""
-}
-
-func validateJobName(job *v1alpha1.Job) string {
- if errMsgs := validation.IsQualifiedName(job.Name); len(errMsgs) > 0 {
- return fmt.Sprintf("create job with name %s validate failed %v", job.Name, errMsgs)
- }
- return ""
-}
-
-func validateTaskTopoPolicy(task v1alpha1.TaskSpec, index int) string {
- if task.TopologyPolicy == "" || task.TopologyPolicy == v1alpha1.None {
- return ""
- }
-
- template := task.Template.DeepCopy()
-
- for id, container := range template.Spec.Containers {
- if len(container.Resources.Requests) == 0 {
- template.Spec.Containers[id].Resources.Requests = container.Resources.Limits.DeepCopy()
- }
- }
-
- for id, container := range template.Spec.InitContainers {
- if len(container.Resources.Requests) == 0 {
- template.Spec.InitContainers[id].Resources.Requests = container.Resources.Limits.DeepCopy()
- }
- }
-
- pod := &v1.Pod{
- Spec: template.Spec,
- }
-
- if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
- return fmt.Sprintf("spec.task[%d] isn't Guaranteed pod, kind=%v", index, v1qos.GetPodQOS(pod))
- }
-
- for id, container := range append(template.Spec.Containers, template.Spec.InitContainers...) {
- requestNum := guaranteedCPUs(container)
- if requestNum == 0 {
- return fmt.Sprintf("the cpu request isn't an integer in spec.task[%d] container[%d].",
- index, id)
- }
- }
-
- return ""
-}
-
-func guaranteedCPUs(container v1.Container) int {
- cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
- if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
- return 0
- }
-
- return int(cpuQuantity.Value())
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "fmt"
-
- "github.com/hashicorp/go-multierror"
- "k8s.io/apimachinery/pkg/util/validation/field"
- "k8s.io/kubernetes/pkg/apis/core/validation"
-
- batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
- busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
-)
-
-// policyEventMap defines all policy events and whether to allow external use.
-var policyEventMap = map[busv1alpha1.Event]bool{
- busv1alpha1.AnyEvent: true,
- busv1alpha1.PodFailedEvent: true,
- busv1alpha1.PodEvictedEvent: true,
- busv1alpha1.JobUnknownEvent: true,
- busv1alpha1.TaskCompletedEvent: true,
- busv1alpha1.TaskFailedEvent: true,
- busv1alpha1.OutOfSyncEvent: false,
- busv1alpha1.CommandIssuedEvent: false,
- busv1alpha1.JobUpdatedEvent: true,
-}
-
-// policyActionMap defines all policy actions and whether to allow external use.
-var policyActionMap = map[busv1alpha1.Action]bool{
- busv1alpha1.AbortJobAction: true,
- busv1alpha1.RestartJobAction: true,
- busv1alpha1.RestartTaskAction: true,
- busv1alpha1.TerminateJobAction: true,
- busv1alpha1.CompleteJobAction: true,
- busv1alpha1.ResumeJobAction: true,
- busv1alpha1.SyncJobAction: false,
- busv1alpha1.EnqueueAction: false,
- busv1alpha1.SyncQueueAction: false,
- busv1alpha1.OpenQueueAction: false,
- busv1alpha1.CloseQueueAction: false,
-}
-
-func validatePolicies(policies []batchv1alpha1.LifecyclePolicy, fldPath *field.Path) error {
- var err error
- policyEvents := map[busv1alpha1.Event]struct{}{}
- exitCodes := map[int32]struct{}{}
-
- for _, policy := range policies {
- if (policy.Event != "" || len(policy.Events) != 0) && policy.ExitCode != nil {
- err = multierror.Append(err, fmt.Errorf("must not specify event and exitCode simultaneously"))
- break
- }
-
- if policy.Event == "" && len(policy.Events) == 0 && policy.ExitCode == nil {
- err = multierror.Append(err, fmt.Errorf("either event and exitCode should be specified"))
- break
- }
-
- if len(policy.Event) != 0 || len(policy.Events) != 0 {
- bFlag := false
- policyEventsList := getEventList(policy)
- for _, event := range policyEventsList {
- if allow, ok := policyEventMap[event]; !ok || !allow {
- err = multierror.Append(err, field.Invalid(fldPath, event, "invalid policy event"))
- bFlag = true
- break
- }
-
- if allow, ok := policyActionMap[policy.Action]; !ok || !allow {
- err = multierror.Append(err, field.Invalid(fldPath, policy.Action, "invalid policy action"))
- bFlag = true
- break
- }
- if _, found := policyEvents[event]; found {
- err = multierror.Append(err, fmt.Errorf("duplicate event %v across different policy", event))
- bFlag = true
- break
- } else {
- policyEvents[event] = struct{}{}
- }
- }
- if bFlag {
- break
- }
- } else {
- if *policy.ExitCode == 0 {
- err = multierror.Append(err, fmt.Errorf("0 is not a valid error code"))
- break
- }
- if _, found := exitCodes[*policy.ExitCode]; found {
- err = multierror.Append(err, fmt.Errorf("duplicate exitCode %v", *policy.ExitCode))
- break
- } else {
- exitCodes[*policy.ExitCode] = struct{}{}
- }
- }
- }
-
- if _, found := policyEvents[busv1alpha1.AnyEvent]; found && len(policyEvents) > 1 {
- err = multierror.Append(err, fmt.Errorf("if there's * here, no other policy should be here"))
- }
-
- return err
-}
-
-func getEventList(policy batchv1alpha1.LifecyclePolicy) []busv1alpha1.Event {
- policyEventsList := policy.Events
- if len(policy.Event) > 0 {
- policyEventsList = append(policyEventsList, policy.Event)
- }
- uniquePolicyEventlist := removeDuplicates(policyEventsList)
- return uniquePolicyEventlist
-}
-
-func removeDuplicates(eventList []busv1alpha1.Event) []busv1alpha1.Event {
- keys := make(map[busv1alpha1.Event]bool)
- list := []busv1alpha1.Event{}
- for _, val := range eventList {
- if _, value := keys[val]; !value {
- keys[val] = true
- list = append(list, val)
- }
- }
- return list
-}
-
-func getValidEvents() []busv1alpha1.Event {
- var events []busv1alpha1.Event
- for e, allow := range policyEventMap {
- if allow {
- events = append(events, e)
- }
- }
-
- return events
-}
-
-func getValidActions() []busv1alpha1.Action {
- var actions []busv1alpha1.Action
- for a, allow := range policyActionMap {
- if allow {
- actions = append(actions, a)
- }
- }
-
- return actions
-}
-
-// validateIO validates IO configuration.
-func validateIO(volumes []batchv1alpha1.VolumeSpec) error {
- volumeMap := map[string]bool{}
- for _, volume := range volumes {
- if len(volume.MountPath) == 0 {
- return fmt.Errorf(" mountPath is required;")
- }
- if _, found := volumeMap[volume.MountPath]; found {
- return fmt.Errorf(" duplicated mountPath: %s;", volume.MountPath)
- }
- if volume.VolumeClaim == nil && volume.VolumeClaimName == "" {
- return fmt.Errorf(" either VolumeClaim or VolumeClaimName must be specified;")
- }
- if len(volume.VolumeClaimName) != 0 {
- if volume.VolumeClaim != nil {
- return fmt.Errorf("conflict: If you want to use an existing PVC, just specify VolumeClaimName." +
- "If you want to create a new PVC, you do not need to specify VolumeClaimName")
- }
- if errMsgs := validation.ValidatePersistentVolumeName(volume.VolumeClaimName, false); len(errMsgs) > 0 {
- return fmt.Errorf("invalid VolumeClaimName %s : %v", volume.VolumeClaimName, errMsgs)
- }
- }
-
- volumeMap[volume.MountPath] = true
- }
- return nil
-}
-
-// topoSort uses topo sort to sort job tasks based on dependsOn field
-// it will return an array contains all sorted task names and a bool which indicates whether it's a valid dag
-func topoSort(job *batchv1alpha1.Job) ([]string, bool) {
- graph, inDegree, taskList := makeGraph(job)
- var taskStack []string
- for task, degree := range inDegree {
- if degree == 0 {
- taskStack = append(taskStack, task)
- }
- }
-
- sortedTasks := make([]string, 0)
- for len(taskStack) > 0 {
- length := len(taskStack)
- var out string
- out, taskStack = taskStack[length-1], taskStack[:length-1]
- sortedTasks = append(sortedTasks, out)
- for in, connected := range graph[out] {
- if connected {
- graph[out][in] = false
- inDegree[in]--
- if inDegree[in] == 0 {
- taskStack = append(taskStack, in)
- }
- }
- }
- }
-
- isDag := len(sortedTasks) == len(taskList)
- if !isDag {
- return nil, false
- }
-
- return sortedTasks, isDag
-}
-
-func makeGraph(job *batchv1alpha1.Job) (map[string]map[string]bool, map[string]int, []string) {
- graph := make(map[string]map[string]bool)
- inDegree := make(map[string]int)
- taskList := make([]string, 0)
-
- for _, task := range job.Spec.Tasks {
- taskList = append(taskList, task.Name)
- inDegree[task.Name] = 0
- if task.DependsOn != nil {
- for _, dependOnTask := range task.DependsOn.Name {
- if graph[dependOnTask] == nil {
- graph[dependOnTask] = make(map[string]bool)
- }
-
- graph[dependOnTask][task.Name] = true
- inDegree[task.Name]++
- }
- }
- }
-
- return graph, inDegree, taskList
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "github.com/imdario/mergo"
- "gopkg.in/yaml.v2"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
-)
-
-type annotationResGroup struct{}
-
-const (
- // defaultAnnotationKey: default annotation key
- defaultAnnotationKey = "volcano.sh/resource-group"
-)
-
-// NewAnnotationResGroup create a new structure
-func NewAnnotationResGroup() ResGroup {
- return &annotationResGroup{}
-}
-
-// getAnnotation get annotations from the resource group
-func getAnnotation(resGroupConfig wkconfig.ResGroupConfig) map[string]string {
- annotations := make(map[string]string)
- for _, val := range resGroupConfig.Object.Value {
- tmp := make(map[string]string)
- err := yaml.Unmarshal([]byte(val), &tmp)
- if err != nil {
- continue
- }
-
- if err := mergo.Merge(&annotations, &tmp); err != nil {
- klog.Errorf("annotations merge failed, err=%v", err)
- continue
- }
- }
-
- return annotations
-}
-
-// IsBelongResGroup adjust whether pod is belong to the resource group
-func (resGroup *annotationResGroup) IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool {
- if resGroupConfig.Object.Key != "" && resGroupConfig.Object.Key != "annotation" {
- return false
- }
-
- annotations := getAnnotation(resGroupConfig)
- klog.V(3).Infof("annotations : %v", annotations)
- for key, annotation := range annotations {
- if pod.Annotations[key] == annotation {
- return true
- }
- }
-
- if resGroupConfig.Object.Key == "" && pod.Annotations[defaultAnnotationKey] == resGroupConfig.ResourceGroup {
- return true
- }
-
- return false
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- v1 "k8s.io/api/core/v1"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
-)
-
-// ResGroup interface for resource group
-type ResGroup interface {
- IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool
-}
-
-// GetResGroup return the interface besed on resourceGroup.Object.Key
-func GetResGroup(resourceGroup wkconfig.ResGroupConfig) ResGroup {
- switch resourceGroup.Object.Key {
- case "namespace":
- return NewNamespaceResGroup()
- case "annotation":
- return NewAnnotationResGroup()
- }
- return NewAnnotationResGroup()
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "encoding/json"
- "fmt"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- "k8s.io/klog"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-// patchOperation define the patch operation structure
-type patchOperation struct {
- Op string `json:"op"`
- Path string `json:"path"`
- Value interface{} `json:"value,omitempty"`
-}
-
-// init register mutate pod
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/pods/mutate",
- Func: Pods,
- Config: config,
- MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
- Webhooks: []whv1beta1.MutatingWebhook{{
- Name: "mutatepod.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{""},
- APIVersions: []string{"v1"},
- Resources: []string{"pods"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// Pods mutate pods.
-func Pods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("mutating pods -- %s", ar.Request.Operation)
- pod, err := schema.DecodePod(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- if pod.Namespace == "" {
- pod.Namespace = ar.Request.Namespace
- }
-
- var patchBytes []byte
- switch ar.Request.Operation {
- case v1beta1.Create:
- patchBytes, _ = createPatch(pod)
- default:
- err = fmt.Errorf("expect operation to be 'CREATE' ")
- return util.ToAdmissionResponse(err)
- }
-
- reviewResponse := v1beta1.AdmissionResponse{
- Allowed: true,
- Patch: patchBytes,
- }
- pt := v1beta1.PatchTypeJSONPatch
- reviewResponse.PatchType = &pt
-
- return &reviewResponse
-}
-
-// createPatch patch pod
-func createPatch(pod *v1.Pod) ([]byte, error) {
- if config.ConfigData == nil {
- klog.V(5).Infof("admission configuration is empty.")
- return nil, nil
- }
-
- var patch []patchOperation
- config.ConfigData.Lock()
- defer config.ConfigData.Unlock()
-
- for _, resourceGroup := range config.ConfigData.ResGroupsConfig {
- klog.V(3).Infof("resourceGroup %s", resourceGroup.ResourceGroup)
- group := GetResGroup(resourceGroup)
- if !group.IsBelongResGroup(pod, resourceGroup) {
- continue
- }
-
- patchLabel := patchLabels(pod, resourceGroup)
- if patchLabel != nil {
- patch = append(patch, *patchLabel)
- }
-
- patchToleration := patchTaintToleration(pod, resourceGroup)
- if patchToleration != nil {
- patch = append(patch, *patchToleration)
- }
- patchScheduler := patchSchedulerName(resourceGroup)
- if patchScheduler != nil {
- patch = append(patch, *patchScheduler)
- }
-
- klog.V(5).Infof("pod patch %v", patch)
- return json.Marshal(patch)
- }
-
- return json.Marshal(patch)
-}
-
-// patchLabels patch label
-func patchLabels(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
- if len(resGroupConfig.Labels) == 0 {
- return nil
- }
-
- nodeSelector := make(map[string]string)
- for key, label := range pod.Spec.NodeSelector {
- nodeSelector[key] = label
- }
-
- for key, label := range resGroupConfig.Labels {
- nodeSelector[key] = label
- }
-
- return &patchOperation{Op: "add", Path: "/spec/nodeSelector", Value: nodeSelector}
-}
-
-// patchTaintToleration patch taint toleration
-func patchTaintToleration(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
- if len(resGroupConfig.Tolerations) == 0 {
- return nil
- }
-
- var dst []v1.Toleration
- dst = append(dst, pod.Spec.Tolerations...)
- dst = append(dst, resGroupConfig.Tolerations...)
-
- return &patchOperation{Op: "add", Path: "/spec/tolerations", Value: dst}
-}
-
-// patchSchedulerName patch scheduler
-func patchSchedulerName(resGroupConfig wkconfig.ResGroupConfig) *patchOperation {
- if resGroupConfig.SchedulerName == "" {
- return nil
- }
-
- return &patchOperation{Op: "add", Path: "/spec/schedulerName", Value: resGroupConfig.SchedulerName}
-}
-
-
-
-/*
-Copyright 2021 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- v1 "k8s.io/api/core/v1"
-
- wkconfig "volcano.sh/volcano/pkg/webhooks/config"
-)
-
-type namespaceResGroup struct{}
-
-// NewNamespaceResGroup create a new structure
-func NewNamespaceResGroup() ResGroup {
- return &namespaceResGroup{}
-}
-
-// IsBelongResGroup adjust whether pod is belong to the resource group
-func (resGroup *namespaceResGroup) IsBelongResGroup(pod *v1.Pod, resGroupConfig wkconfig.ResGroupConfig) bool {
- if resGroupConfig.Object.Key != "namespace" {
- return false
- }
-
- for _, val := range resGroupConfig.Object.Value {
- if pod.Namespace == val {
- return true
- }
- }
-
- return false
-}
-
-
-
-/*
-Copyright 2019 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "context"
- "fmt"
- "strconv"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- v1 "k8s.io/api/core/v1"
- apierrors "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/intstr"
- "k8s.io/klog"
-
- "volcano.sh/apis/pkg/apis/helpers"
- vcv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/pods/validate",
- Func: AdmitPods,
-
- Config: config,
-
- ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
- Webhooks: []whv1beta1.ValidatingWebhook{{
- Name: "validatepod.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{""},
- APIVersions: []string{"v1"},
- Resources: []string{"pods"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// AdmitPods is to admit pods and return response.
-func AdmitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("admitting pods -- %s", ar.Request.Operation)
-
- pod, err := schema.DecodePod(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- var msg string
- reviewResponse := v1beta1.AdmissionResponse{}
- reviewResponse.Allowed = true
-
- switch ar.Request.Operation {
- case v1beta1.Create:
- msg = validatePod(pod, &reviewResponse)
- default:
- err := fmt.Errorf("expect operation to be 'CREATE'")
- return util.ToAdmissionResponse(err)
- }
-
- if !reviewResponse.Allowed {
- reviewResponse.Result = &metav1.Status{Message: strings.TrimSpace(msg)}
- }
- return &reviewResponse
-}
-
-/*
-allow pods to create when
-1. schedulerName of pod isn't volcano
-2. pod has Podgroup whose phase isn't Pending
-3. normal pods whose schedulerName is volcano don't have podgroup.
-4. check pod budget annotations configure
-*/
-func validatePod(pod *v1.Pod, reviewResponse *v1beta1.AdmissionResponse) string {
- if pod.Spec.SchedulerName != config.SchedulerName {
- return ""
- }
-
- pgName := ""
- msg := ""
-
- // vc-job, SN == volcano
- if pod.Annotations != nil {
- pgName = pod.Annotations[vcv1beta1.KubeGroupNameAnnotationKey]
- }
- if pgName != "" {
- if err := checkPGPhase(pod, pgName, true); err != nil {
- msg = err.Error()
- reviewResponse.Allowed = false
- }
- return msg
- }
-
- // normal pod, SN == volcano
- pgName = helpers.GeneratePodgroupName(pod)
- if err := checkPGPhase(pod, pgName, false); err != nil {
- msg = err.Error()
- reviewResponse.Allowed = false
- }
-
- // check pod annotatations
- if err := validateAnnotation(pod); err != nil {
- msg = err.Error()
- reviewResponse.Allowed = false
- }
-
- return msg
-}
-
-func checkPGPhase(pod *v1.Pod, pgName string, isVCJob bool) error {
- pg, err := config.VolcanoClient.SchedulingV1beta1().PodGroups(pod.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{})
- if err != nil {
- if isVCJob || (!isVCJob && !apierrors.IsNotFound(err)) {
- return fmt.Errorf("failed to get PodGroup for pod <%s/%s>: %v", pod.Namespace, pod.Name, err)
- }
- return nil
- }
- if pg.Status.Phase != vcv1beta1.PodGroupPending {
- return nil
- }
- return fmt.Errorf("failed to create pod <%s/%s> as the podgroup phase is Pending",
- pod.Namespace, pod.Name)
-}
-
-func validateAnnotation(pod *v1.Pod) error {
- num := 0
- if len(pod.Annotations) > 0 {
- keys := []string{
- vcv1beta1.JDBMinAvailable,
- vcv1beta1.JDBMaxUnavailable,
- }
- for _, key := range keys {
- if value, found := pod.Annotations[key]; found {
- num++
- if err := validateIntPercentageStr(key, value); err != nil {
- recordEvent(err)
- return err
- }
- }
- }
- if num > 1 {
- return fmt.Errorf("not allow configure multiple annotations <%v> at same time", keys)
- }
- }
- return nil
-}
-
-func recordEvent(err error) {
- config.Recorder.Eventf(nil, v1.EventTypeWarning, "Admit", "Create pod failed due to %v", err)
-}
-
-func validateIntPercentageStr(key, value string) error {
- tmp := intstr.Parse(value)
- switch tmp.Type {
- case intstr.Int:
- if tmp.IntValue() <= 0 {
- return fmt.Errorf("invalid value <%q> for %v, it must be a positive integer", value, key)
- }
- return nil
- case intstr.String:
- s := strings.Replace(tmp.StrVal, "%", "", -1)
- v, err := strconv.Atoi(s)
- if err != nil {
- return fmt.Errorf("invalid value %v for %v", err, key)
- }
- if v <= 0 || v >= 100 {
- return fmt.Errorf("invalid value <%q> for %v, it must be a valid percentage which between 1%% ~ 99%%", tmp.StrVal, key)
- }
- return nil
- }
- return fmt.Errorf("invalid type: neither int nor percentage for %v", key)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package mutate
-
-import (
- "encoding/json"
- "fmt"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/klog"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/queues/mutate",
- Func: Queues,
-
- MutatingConfig: &whv1beta1.MutatingWebhookConfiguration{
- Webhooks: []whv1beta1.MutatingWebhook{{
- Name: "mutatequeue.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create},
- Rule: whv1beta1.Rule{
- APIGroups: []string{schedulingv1beta1.SchemeGroupVersion.Group},
- APIVersions: []string{schedulingv1beta1.SchemeGroupVersion.Version},
- Resources: []string{"queues"},
- },
- },
- },
- }},
- },
-}
-
-type patchOperation struct {
- Op string `json:"op"`
- Path string `json:"path"`
- Value interface{} `json:"value,omitempty"`
-}
-
-// Queues mutate queues.
-func Queues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("Mutating %s queue %s.", ar.Request.Operation, ar.Request.Name)
-
- queue, err := schema.DecodeQueue(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- var patchBytes []byte
- switch ar.Request.Operation {
- case v1beta1.Create:
- patchBytes, err = createQueuePatch(queue)
- default:
- return util.ToAdmissionResponse(fmt.Errorf("invalid operation `%s`, "+
- "expect operation to be `CREATE`", ar.Request.Operation))
- }
-
- if err != nil {
- return &v1beta1.AdmissionResponse{
- Allowed: false,
- Result: &metav1.Status{Message: err.Error()},
- }
- }
-
- pt := v1beta1.PatchTypeJSONPatch
- return &v1beta1.AdmissionResponse{
- Allowed: true,
- Patch: patchBytes,
- PatchType: &pt,
- }
-}
-
-func createQueuePatch(queue *schedulingv1beta1.Queue) ([]byte, error) {
- var patch []patchOperation
-
- // add root node if the root node not specified
- hierarchy := queue.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
- hierarchicalWeights := queue.Annotations[schedulingv1beta1.KubeHierarchyWeightAnnotationKey]
-
- if hierarchy != "" && hierarchicalWeights != "" && !strings.HasPrefix(hierarchy, "root") {
- // based on https://tools.ietf.org/html/rfc6901#section-3
- // escape "/" with "~1"
- patch = append(patch, patchOperation{
- Op: "add",
- Path: fmt.Sprintf("/metadata/annotations/%s", strings.ReplaceAll(schedulingv1beta1.KubeHierarchyAnnotationKey, "/", "~1")),
- Value: fmt.Sprintf("root/%s", hierarchy),
- })
- patch = append(patch, patchOperation{
- Op: "add",
- Path: fmt.Sprintf("/metadata/annotations/%s", strings.ReplaceAll(schedulingv1beta1.KubeHierarchyWeightAnnotationKey, "/", "~1")),
- Value: fmt.Sprintf("1/%s", hierarchicalWeights),
- })
- }
-
- trueValue := true
- if queue.Spec.Reclaimable == nil {
- patch = append(patch, patchOperation{
- Op: "add",
- Path: "/spec/reclaimable",
- Value: &trueValue,
- })
- }
-
- defaultWeight := 1
- if queue.Spec.Weight == 0 {
- patch = append(patch, patchOperation{
- Op: "add",
- Path: "/spec/weight",
- Value: &defaultWeight,
- })
- }
-
- return json.Marshal(patch)
-}
-
-
-
-/*
-Copyright 2018 The Volcano Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package validate
-
-import (
- "context"
- "fmt"
- "strconv"
- "strings"
-
- "k8s.io/api/admission/v1beta1"
- whv1beta1 "k8s.io/api/admissionregistration/v1beta1"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/util/validation/field"
- "k8s.io/klog"
-
- schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
- "volcano.sh/volcano/pkg/webhooks/router"
- "volcano.sh/volcano/pkg/webhooks/schema"
- "volcano.sh/volcano/pkg/webhooks/util"
-)
-
-func init() {
- router.RegisterAdmission(service)
-}
-
-var service = &router.AdmissionService{
- Path: "/queues/validate",
- Func: AdmitQueues,
-
- Config: config,
-
- ValidatingConfig: &whv1beta1.ValidatingWebhookConfiguration{
- Webhooks: []whv1beta1.ValidatingWebhook{{
- Name: "validatequeue.volcano.sh",
- Rules: []whv1beta1.RuleWithOperations{
- {
- Operations: []whv1beta1.OperationType{whv1beta1.Create, whv1beta1.Update, whv1beta1.Delete},
- Rule: whv1beta1.Rule{
- APIGroups: []string{schedulingv1beta1.SchemeGroupVersion.Group},
- APIVersions: []string{schedulingv1beta1.SchemeGroupVersion.Version},
- Resources: []string{"queues"},
- },
- },
- },
- }},
- },
-}
-
-var config = &router.AdmissionServiceConfig{}
-
-// AdmitQueues is to admit queues and return response.
-func AdmitQueues(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse {
- klog.V(3).Infof("Admitting %s queue %s.", ar.Request.Operation, ar.Request.Name)
-
- queue, err := schema.DecodeQueue(ar.Request.Object, ar.Request.Resource)
- if err != nil {
- return util.ToAdmissionResponse(err)
- }
-
- switch ar.Request.Operation {
- case v1beta1.Create, v1beta1.Update:
- err = validateQueue(queue)
- case v1beta1.Delete:
- err = validateQueueDeleting(ar.Request.Name)
- default:
- return util.ToAdmissionResponse(fmt.Errorf("invalid operation `%s`, "+
- "expect operation to be `CREATE`, `UPDATE` or `DELETE`", ar.Request.Operation))
- }
-
- if err != nil {
- return &v1beta1.AdmissionResponse{
- Allowed: false,
- Result: &metav1.Status{Message: err.Error()},
- }
- }
-
- return &v1beta1.AdmissionResponse{
- Allowed: true,
- }
-}
-
-func validateQueue(queue *schedulingv1beta1.Queue) error {
- errs := field.ErrorList{}
- resourcePath := field.NewPath("requestBody")
-
- errs = append(errs, validateStateOfQueue(queue.Status.State, resourcePath.Child("spec").Child("state"))...)
- errs = append(errs, validateWeightOfQueue(queue.Spec.Weight, resourcePath.Child("spec").Child("weight"))...)
- errs = append(errs, validateHierarchicalAttributes(queue, resourcePath.Child("metadata").Child("annotations"))...)
-
- if len(errs) > 0 {
- return errs.ToAggregate()
- }
-
- return nil
-}
-func validateHierarchicalAttributes(queue *schedulingv1beta1.Queue, fldPath *field.Path) field.ErrorList {
- errs := field.ErrorList{}
- hierarchy := queue.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
- hierarchicalWeights := queue.Annotations[schedulingv1beta1.KubeHierarchyWeightAnnotationKey]
- if hierarchy != "" || hierarchicalWeights != "" {
- paths := strings.Split(hierarchy, "/")
- weights := strings.Split(hierarchicalWeights, "/")
- // path length must be the same with weights length
- if len(paths) != len(weights) {
- return append(errs, field.Invalid(fldPath, hierarchy,
- fmt.Sprintf("%s must have the same length with %s",
- schedulingv1beta1.KubeHierarchyAnnotationKey,
- schedulingv1beta1.KubeHierarchyWeightAnnotationKey,
- )))
- }
-
- // check weights format
- for _, weight := range weights {
- weightFloat, err := strconv.ParseFloat(weight, 64)
- if err != nil {
- return append(errs, field.Invalid(fldPath, hierarchicalWeights,
- fmt.Sprintf("%s in the %s is invalid number: %v",
- weight, hierarchicalWeights, err,
- )))
- }
- if weightFloat <= 0 {
- return append(errs, field.Invalid(fldPath, hierarchicalWeights,
- fmt.Sprintf("%s in the %s must be larger than 0",
- weight, hierarchicalWeights,
- )))
- }
- }
-
- // The node is not allowed to be in the sub path of a node.
- // For example, a queue with "root/sci" conflicts with a queue with "root/sci/dev"
- queueList, err := config.VolcanoClient.SchedulingV1beta1().Queues().List(context.TODO(), metav1.ListOptions{})
- if err != nil {
- return append(errs, field.Invalid(fldPath, hierarchy,
- fmt.Sprintf("checking %s, list queues failed: %v",
- schedulingv1beta1.KubeHierarchyAnnotationKey,
- err,
- )))
- }
- for _, queueInTree := range queueList.Items {
- hierarchyInTree := queueInTree.Annotations[schedulingv1beta1.KubeHierarchyAnnotationKey]
- if hierarchyInTree != "" && queue.Name != queueInTree.Name &&
- strings.HasPrefix(hierarchyInTree, hierarchy) {
- return append(errs, field.Invalid(fldPath, hierarchy,
- fmt.Sprintf("%s is not allowed to be in the sub path of %s of queue %s",
- hierarchy, hierarchyInTree, queueInTree.Name)))
- }
- }
- }
- return errs
-}
-
-func validateStateOfQueue(value schedulingv1beta1.QueueState, fldPath *field.Path) field.ErrorList {
- errs := field.ErrorList{}
-
- if len(value) == 0 {
- return errs
- }
-
- validQueueStates := []schedulingv1beta1.QueueState{
- schedulingv1beta1.QueueStateOpen,
- schedulingv1beta1.QueueStateClosed,
- }
-
- for _, validQueue := range validQueueStates {
- if value == validQueue {
- return errs
- }
- }
-
- return append(errs, field.Invalid(fldPath, value, fmt.Sprintf("queue state must be in %v", validQueueStates)))
-}
-
-func validateWeightOfQueue(value int32, fldPath *field.Path) field.ErrorList {
- errs := field.ErrorList{}
- if value > 0 {
- return errs
- }
- return append(errs, field.Invalid(fldPath, value, "queue weight must be a positive integer"))
-}
-
-func validateQueueDeleting(queue string) error {
- if queue == "default" {
- return fmt.Errorf("`%s` queue can not be deleted", "default")
- }
-
- q, err := config.VolcanoClient.SchedulingV1beta1().Queues().Get(context.TODO(), queue, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- if q.Status.State != schedulingv1beta1.QueueStateClosed {
- return fmt.Errorf("only queue with state `%s` can be deleted, queue `%s` state is `%s`",
- schedulingv1beta1.QueueStateClosed, q.Name, q.Status.State)
- }
-
- return nil
-}
-
-
-
-
-
-
diff --git a/pkg/cli/podgroup/podgroup.go b/pkg/cli/podgroup/podgroup.go
new file mode 100644
index 0000000000..ad9ea2c779
--- /dev/null
+++ b/pkg/cli/podgroup/podgroup.go
@@ -0,0 +1,26 @@
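+// Package podgroup provides helpers for aggregating PodGroup statistics that the CLI prints per queue.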
+package podgroup
+
+import "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
+
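+// PodGroupStatistics tallies PodGroups by phase for a single queue.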
+type PodGroupStatistics struct {
+ Inqueue int
+ Pending int
+ Running int
+ Unknown int
+ Completed int
+}
+
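+// StatPodGroupCountsForQueue increments the counter that matches the PodGroup's phase.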
+func (pgStats *PodGroupStatistics) StatPodGroupCountsForQueue(pg *v1beta1.PodGroup) {
+ switch pg.Status.Phase {
+ case v1beta1.PodGroupInqueue:
+ pgStats.Inqueue++
+ case v1beta1.PodGroupPending:
+ pgStats.Pending++
+ case v1beta1.PodGroupRunning:
+ pgStats.Running++
+ case v1beta1.PodGroupUnknown:
+ pgStats.Unknown++
+ case v1beta1.PodGroupCompleted:
+ pgStats.Completed++
+ }
+}
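The switch above simply buckets a PodGroup into one of the five phase counters. As a quick illustration of how the counters accumulate, a minimal test along the following lines could exercise the helper; the test file is hypothetical (not part of this patch), and the `v1beta1.PodGroupPhase` type name is assumed from the `volcano.sh/apis` package:

```go
package podgroup

import (
	"testing"

	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)

func TestStatPodGroupCountsForQueue(t *testing.T) {
	stats := &PodGroupStatistics{}

	// Feed two Running podgroups and one Completed podgroup through the helper.
	for _, phase := range []v1beta1.PodGroupPhase{
		v1beta1.PodGroupRunning,
		v1beta1.PodGroupRunning,
		v1beta1.PodGroupCompleted,
	} {
		pg := &v1beta1.PodGroup{}
		pg.Status.Phase = phase
		stats.StatPodGroupCountsForQueue(pg)
	}

	if stats.Running != 2 || stats.Completed != 1 || stats.Inqueue != 0 {
		t.Fatalf("unexpected counts: %+v", *stats)
	}
}
```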
diff --git a/pkg/cli/queue/get.go b/pkg/cli/queue/get.go
index 057bffda1e..3253ec8bd1 100644
--- a/pkg/cli/queue/get.go
+++ b/pkg/cli/queue/get.go
@@ -28,6 +28,7 @@ import (
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/client/clientset/versioned"
+ "volcano.sh/volcano/pkg/cli/podgroup"
)
type getFlags struct {
@@ -63,21 +64,37 @@ func GetQueue(ctx context.Context) error {
return err
}
- PrintQueue(queue, os.Stdout)
+	// Although the CustomResourceFieldSelectors feature gate is enabled by default since v1.31, there are still
+	// users running k8s versions lower than v1.31. Therefore we can only get all the podgroups from kube-apiserver
+	// and then filter them on the client side.
+ pgList, err := queueClient.SchedulingV1beta1().PodGroups("").List(ctx, metav1.ListOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to list podgroup for queue %s with err: %v", getQueueFlags.Name, err)
+ }
+
+ pgStats := &podgroup.PodGroupStatistics{}
+ for _, pg := range pgList.Items {
+ if pg.Spec.Queue == getQueueFlags.Name {
+ pgStats.StatPodGroupCountsForQueue(&pg)
+ }
+ }
+
+ PrintQueue(queue, pgStats, os.Stdout)
return nil
}
// PrintQueue prints queue information.
-func PrintQueue(queue *v1beta1.Queue, writer io.Writer) {
- _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
- Name, Weight, State, Inqueue, Pending, Running, Unknown)
+func PrintQueue(queue *v1beta1.Queue, pgStats *podgroup.PodGroupStatistics, writer io.Writer) {
+ _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s%-8s\n",
+ Name, Weight, State, Inqueue, Pending, Running, Unknown, Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
- _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
- queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
- queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
+
+ _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d%-8d\n",
+ queue.Name, queue.Spec.Weight, queue.Status.State, pgStats.Inqueue,
+ pgStats.Pending, pgStats.Running, pgStats.Unknown, pgStats.Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
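The comment in `GetQueue` above explains why the CLI lists every podgroup and filters on the client: field selectors on custom resources only work once the CustomResourceFieldSelectors feature gate is on (enabled by default since Kubernetes v1.31). On such clusters, and assuming the PodGroup CRD declares `spec.queue` under `selectableFields` (an assumption, not something this patch adds), the filtering could be pushed to kube-apiserver. A minimal sketch:

```go
package queue

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"volcano.sh/apis/pkg/client/clientset/versioned"
	"volcano.sh/volcano/pkg/cli/podgroup"
)

// statQueueServerSide is a hypothetical alternative to the client-side filtering
// in GetQueue. It requires Kubernetes >= v1.31 with CustomResourceFieldSelectors
// enabled and a PodGroup CRD that lists spec.queue as a selectable field.
func statQueueServerSide(ctx context.Context, c versioned.Interface, queueName string) (*podgroup.PodGroupStatistics, error) {
	pgList, err := c.SchedulingV1beta1().PodGroups("").List(ctx, metav1.ListOptions{
		FieldSelector: "spec.queue=" + queueName,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to list podgroups for queue %s: %v", queueName, err)
	}

	stats := &podgroup.PodGroupStatistics{}
	for i := range pgList.Items {
		stats.StatPodGroupCountsForQueue(&pgList.Items[i])
	}
	return stats, nil
}
```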
diff --git a/pkg/cli/queue/list.go b/pkg/cli/queue/list.go
index b6ceba2f0c..1814c9e7ed 100644
--- a/pkg/cli/queue/list.go
+++ b/pkg/cli/queue/list.go
@@ -28,6 +28,7 @@ import (
"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/apis/pkg/client/clientset/versioned"
+ "volcano.sh/volcano/pkg/cli/podgroup"
)
type listFlags struct {
@@ -53,6 +54,9 @@ const (
// Inqueue status of queue
Inqueue string = "Inqueue"
+ // Completed status of the queue
+ Completed string = "Completed"
+
// State is state of queue
State string = "State"
)
@@ -81,22 +85,41 @@ func ListQueue(ctx context.Context) error {
fmt.Printf("No resources found\n")
return nil
}
- PrintQueues(queues, os.Stdout)
+
+	// Although the CustomResourceFieldSelectors feature gate is enabled by default since v1.31, there are still
+	// users running k8s versions lower than v1.31. Therefore we can only get all the podgroups from kube-apiserver
+	// and then filter them on the client side.
+ pgList, err := jobClient.SchedulingV1beta1().PodGroups("").List(ctx, metav1.ListOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to list podgroups with err: %v", err)
+ }
+
+ queueStats := make(map[string]*podgroup.PodGroupStatistics, len(queues.Items))
+ for _, queue := range queues.Items {
+ queueStats[queue.Name] = &podgroup.PodGroupStatistics{}
+ }
+
+ for _, pg := range pgList.Items {
+ queueStats[pg.Spec.Queue].StatPodGroupCountsForQueue(&pg)
+ }
+
+ PrintQueues(queues, queueStats, os.Stdout)
return nil
}
// PrintQueues prints queue information.
-func PrintQueues(queues *v1beta1.QueueList, writer io.Writer) {
- _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s\n",
- Name, Weight, State, Inqueue, Pending, Running, Unknown)
+func PrintQueues(queues *v1beta1.QueueList, queueStats map[string]*podgroup.PodGroupStatistics, writer io.Writer) {
+ _, err := fmt.Fprintf(writer, "%-25s%-8s%-8s%-8s%-8s%-8s%-8s%-8s\n",
+ Name, Weight, State, Inqueue, Pending, Running, Unknown, Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
+
for _, queue := range queues.Items {
- _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d\n",
- queue.Name, queue.Spec.Weight, queue.Status.State, queue.Status.Inqueue,
- queue.Status.Pending, queue.Status.Running, queue.Status.Unknown)
+ _, err = fmt.Fprintf(writer, "%-25s%-8d%-8s%-8d%-8d%-8d%-8d%-8d\n",
+ queue.Name, queue.Spec.Weight, queue.Status.State, queueStats[queue.Name].Inqueue, queueStats[queue.Name].Pending,
+ queueStats[queue.Name].Running, queueStats[queue.Name].Unknown, queueStats[queue.Name].Completed)
if err != nil {
fmt.Printf("Failed to print queue command result: %s.\n", err)
}
diff --git a/pkg/controllers/metrics/queue.go b/pkg/controllers/metrics/queue.go
new file mode 100644
index 0000000000..eadd453c4d
--- /dev/null
+++ b/pkg/controllers/metrics/queue.go
@@ -0,0 +1,93 @@
+package metrics
+
+import (
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/client_golang/prometheus/promauto"
+
+ "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
+ "volcano.sh/volcano/pkg/scheduler/metrics"
+)
+
+var (
+ queuePodGroupInqueue = promauto.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Subsystem: metrics.VolcanoNamespace,
+ Name: "queue_pod_group_inqueue_count",
+ Help: "The number of Inqueue PodGroup in this queue",
+ }, []string{"queue_name"},
+ )
+
+ queuePodGroupPending = promauto.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Subsystem: metrics.VolcanoNamespace,
+ Name: "queue_pod_group_pending_count",
+ Help: "The number of Pending PodGroup in this queue",
+ }, []string{"queue_name"},
+ )
+
+ queuePodGroupRunning = promauto.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Subsystem: metrics.VolcanoNamespace,
+ Name: "queue_pod_group_running_count",
+ Help: "The number of Running PodGroup in this queue",
+ }, []string{"queue_name"},
+ )
+
+ queuePodGroupUnknown = promauto.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Subsystem: metrics.VolcanoNamespace,
+ Name: "queue_pod_group_unknown_count",
+ Help: "The number of Unknown PodGroup in this queue",
+ }, []string{"queue_name"},
+ )
+
+ queuePodGroupCompleted = promauto.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Subsystem: metrics.VolcanoNamespace,
+ Name: "queue_pod_group_completed_count",
+ Help: "The number of Completed PodGroup in this queue",
+ }, []string{"queue_name"},
+ )
+)
+
+// UpdateQueuePodGroupInqueueCount records the number of Inqueue PodGroup in this queue
+func UpdateQueuePodGroupInqueueCount(queueName string, count int32) {
+ queuePodGroupInqueue.WithLabelValues(queueName).Set(float64(count))
+}
+
+// UpdateQueuePodGroupPendingCount records the number of Pending PodGroup in this queue
+func UpdateQueuePodGroupPendingCount(queueName string, count int32) {
+ queuePodGroupPending.WithLabelValues(queueName).Set(float64(count))
+}
+
+// UpdateQueuePodGroupRunningCount records the number of Running PodGroup in this queue
+func UpdateQueuePodGroupRunningCount(queueName string, count int32) {
+ queuePodGroupRunning.WithLabelValues(queueName).Set(float64(count))
+}
+
+// UpdateQueuePodGroupUnknownCount records the number of Unknown PodGroup in this queue
+func UpdateQueuePodGroupUnknownCount(queueName string, count int32) {
+ queuePodGroupUnknown.WithLabelValues(queueName).Set(float64(count))
+}
+
+// UpdateQueuePodGroupCompletedCount records the number of Completed PodGroup in this queue
+func UpdateQueuePodGroupCompletedCount(queueName string, count int32) {
+ queuePodGroupCompleted.WithLabelValues(queueName).Set(float64(count))
+}
+
+// DeleteQueueMetrics deletes all metrics related to the queue
+func DeleteQueueMetrics(queueName string) {
+ queuePodGroupInqueue.DeleteLabelValues(queueName)
+ queuePodGroupPending.DeleteLabelValues(queueName)
+ queuePodGroupRunning.DeleteLabelValues(queueName)
+ queuePodGroupUnknown.DeleteLabelValues(queueName)
+ queuePodGroupCompleted.DeleteLabelValues(queueName)
+}
+
+func UpdateQueueMetrics(queueName string, queueStatus *v1beta1.QueueStatus) {
+ UpdateQueuePodGroupPendingCount(queueName, queueStatus.Pending)
+ UpdateQueuePodGroupRunningCount(queueName, queueStatus.Running)
+ UpdateQueuePodGroupUnknownCount(queueName, queueStatus.Unknown)
+ UpdateQueuePodGroupInqueueCount(queueName, queueStatus.Inqueue)
+ UpdateQueuePodGroupCompletedCount(queueName, queueStatus.Completed)
+}
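
A hypothetical unit-test sketch (not part of this patch) for the controller-side gauges above, using the Prometheus `testutil` helpers to confirm that a written value is readable for a queue label and that `DeleteQueueMetrics` drops the series again:

```go
package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/assert"
)

// TestQueuePodGroupGauges is an illustrative sketch: the queue name "q1" and
// count are arbitrary, and the test lives alongside the package it exercises.
func TestQueuePodGroupGauges(t *testing.T) {
	UpdateQueuePodGroupRunningCount("q1", 3)
	assert.Equal(t, float64(3), testutil.ToFloat64(queuePodGroupRunning.WithLabelValues("q1")))

	// Deleting the queue's metrics removes the labeled series entirely.
	DeleteQueueMetrics("q1")
	assert.Equal(t, 0, testutil.CollectAndCount(queuePodGroupRunning))
}
```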
diff --git a/pkg/controllers/queue/queue_controller_action.go b/pkg/controllers/queue/queue_controller_action.go
index 60e16b7d09..57a308c962 100644
--- a/pkg/controllers/queue/queue_controller_action.go
+++ b/pkg/controllers/queue/queue_controller_action.go
@@ -23,7 +23,6 @@ import (
"strings"
v1 "k8s.io/api/core/v1"
- "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
@@ -34,7 +33,9 @@ import (
"volcano.sh/apis/pkg/apis/bus/v1alpha1"
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
+ v1beta1apply "volcano.sh/apis/pkg/client/applyconfiguration/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
+ "volcano.sh/volcano/pkg/controllers/metrics"
"volcano.sh/volcano/pkg/controllers/queue/state"
)
@@ -83,9 +84,14 @@ func (c *queuecontroller) syncQueue(queue *schedulingv1beta1.Queue, updateStateF
queueStatus.Unknown++
case schedulingv1beta1.PodGroupInqueue:
queueStatus.Inqueue++
+ case schedulingv1beta1.PodGroupCompleted:
+ queueStatus.Completed++
}
}
+ // Update the metrics
+ metrics.UpdateQueueMetrics(queue.Name, &queueStatus)
+
if updateStateFn != nil {
updateStateFn(&queueStatus, podGroups)
} else {
@@ -101,13 +107,12 @@ func (c *queuecontroller) syncQueue(queue *schedulingv1beta1.Queue, updateStateF
}
newQueue := queue.DeepCopy()
- // ignore update when status does not change
- if !equality.Semantic.DeepEqual(queueStatus, queue.Status) {
- newQueue.Status = queueStatus
- var err error
- newQueue, err = c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{})
- if err != nil {
- klog.Errorf("Failed to update status of Queue %s: %v.", newQueue.Name, err)
+ // ignore update when state does not change
+ if queueStatus.State != queue.Status.State {
+ queueStatusApply := v1beta1apply.QueueStatus().WithState(queueStatus.State).WithAllocated(queueStatus.Allocated)
+ queueApply := v1beta1apply.Queue(queue.Name).WithStatus(queueStatusApply)
+ if newQueue, err = c.vcClient.SchedulingV1beta1().Queues().ApplyStatus(context.TODO(), queueApply, metav1.ApplyOptions{FieldManager: controllerName}); err != nil {
+ klog.Errorf("Update queue state from %s to %s failed for %v", queue.Status.State, queueStatus.State, err)
return err
}
}
@@ -126,37 +131,19 @@ func (c *queuecontroller) openQueue(queue *schedulingv1beta1.Queue, updateStateF
}
newQueue := queue.DeepCopy()
- newQueue.Status.State = schedulingv1beta1.QueueStateOpen
+ if updateStateFn != nil {
+ updateStateFn(&newQueue.Status, nil)
+ }
if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
+ queueStatusApply := v1beta1apply.QueueStatus().WithState(newQueue.Status.State).WithAllocated(newQueue.Status.Allocated)
+ queueApply := v1beta1apply.Queue(queue.Name).WithStatus(queueStatusApply)
+ if _, err := c.vcClient.SchedulingV1beta1().Queues().ApplyStatus(context.TODO(), queueApply, metav1.ApplyOptions{FieldManager: controllerName}); err != nil {
c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
- fmt.Sprintf("Open queue failed for %v", err))
- return err
- }
-
- c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.OpenQueueAction), "Open queue succeed")
-
- q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
- if err != nil {
+ fmt.Sprintf("Update queue status from %s to %s failed for %v",
+ queue.Status.State, newQueue.Status.State, err))
return err
}
-
- newQueue = q.DeepCopy()
- if updateStateFn != nil {
- updateStateFn(&newQueue.Status, nil)
- } else {
- return fmt.Errorf("internal error, update state function should be provided")
- }
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.OpenQueueAction),
- fmt.Sprintf("Update queue status from %s to %s failed for %v",
- queue.Status.State, newQueue.Status.State, err))
- return err
- }
- }
}
_, err := c.updateQueueAnnotation(queue, ClosedByParentAnnotationKey, ClosedByParentAnnotationFalseValue)
@@ -173,41 +160,21 @@ func (c *queuecontroller) closeQueue(queue *schedulingv1beta1.Queue, updateState
}
}
+ podGroups := c.getPodGroups(queue.Name)
newQueue := queue.DeepCopy()
- newQueue.Status.State = schedulingv1beta1.QueueStateClosed
-
- if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().Update(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
- c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
- fmt.Sprintf("Close queue failed for %v", err))
- return err
- }
-
- c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.CloseQueueAction), "Close queue succeed")
- } else {
- return nil
- }
-
- q, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), newQueue.Name, metav1.GetOptions{})
- if err != nil {
- return err
- }
-
- newQueue = q.DeepCopy()
- podGroups := c.getPodGroups(newQueue.Name)
if updateStateFn != nil {
updateStateFn(&newQueue.Status, podGroups)
- } else {
- return fmt.Errorf("internal error, update state function should be provided")
}
if queue.Status.State != newQueue.Status.State {
- if _, err := c.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}); err != nil {
+ queueStatusApply := v1beta1apply.QueueStatus().WithState(newQueue.Status.State).WithAllocated(newQueue.Status.Allocated)
+ queueApply := v1beta1apply.Queue(queue.Name).WithStatus(queueStatusApply)
+ if _, err := c.vcClient.SchedulingV1beta1().Queues().ApplyStatus(context.TODO(), queueApply, metav1.ApplyOptions{FieldManager: controllerName}); err != nil {
c.recorder.Event(newQueue, v1.EventTypeWarning, string(v1alpha1.CloseQueueAction),
- fmt.Sprintf("Update queue status from %s to %s failed for %v",
- queue.Status.State, newQueue.Status.State, err))
+ fmt.Sprintf("Close queue failed for %v", err))
return err
}
+ c.recorder.Event(newQueue, v1.EventTypeNormal, string(v1alpha1.CloseQueueAction), "Close queue succeed")
}
return nil
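
The controller moves from read-modify-write `UpdateStatus` calls to server-side apply via `ApplyStatus` with a fixed field manager, which drops the extra Get round-trip and lets the apiserver merge ownership per field. A hedged sketch of the same pattern in isolation; the helper name is invented, and `WithAllocated` is omitted for brevity even though the patch always sets it.

```go
package queue

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
	v1beta1apply "volcano.sh/apis/pkg/client/applyconfiguration/scheduling/v1beta1"
	"volcano.sh/apis/pkg/client/clientset/versioned"
)

// applyQueueState patches only the status fields this writer sets, leaving
// everything else to other field managers; the field manager must stay stable.
func applyQueueState(ctx context.Context, client versioned.Interface, name string, state schedulingv1beta1.QueueState) error {
	status := v1beta1apply.QueueStatus().WithState(state)
	queueApply := v1beta1apply.Queue(name).WithStatus(status)
	_, err := client.SchedulingV1beta1().Queues().ApplyStatus(ctx, queueApply, metav1.ApplyOptions{FieldManager: "queue-controller"})
	return err
}
```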
diff --git a/pkg/controllers/queue/queue_controller_handler.go b/pkg/controllers/queue/queue_controller_handler.go
index e5dc6aa979..1abae7c0e0 100644
--- a/pkg/controllers/queue/queue_controller_handler.go
+++ b/pkg/controllers/queue/queue_controller_handler.go
@@ -23,6 +23,7 @@ import (
busv1alpha1 "volcano.sh/apis/pkg/apis/bus/v1alpha1"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"volcano.sh/volcano/pkg/controllers/apis"
+ "volcano.sh/volcano/pkg/controllers/metrics"
)
func (c *queuecontroller) enqueue(req *apis.Request) {
@@ -57,6 +58,7 @@ func (c *queuecontroller) deleteQueue(obj interface{}) {
}
}
+ metrics.DeleteQueueMetrics(queue.Name)
c.pgMutex.Lock()
defer c.pgMutex.Unlock()
delete(c.podGroups, queue.Name)
diff --git a/pkg/controllers/queue/queue_controller_test.go b/pkg/controllers/queue/queue_controller_test.go
index b10d8168a7..79db781798 100644
--- a/pkg/controllers/queue/queue_controller_test.go
+++ b/pkg/controllers/queue/queue_controller_test.go
@@ -19,10 +19,9 @@ package queue
import (
"context"
"fmt"
- "reflect"
"testing"
- v1 "k8s.io/api/core/v1"
+ "github.com/stretchr/testify/assert"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeclient "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/cache"
@@ -31,6 +30,7 @@ import (
vcclient "volcano.sh/apis/pkg/client/clientset/versioned/fake"
informerfactory "volcano.sh/apis/pkg/client/informers/externalversions"
"volcano.sh/volcano/pkg/controllers/framework"
+ "volcano.sh/volcano/pkg/controllers/queue/state"
)
func newFakeController() *queuecontroller {
@@ -241,141 +241,63 @@ func TestUpdatePodGroup(t *testing.T) {
}
func TestSyncQueue(t *testing.T) {
- namespace := "c1"
-
testCases := []struct {
- Name string
- pgsInCache []*schedulingv1beta1.PodGroup
- pgsInInformer []*schedulingv1beta1.PodGroup
- queue *schedulingv1beta1.Queue
- ExpectStatus schedulingv1beta1.QueueStatus
+ Name string
+ queue *schedulingv1beta1.Queue
+ updateStatusFnFactory func(queue *schedulingv1beta1.Queue) state.UpdateQueueStatusFn
+ ExpectState schedulingv1beta1.QueueState
}{
{
- Name: "syncQueue",
- pgsInCache: []*schedulingv1beta1.PodGroup{
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "pg1",
- Namespace: namespace,
- },
- Spec: schedulingv1beta1.PodGroupSpec{
- Queue: "c1",
- },
- Status: schedulingv1beta1.PodGroupStatus{
- Phase: schedulingv1beta1.PodGroupPending,
- },
- },
- },
- pgsInInformer: []*schedulingv1beta1.PodGroup{
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "pg1",
- Namespace: namespace,
- },
- Spec: schedulingv1beta1.PodGroupSpec{
- Queue: "c1",
- },
- Status: schedulingv1beta1.PodGroupStatus{
- Phase: schedulingv1beta1.PodGroupPending,
- },
- },
- },
+ Name: "From empty state to open",
queue: &schedulingv1beta1.Queue{
ObjectMeta: metav1.ObjectMeta{
- Name: "c1",
+ Name: "root",
},
- Spec: schedulingv1beta1.QueueSpec{
- Weight: 1,
+ Status: schedulingv1beta1.QueueStatus{
+ State: "",
},
},
- ExpectStatus: schedulingv1beta1.QueueStatus{
- Pending: 1,
- Reservation: schedulingv1beta1.Reservation{},
- Allocated: v1.ResourceList{},
+ ExpectState: schedulingv1beta1.QueueStateOpen,
+ updateStatusFnFactory: func(queue *schedulingv1beta1.Queue) state.UpdateQueueStatusFn {
+ return func(status *schedulingv1beta1.QueueStatus, podGroupList []string) {
+ if len(queue.Status.State) == 0 {
+ status.State = schedulingv1beta1.QueueStateOpen
+ }
+ }
},
},
{
- Name: "syncQueueHandlingNotFoundPg",
- pgsInCache: []*schedulingv1beta1.PodGroup{
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "pg1",
- Namespace: namespace,
- },
- Spec: schedulingv1beta1.PodGroupSpec{
- Queue: "c2",
- },
- Status: schedulingv1beta1.PodGroupStatus{
- Phase: schedulingv1beta1.PodGroupPending,
- },
- },
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "pg2",
- Namespace: namespace,
- },
- Spec: schedulingv1beta1.PodGroupSpec{
- Queue: "c2",
- },
- Status: schedulingv1beta1.PodGroupStatus{
- Phase: schedulingv1beta1.PodGroupPending,
- },
- },
- },
- pgsInInformer: []*schedulingv1beta1.PodGroup{
- {
- ObjectMeta: metav1.ObjectMeta{
- Name: "pg2",
- Namespace: namespace,
- },
- Spec: schedulingv1beta1.PodGroupSpec{
- Queue: "c2",
- },
- Status: schedulingv1beta1.PodGroupStatus{
- Phase: schedulingv1beta1.PodGroupPending,
- },
- },
- },
+ Name: "From open to close",
queue: &schedulingv1beta1.Queue{
ObjectMeta: metav1.ObjectMeta{
- Name: "c2",
+ Name: "root",
},
- Spec: schedulingv1beta1.QueueSpec{
- Weight: 1,
+ Status: schedulingv1beta1.QueueStatus{
+ State: schedulingv1beta1.QueueStateOpen,
},
},
- ExpectStatus: schedulingv1beta1.QueueStatus{
- Pending: 1,
- Reservation: schedulingv1beta1.Reservation{},
- Allocated: v1.ResourceList{},
+ ExpectState: schedulingv1beta1.QueueStateClosed,
+ updateStatusFnFactory: func(queue *schedulingv1beta1.Queue) state.UpdateQueueStatusFn {
+ return func(status *schedulingv1beta1.QueueStatus, podGroupList []string) {
+ status.State = schedulingv1beta1.QueueStateClosed
+ }
},
},
}
- for i, testcase := range testCases {
+ for _, testcase := range testCases {
c := newFakeController()
- for j := range testcase.pgsInCache {
- key, _ := cache.MetaNamespaceKeyFunc(testcase.pgsInCache[j])
- if _, ok := c.podGroups[testcase.pgsInCache[j].Spec.Queue]; !ok {
- c.podGroups[testcase.pgsInCache[j].Spec.Queue] = make(map[string]struct{})
- }
- c.podGroups[testcase.pgsInCache[j].Spec.Queue][key] = struct{}{}
- }
-
- for j := range testcase.pgsInInformer {
- c.pgInformer.Informer().GetIndexer().Add(testcase.pgsInInformer[j])
- }
+ _, err := c.vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), testcase.queue, metav1.CreateOptions{})
+ assert.NoError(t, err)
- c.queueInformer.Informer().GetIndexer().Add(testcase.queue)
- c.vcClient.SchedulingV1beta1().Queues().Create(context.TODO(), testcase.queue, metav1.CreateOptions{})
+ updateStatusFn := testcase.updateStatusFnFactory(testcase.queue)
+ err = c.syncQueue(testcase.queue, updateStatusFn)
+ assert.NoError(t, err)
- err := c.syncQueue(testcase.queue, nil)
-
- item, _ := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), testcase.queue.Name, metav1.GetOptions{})
- if err != nil && !reflect.DeepEqual(testcase.ExpectStatus, item.Status) {
- t.Errorf("case %d (%s): expected: %v, got %v ", i, testcase.Name, testcase.ExpectStatus, item.Status)
- }
+ item, err := c.vcClient.SchedulingV1beta1().Queues().Get(context.TODO(), testcase.queue.Name, metav1.GetOptions{})
+ assert.NoError(t, err)
+ assert.Equal(t, testcase.ExpectState, item.Status.State)
}
}
diff --git a/pkg/controllers/queue/queue_controller_util.go b/pkg/controllers/queue/queue_controller_util.go
index d54d56a475..d3cf3c4a6c 100644
--- a/pkg/controllers/queue/queue_controller_util.go
+++ b/pkg/controllers/queue/queue_controller_util.go
@@ -22,6 +22,10 @@ import (
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
+const (
+ controllerName = "queue-controller"
+)
+
type patchOperation struct {
Op string `json:"op"`
Path string `json:"path"`
diff --git a/pkg/scheduler/metrics/queue.go b/pkg/scheduler/metrics/queue.go
index ce2504d0f5..8d2d4675ff 100644
--- a/pkg/scheduler/metrics/queue.go
+++ b/pkg/scheduler/metrics/queue.go
@@ -93,38 +93,6 @@ var (
Help: "If one queue is overused",
}, []string{"queue_name"},
)
-
- queuePodGroupInqueue = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: VolcanoNamespace,
- Name: "queue_pod_group_inqueue_count",
- Help: "The number of Inqueue PodGroup in this queue",
- }, []string{"queue_name"},
- )
-
- queuePodGroupPending = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: VolcanoNamespace,
- Name: "queue_pod_group_pending_count",
- Help: "The number of Pending PodGroup in this queue",
- }, []string{"queue_name"},
- )
-
- queuePodGroupRunning = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: VolcanoNamespace,
- Name: "queue_pod_group_running_count",
- Help: "The number of Running PodGroup in this queue",
- }, []string{"queue_name"},
- )
-
- queuePodGroupUnknown = promauto.NewGaugeVec(
- prometheus.GaugeOpts{
- Subsystem: VolcanoNamespace,
- Name: "queue_pod_group_unknown_count",
- Help: "The number of Unknown PodGroup in this queue",
- }, []string{"queue_name"},
- )
)
// UpdateQueueAllocated records allocated resources for one queue
@@ -166,26 +134,6 @@ func UpdateQueueOverused(queueName string, overused bool) {
queueOverused.WithLabelValues(queueName).Set(value)
}
-// UpdateQueuePodGroupInqueueCount records the number of Inqueue PodGroup in this queue
-func UpdateQueuePodGroupInqueueCount(queueName string, count int32) {
- queuePodGroupInqueue.WithLabelValues(queueName).Set(float64(count))
-}
-
-// UpdateQueuePodGroupPendingCount records the number of Pending PodGroup in this queue
-func UpdateQueuePodGroupPendingCount(queueName string, count int32) {
- queuePodGroupPending.WithLabelValues(queueName).Set(float64(count))
-}
-
-// UpdateQueuePodGroupRunningCount records the number of Running PodGroup in this queue
-func UpdateQueuePodGroupRunningCount(queueName string, count int32) {
- queuePodGroupRunning.WithLabelValues(queueName).Set(float64(count))
-}
-
-// UpdateQueuePodGroupUnknownCount records the number of Unknown PodGroup in this queue
-func UpdateQueuePodGroupUnknownCount(queueName string, count int32) {
- queuePodGroupUnknown.WithLabelValues(queueName).Set(float64(count))
-}
-
// DeleteQueueMetrics delete all metrics related to the queue
func DeleteQueueMetrics(queueName string) {
queueAllocatedMilliCPU.DeleteLabelValues(queueName)
@@ -197,8 +145,4 @@ func DeleteQueueMetrics(queueName string) {
queueShare.DeleteLabelValues(queueName)
queueWeight.DeleteLabelValues(queueName)
queueOverused.DeleteLabelValues(queueName)
- queuePodGroupInqueue.DeleteLabelValues(queueName)
- queuePodGroupPending.DeleteLabelValues(queueName)
- queuePodGroupRunning.DeleteLabelValues(queueName)
- queuePodGroupUnknown.DeleteLabelValues(queueName)
}
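
With these gauges removed from the scheduler, the per-queue podgroup counts are emitted only by the queue controller (the new `pkg/controllers/metrics/queue.go` above). Because both sides use `promauto`, the collectors are registered in the process-default registry, so whichever binary links that package exposes them on its existing metrics endpoint. A minimal sketch of serving the default registry; the listen address is illustrative and not Volcano's actual wiring.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// promauto-created gauges land in prometheus.DefaultRegisterer, so the
	// stock handler is enough to expose them for scraping.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil)) // illustrative address
}
```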
diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go
index 32cc3d6e3b..9c661588c0 100644
--- a/pkg/scheduler/plugins/capacity/capacity.go
+++ b/pkg/scheduler/plugins/capacity/capacity.go
@@ -375,10 +375,6 @@ func (cp *capacityPlugin) buildQueueAttrs(ssn *framework.Session) {
metrics.UpdateQueueDeserved(attr.name, attr.deserved.MilliCPU, attr.deserved.Memory)
metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory)
- metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue)
- metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending)
- metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running)
- metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown)
continue
}
deservedCPU, deservedMem := 0.0, 0.0
@@ -389,10 +385,6 @@ func (cp *capacityPlugin) buildQueueAttrs(ssn *framework.Session) {
metrics.UpdateQueueDeserved(queueInfo.Name, deservedCPU, deservedMem)
metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0)
metrics.UpdateQueueRequest(queueInfo.Name, 0, 0)
- metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0)
- metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0)
- metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0)
- metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0)
}
ssn.AddQueueOrderFn(cp.Name(), func(l, r interface{}) int {
@@ -509,15 +501,10 @@ func (cp *capacityPlugin) buildHierarchicalQueueAttrs(ssn *framework.Session) bo
// Record metrics
for queueID := range ssn.Queues {
- queue := ssn.Queues[queueID]
attr := cp.queueOpts[queueID]
metrics.UpdateQueueDeserved(attr.name, attr.deserved.MilliCPU, attr.deserved.Memory)
metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory)
- metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue)
- metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending)
- metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running)
- metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown)
}
ssn.AddQueueOrderFn(cp.Name(), func(l, r interface{}) int {
diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go
index b0081ee478..948487c331 100644
--- a/pkg/scheduler/plugins/proportion/proportion.go
+++ b/pkg/scheduler/plugins/proportion/proportion.go
@@ -167,19 +167,10 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) {
metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory)
metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory)
metrics.UpdateQueueWeight(attr.name, attr.weight)
- queue := ssn.Queues[attr.queueID]
- metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue)
- metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending)
- metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running)
- metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown)
continue
}
metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0)
metrics.UpdateQueueRequest(queueInfo.Name, 0, 0)
- metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0)
- metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0)
- metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0)
- metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0)
}
remaining := pp.totalResource.Clone()
diff --git a/test/e2e/jobseq/queue_job_status.go b/test/e2e/jobseq/queue_job_status.go
index 53bdac30c9..534b7608e1 100644
--- a/test/e2e/jobseq/queue_job_status.go
+++ b/test/e2e/jobseq/queue_job_status.go
@@ -26,7 +26,6 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/tools/cache"
watchtools "k8s.io/client-go/tools/watch"
@@ -77,17 +76,15 @@ var _ = Describe("Queue Job Status Transition", func() {
By("Verify queue have pod groups inqueue")
err := e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred(), "Get queue %s failed", q1)
- return queue.Status.Inqueue > 0, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Inqueue > 0, nil
})
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue inqueue")
By("Verify queue have pod groups running")
err = e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred(), "Get queue %s failed", q1)
- return queue.Status.Running > 0, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Running > 0, nil
})
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
})
@@ -134,9 +131,8 @@ var _ = Describe("Queue Job Status Transition", func() {
By("Verify queue have pod groups running")
err := e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred(), "Get queue %s failed", q1)
- return queue.Status.Running > 0, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Running > 0, nil
})
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
@@ -150,9 +146,8 @@ var _ = Describe("Queue Job Status Transition", func() {
By("Verify queue have pod groups Pending")
err = e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred(), "Get queue %s failed", q1)
- return queue.Status.Pending > 0, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Pending > 0, nil
})
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue Pending")
})
@@ -195,9 +190,8 @@ var _ = Describe("Queue Job Status Transition", func() {
By("Verify queue have pod groups running")
err := e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred(), "Get queue %s failed", q1)
- return queue.Status.Running > 0, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Running > 0, nil
})
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
@@ -220,11 +214,9 @@ var _ = Describe("Queue Job Status Transition", func() {
}
By("Verify queue have pod groups unknown")
- fieldSelector := fields.OneTermEqualSelector("metadata.name", q1).String()
w := &cache.ListWatch{
WatchFunc: func(options metav1.ListOptions) (i watch.Interface, e error) {
- options.FieldSelector = fieldSelector
- return ctx.Vcclient.SchedulingV1beta1().Queues().Watch(context.TODO(), options)
+ return ctx.Vcclient.SchedulingV1beta1().PodGroups(podNamespace).Watch(context.TODO(), options)
},
}
wctx, cancel := watchtools.ContextWithOptionalTimeout(context.Background(), 5*time.Minute)
@@ -232,8 +224,8 @@ var _ = Describe("Queue Job Status Transition", func() {
_, err = watchtools.Until(wctx, clusterPods.ResourceVersion, w, func(event watch.Event) (bool, error) {
switch t := event.Object.(type) {
- case *v1beta1.Queue:
- if t.Status.Unknown > 0 {
+ case *v1beta1.PodGroup:
+ if t.Status.Phase == v1beta1.PodGroupUnknown {
return true, nil
}
}
diff --git a/test/e2e/schedulingaction/reclaim.go b/test/e2e/schedulingaction/reclaim.go
index fe1c478529..8845b9789c 100644
--- a/test/e2e/schedulingaction/reclaim.go
+++ b/test/e2e/schedulingaction/reclaim.go
@@ -70,14 +70,25 @@ var _ = Describe("Reclaim E2E Test", func() {
queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), queue, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred(), "Get queue %s failed", queue)
switch status {
- case "Running":
- return queue.Status.Running == num, nil
case "Open":
return queue.Status.State == schedulingv1beta1.QueueStateOpen, nil
+ default:
+ return false, nil
+ }
+ })
+ return err
+ }
+
+ CheckPodGroupStatistics := func(ctx *e2eutil.TestContext, status string, num int, queue string) error {
+ err := e2eutil.WaitQueueStatus(func() (bool, error) {
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, queue)
+ switch status {
+ case "Running":
+ return pgStats.Running == num, nil
case "Pending":
- return queue.Status.Pending == num, nil
+ return pgStats.Pending == num, nil
case "Inqueue":
- return queue.Status.Inqueue == num, nil
+ return pgStats.Inqueue == num, nil
default:
return false, nil
}
@@ -117,13 +128,13 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
})
@@ -176,10 +187,10 @@ var _ = Describe("Reclaim E2E Test", func() {
Expect(err).NotTo(HaveOccurred(), "Get %s pod failed", j3)
By("Make sure q1 q2 with job running in it.")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
for _, pod := range job3pods.Items {
@@ -188,7 +199,7 @@ var _ = Describe("Reclaim E2E Test", func() {
}
By("Q3 pending when we delete it.")
- err = WaitQueueStatus(ctx, "Pending", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Pending", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue pending")
})
@@ -223,10 +234,10 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
})
@@ -265,10 +276,10 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
})
@@ -306,16 +317,16 @@ var _ = Describe("Reclaim E2E Test", func() {
time.Sleep(10 * time.Second)
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Inqueue", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Inqueue", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue Inqueue")
})
@@ -352,14 +363,14 @@ var _ = Describe("Reclaim E2E Test", func() {
time.Sleep(10 * time.Second)
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
// TODO: it is a bug : the job status is pending but podgroup status is running
- err = WaitQueueStatus(ctx, "Running", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue Running")
})
@@ -412,13 +423,13 @@ var _ = Describe("Reclaim E2E Test", func() {
time.Sleep(10 * time.Second)
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Inqueue", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Inqueue", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue Inqueue")
})
@@ -457,13 +468,13 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
- err = WaitQueueStatus(ctx, "Running", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue running")
})
@@ -514,10 +525,10 @@ var _ = Describe("Reclaim E2E Test", func() {
err = e2eutil.WaitJobReady(ctx, job2)
Expect(err).NotTo(HaveOccurred())
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue1 running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue2 running")
By("Create coming jobs")
@@ -530,10 +541,10 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue3 running")
- err = WaitQueueStatus(ctx, "Running", 1, q4)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q4)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue4 running")
})
@@ -619,10 +630,10 @@ var _ = Describe("Reclaim E2E Test", func() {
err = e2eutil.WaitJobReady(ctx, job2)
Expect(err).NotTo(HaveOccurred())
- err = WaitQueueStatus(ctx, "Running", 1, q1)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q1)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue1 running")
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue2 running")
By("Create coming jobs")
@@ -641,10 +652,10 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q3)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q3)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue3 running")
- err = WaitQueueStatus(ctx, "Running", 3, q4)
+ err = CheckPodGroupStatistics(ctx, "Running", 3, q4)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue4 running")
})
@@ -742,10 +753,10 @@ var _ = Describe("Reclaim E2E Test", func() {
err = e2eutil.WaitJobReady(ctx, job2)
Expect(err).NotTo(HaveOccurred())
- err = WaitQueueStatus(ctx, "Running", 1, q2)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q2)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue11 running")
- err = WaitQueueStatus(ctx, "Running", 1, q11)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q11)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue2 running")
By("Create coming jobs")
@@ -758,10 +769,10 @@ var _ = Describe("Reclaim E2E Test", func() {
By("Make sure all job running")
- err = WaitQueueStatus(ctx, "Running", 1, q12)
+ err = CheckPodGroupStatistics(ctx, "Running", 1, q12)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue12 running")
- err = WaitQueueStatus(ctx, "Running", 2, q11)
+ err = CheckPodGroupStatistics(ctx, "Running", 2, q11)
Expect(err).NotTo(HaveOccurred(), "Error waiting for queue11 running")
})
@@ -800,9 +811,8 @@ var _ = Describe("Reclaim E2E Test", func() {
Expect(err).NotTo(HaveOccurred())
err = e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred())
- return queue.Status.Running == 1, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Running == 1, nil
})
Expect(err).NotTo(HaveOccurred())
@@ -842,9 +852,8 @@ var _ = Describe("Reclaim E2E Test", func() {
err = e2eutil.WaitJobStatePending(ctx, job3)
Expect(err).NotTo(HaveOccurred())
err = e2eutil.WaitQueueStatus(func() (bool, error) {
- queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q1, metav1.GetOptions{})
- Expect(err).NotTo(HaveOccurred())
- return queue.Status.Pending == 1, nil
+ pgStats := e2eutil.GetPodGroupStatistics(ctx, ctx.Namespace, q1)
+ return pgStats.Pending == 1, nil
})
Expect(err).NotTo(HaveOccurred())
})
diff --git a/test/e2e/util/podgroup.go b/test/e2e/util/podgroup.go
index 1055307b55..1452018aab 100644
--- a/test/e2e/util/podgroup.go
+++ b/test/e2e/util/podgroup.go
@@ -28,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
+ "volcano.sh/volcano/pkg/cli/podgroup"
)
// CreatePodGroup creates a PodGroup with the specified name in the namespace
@@ -90,3 +91,15 @@ func PodGroupIsReady(ctx *TestContext, namespace string) (bool, error) {
return false, fmt.Errorf("pod group phase is Pending")
}
+
+func GetPodGroupStatistics(ctx *TestContext, namespace, queue string) *podgroup.PodGroupStatistics {
+ pgList, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(namespace).List(context.TODO(), metav1.ListOptions{})
+ Expect(err).NotTo(HaveOccurred(), "List podgroups failed")
+ pgStats := &podgroup.PodGroupStatistics{}
+ for _, pg := range pgList.Items {
+ if pg.Spec.Queue == queue {
+ pgStats.StatPodGroupCountsForQueue(&pg)
+ }
+ }
+ return pgStats
+}
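
A hedged sketch of the same list-and-filter aggregation that `GetPodGroupStatistics` performs, written against a plain versioned clientset and apimachinery's `wait` helpers instead of the e2e `TestContext`; the poll interval, timeout, and caller-supplied queue name are placeholders.

```go
package example

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"

	"volcano.sh/apis/pkg/client/clientset/versioned"
	"volcano.sh/volcano/pkg/cli/podgroup"
)

// WaitForRunningPodGroup blocks until the given queue has at least one Running
// podgroup, mirroring how the e2e suites above poll on client-side statistics.
func WaitForRunningPodGroup(ctx context.Context, client versioned.Interface, namespace, queue string) error {
	return wait.PollUntilContextTimeout(ctx, 2*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
		pgList, err := client.SchedulingV1beta1().PodGroups(namespace).List(ctx, metav1.ListOptions{})
		if err != nil {
			return false, fmt.Errorf("list podgroups: %w", err)
		}
		stats := &podgroup.PodGroupStatistics{}
		for i := range pgList.Items {
			if pgList.Items[i].Spec.Queue == queue {
				stats.StatPodGroupCountsForQueue(&pgList.Items[i])
			}
		}
		return stats.Running > 0, nil
	})
}
```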