diff --git a/solutions/kueue-admission-check/README.md b/solutions/kueue-admission-check/README.md index 13f976023..543e941a8 100644 --- a/solutions/kueue-admission-check/README.md +++ b/solutions/kueue-admission-check/README.md @@ -154,12 +154,12 @@ metadata: namespace: kueue-system spec: clusterSets: - - spoke + - spoke tolerations: - - key: cluster.open-cluster-management.io/unreachable - operator: Exists - - key: cluster.open-cluster-management.io/unavailable - operator: Exists + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists predicates: - requiredClusterSelector: labelSelector: @@ -209,16 +209,16 @@ As an admin, I want to leverage OCM's `AddonPlacementScore` for dynamic workload apiVersion: cluster.open-cluster-management.io/v1beta1 kind: Placement metadata: - name: placement-sample2 + name: placement-demo2 namespace: kueue-system spec: clusterSets: - - spoke + - spoke tolerations: - - key: cluster.open-cluster-management.io/unreachable - operator: Exists - - key: cluster.open-cluster-management.io/unavailable - operator: Exists + - key: cluster.open-cluster-management.io/unreachable + operator: Exists + - key: cluster.open-cluster-management.io/unavailable + operator: Exists predicates: - requiredClusterSelector: labelSelector: @@ -232,13 +232,35 @@ spec: type: AddOn addOn: resourceName: resource-usage-score - scoreName: gpuAvailable + scoreName: gpuClusterAvailable weight: 1 ``` -- You can manually edit the GPU resources on the managed clusters for testing. +- You can manually edit the GPU resources on the managed clusters for testing, for example on `kind-cluster2`, set 3 fake GPU resources on the `control-plane-node`. ```bash -kubectl edit-status node cluster2-control-plane --context kind-cluster2 -kubectl edit-status node cluster3-control-plane --context kind-cluster3 +kubectl edit-status node cluster2-control-plane --context kind-cluster2 # Same operation with other clusters/nodes. +``` +- Edit the `status` of the node `cluster2-control-plane`: +```yaml + allocatable: + cpu: "8" + ephemeral-storage: 61202244Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + hugepages-32Mi: "0" + hugepages-64Ki: "0" + memory: 8027168Ki + nvidia.com/gpu: "3" # Add 3 fake GPUs in allocatable + pods: "110" + capacity: + cpu: "8" + ephemeral-storage: 61202244Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + hugepages-32Mi: "0" + hugepages-64Ki: "0" + memory: 8027168Ki + nvidia.com/gpu: "3" # Add 3 fake GPUs in capacity + pods: "110" ``` - Apply the changes in the `Placement` to update MultiKueue dynamically. ```bash @@ -268,16 +290,20 @@ The OCM Admission Check Controller will integrate OCM `Placement` results into M Example OCM Admission Check Controller design: ```yaml +# OCM implements an admissioncheck controller to automate the MultiKueue setup process. +# MultiKueueConfigs and MultiKueueClusters are generated dynamically based on OCM placement decisions. apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: - name: ocm-multikueue + name: placement-demo2 spec: controllerName: open-cluster-management.io/placement parameters: apiGroup: cluster.open-cluster-management.io - kind: Placement # Placement is under kueue-system namespace. - name: placement-demo2-1 + kind: Placement + name: placement-demo2 +# Leverages OCM's placement mechanism to select clusters based on specific criteria. +# For example `Placement-demo2-1` selects clusters with the `nvidia-tesla-t4` accelerator label. ``` ### Changes in the Configuration Process with OCM Admission Check Controller @@ -324,35 +350,19 @@ spec: With the OCM Admission Check Controller, the need for manual configuration of `MultiKueueConfig` and `MultiKueueCluster` is eliminated. Instead, the administrator only needs to configure two additional admission checks in the ClusterQueue resource: -- `ocm-multikueue`: Automates the process of setting up `MultiKueueConfig` and `MultiKueueCluster`. -- `placement-sample1`: Leverages OCM's placement mechanism to select clusters based on specific criteria. For example `Placement-sample1` selects clusters with the `nvidia-tesla-t4` accelerator label. - -```yaml -apiVersion: kueue.x-k8s.io/v1beta1 -kind: AdmissionCheck -metadata: - name: ocm-multikueue -spec: - controllerName: kueue.x-k8s.io/multikueue - parameters: - apiGroup: kueue.x-k8s.io - kind: MultiKueueConfig - name: placement -``` - -Admin configures the above two admission check controllers in the `ClusterQueue` +- Admin configures two admission check controllers in the `ClusterQueue`, for example in `multikueue-setup-demo2`: ```yaml apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: "cluster-queue" + name: "cluster-queue-demo2" spec: namespaceSelector: {} # match all. resourceGroups: - coveredResources: ["cpu", "memory","nvidia.com/gpu"] flavors: - - name: "default-flavor" + - name: "default-flavor-demo2" resources: - name: "cpu" nominalQuota: 9 @@ -361,9 +371,9 @@ spec: - name: "nvidia.com/gpu" nominalQuota: 3 admissionChecks: - - multikueue - - ocm-multikueue - ``` + - multikueue-demo2 + - placement-demo2 +``` #### OCM Admission Check Controller Workflow diff --git a/solutions/kueue-admission-check/env/authtokenrequest-c2.yaml b/solutions/kueue-admission-check/env/cp-c1.yaml similarity index 70% rename from solutions/kueue-admission-check/env/authtokenrequest-c2.yaml rename to solutions/kueue-admission-check/env/cp-c1.yaml index aa8addb57..6f5eccf48 100644 --- a/solutions/kueue-admission-check/env/authtokenrequest-c2.yaml +++ b/solutions/kueue-admission-check/env/cp-c1.yaml @@ -1,17 +1,10 @@ -apiVersion: multicluster.x-k8s.io/v1alpha1 -kind: AuthTokenRequest +apiVersion: rbac.open-cluster-management.io/v1alpha1 +kind: ClusterPermission metadata: - name: kueue-cluster2 - namespace: kueue-system + name: kueue-admin-cluster1 + namespace: cluster1 spec: - targetClusterProfile: - apiGroup: multicluster.x-k8s.io - kind: ClusterProfile - name: cluster2 - namespace: open-cluster-management - serviceAccountName: kueue-admin-cluster2 - clusterRoles: - - name: kueue-admin-cluster2 + clusterRole: rules: - apiGroups: - batch @@ -63,4 +56,8 @@ spec: - get - patch - update - + clusterRoleBinding: + subject: + kind: ServiceAccount + name: kueue-admin-cluster1 + namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/authtokenrequest-c3.yaml b/solutions/kueue-admission-check/env/cp-c2.yaml similarity index 70% rename from solutions/kueue-admission-check/env/authtokenrequest-c3.yaml rename to solutions/kueue-admission-check/env/cp-c2.yaml index b8fd26de6..6199444b5 100644 --- a/solutions/kueue-admission-check/env/authtokenrequest-c3.yaml +++ b/solutions/kueue-admission-check/env/cp-c2.yaml @@ -1,17 +1,10 @@ -apiVersion: multicluster.x-k8s.io/v1alpha1 -kind: AuthTokenRequest +apiVersion: rbac.open-cluster-management.io/v1alpha1 +kind: ClusterPermission metadata: - name: kueue-cluster3 - namespace: kueue-system + name: kueue-admin-cluster2 + namespace: cluster2 spec: - targetClusterProfile: - apiGroup: multicluster.x-k8s.io - kind: ClusterProfile - name: cluster3 - namespace: open-cluster-management - serviceAccountName: kueue-admin-cluster3 - clusterRoles: - - name: kueue-admin-cluster3 + clusterRole: rules: - apiGroups: - batch @@ -63,4 +56,8 @@ spec: - get - patch - update - + clusterRoleBinding: + subject: + kind: ServiceAccount + name: kueue-admin-cluster2 + namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/authtokenrequest-c1.yaml b/solutions/kueue-admission-check/env/cp-c3.yaml similarity index 69% rename from solutions/kueue-admission-check/env/authtokenrequest-c1.yaml rename to solutions/kueue-admission-check/env/cp-c3.yaml index 748e40404..842d9480f 100644 --- a/solutions/kueue-admission-check/env/authtokenrequest-c1.yaml +++ b/solutions/kueue-admission-check/env/cp-c3.yaml @@ -1,17 +1,10 @@ -apiVersion: multicluster.x-k8s.io/v1alpha1 -kind: AuthTokenRequest +apiVersion: rbac.open-cluster-management.io/v1alpha1 +kind: ClusterPermission metadata: - name: kueue-cred-cluster1 - namespace: kueue-system + name: kueue-admin-cluster3 + namespace: cluster3 spec: - targetClusterProfile: - apiGroup: multicluster.x-k8s.io - kind: ClusterProfile - name: cluster1 - namespace: open-cluster-management - serviceAccountName: kueue-admin-cluster1 - clusterRoles: - - name: kueue-admin-cluster1 + clusterRole: rules: - apiGroups: - batch @@ -63,4 +56,8 @@ spec: - get - patch - update - + clusterRoleBinding: + subject: + kind: ServiceAccount + name: kueue-admin-cluster3 + namespace: open-cluster-management-agent-addon diff --git a/solutions/kueue-admission-check/env/mg-sa-cma-0.6.0.yaml b/solutions/kueue-admission-check/env/mg-sa-cma-0.6.0.yaml deleted file mode 100644 index 8c348e0e1..000000000 --- a/solutions/kueue-admission-check/env/mg-sa-cma-0.6.0.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: addon.open-cluster-management.io/v1alpha1 -kind: ClusterManagementAddOn -metadata: - annotations: - addon.open-cluster-management.io/lifecycle: addon-manager - meta.helm.sh/release-name: managed-serviceaccount - meta.helm.sh/release-namespace: open-cluster-management-addon - labels: - app.kubernetes.io/managed-by: Helm - name: managed-serviceaccount -spec: - addOnMeta: - description: managed-serviceaccount - displayName: managed-serviceaccount - installStrategy: - placements: - - name: placement-spoke - namespace: default - rolloutStrategy: - type: All - type: Placements - supportedConfigs: - - group: addon.open-cluster-management.io - resource: addondeploymentconfigs - - defaultConfig: - name: managed-serviceaccount-0.6.0 - group: addon.open-cluster-management.io - resource: addontemplates diff --git a/solutions/kueue-admission-check/env/msa-c1.yaml b/solutions/kueue-admission-check/env/msa-c1.yaml new file mode 100644 index 000000000..b7466e992 --- /dev/null +++ b/solutions/kueue-admission-check/env/msa-c1.yaml @@ -0,0 +1,7 @@ +apiVersion: authentication.open-cluster-management.io/v1beta1 +kind: ManagedServiceAccount +metadata: + name: kueue-admin-cluster1 + namespace: cluster1 +spec: + rotation: {} diff --git a/solutions/kueue-admission-check/env/msa-c2.yaml b/solutions/kueue-admission-check/env/msa-c2.yaml new file mode 100644 index 000000000..91971cdfd --- /dev/null +++ b/solutions/kueue-admission-check/env/msa-c2.yaml @@ -0,0 +1,7 @@ +apiVersion: authentication.open-cluster-management.io/v1beta1 +kind: ManagedServiceAccount +metadata: + name: kueue-admin-cluster2 + namespace: cluster2 +spec: + rotation: {} diff --git a/solutions/kueue-admission-check/env/msa-c3.yaml b/solutions/kueue-admission-check/env/msa-c3.yaml new file mode 100644 index 000000000..d9f8046e6 --- /dev/null +++ b/solutions/kueue-admission-check/env/msa-c3.yaml @@ -0,0 +1,7 @@ +apiVersion: authentication.open-cluster-management.io/v1beta1 +kind: ManagedServiceAccount +metadata: + name: kueue-admin-cluster3 + namespace: cluster3 +spec: + rotation: {} diff --git a/solutions/kueue-admission-check/env/multicluster.x-k8s.io_authtokenrequests.yaml b/solutions/kueue-admission-check/env/multicluster.x-k8s.io_authtokenrequests.yaml deleted file mode 100644 index 27eced9fb..000000000 --- a/solutions/kueue-admission-check/env/multicluster.x-k8s.io_authtokenrequests.yaml +++ /dev/null @@ -1,356 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.14.0 - name: authtokenrequests.multicluster.x-k8s.io -spec: - group: multicluster.x-k8s.io - names: - kind: AuthTokenRequest - listKind: AuthTokenRequestList - plural: authtokenrequests - singular: authtokenrequest - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: AuthTokenRequest represents a request for access token in a multi-cluster - environment. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - AuthTokenRequestSpec specifies the spec of an AuthTokenRequest object. - - - For simiplicity reasons, the current design assumes that: - - the referenced service account, roles, and cluster roles are guaranteed to be non-existent - in the target cluster (that is, for now we disregard the scenario where some service accounts, - roles, cluster roles have already existed in the cluster and the application is simply requesting - a token to be created or some bindings to be made). - - no rotation is necessary. - properties: - clusterRoles: - description: ClusterRoleRules is a list of cluster roles that is associated - with the service account. - items: - description: ClusterRole describes a set of permissions that should - be set under the cluster scope. - properties: - name: - description: Name is the name of the cluster role that should - be created. - type: string - rules: - description: Rules is a list of policies for the resources in - the cluster scope. - items: - description: |- - PolicyRule holds information that describes a policy rule, but does not contain information - about who the rule applies to or which namespace the rule applies to. - properties: - apiGroups: - description: |- - APIGroups is the name of the APIGroup that contains the resources. If multiple API groups are specified, any action requested against one of - the enumerated resources in any API group will be allowed. "" represents the core API group and "*" represents all API groups. - items: - type: string - type: array - x-kubernetes-list-type: atomic - nonResourceURLs: - description: |- - NonResourceURLs is a set of partial urls that a user should have access to. *s are allowed, but only as the full, final step in the path - Since non-resource URLs are not namespaced, this field is only applicable for ClusterRoles referenced from a ClusterRoleBinding. - Rules can either apply to API resources (such as "pods" or "secrets") or non-resource URL paths (such as "/api"), but not both. - items: - type: string - type: array - x-kubernetes-list-type: atomic - resourceNames: - description: ResourceNames is an optional white list of - names that the rule applies to. An empty set means - that everything is allowed. - items: - type: string - type: array - x-kubernetes-list-type: atomic - resources: - description: Resources is a list of resources this rule - applies to. '*' represents all resources. - items: - type: string - type: array - x-kubernetes-list-type: atomic - verbs: - description: Verbs is a list of Verbs that apply to ALL - the ResourceKinds contained in this rule. '*' represents - all verbs. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - verbs - type: object - type: array - x-kubernetes-list-type: atomic - required: - - name - type: object - maxItems: 20 - type: array - x-kubernetes-list-type: atomic - x-kubernetes-validations: - - message: ClusterRoles is immutable - rule: self == oldSelf - roles: - description: Roles is a list of roles that is associated with the - service account. - items: - description: Role describes a set of permissions that should be - set under a specific namespace. - properties: - name: - description: Name is the name of the role that should be created. - type: string - namespace: - description: |- - Namespace is the namespace where the set of permissions is applied. - The namespace will be created if it does not already exist. - type: string - rules: - description: Rules is a list of policies for the resources in - the specified namespace. - items: - description: |- - PolicyRule holds information that describes a policy rule, but does not contain information - about who the rule applies to or which namespace the rule applies to. - properties: - apiGroups: - description: |- - APIGroups is the name of the APIGroup that contains the resources. If multiple API groups are specified, any action requested against one of - the enumerated resources in any API group will be allowed. "" represents the core API group and "*" represents all API groups. - items: - type: string - type: array - x-kubernetes-list-type: atomic - nonResourceURLs: - description: |- - NonResourceURLs is a set of partial urls that a user should have access to. *s are allowed, but only as the full, final step in the path - Since non-resource URLs are not namespaced, this field is only applicable for ClusterRoles referenced from a ClusterRoleBinding. - Rules can either apply to API resources (such as "pods" or "secrets") or non-resource URL paths (such as "/api"), but not both. - items: - type: string - type: array - x-kubernetes-list-type: atomic - resourceNames: - description: ResourceNames is an optional white list of - names that the rule applies to. An empty set means - that everything is allowed. - items: - type: string - type: array - x-kubernetes-list-type: atomic - resources: - description: Resources is a list of resources this rule - applies to. '*' represents all resources. - items: - type: string - type: array - x-kubernetes-list-type: atomic - verbs: - description: Verbs is a list of Verbs that apply to ALL - the ResourceKinds contained in this rule. '*' represents - all verbs. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - verbs - type: object - type: array - x-kubernetes-list-type: atomic - required: - - name - - namespace - type: object - maxItems: 20 - type: array - x-kubernetes-list-type: atomic - x-kubernetes-validations: - - message: Roles is immutable - rule: self == oldSelf - serviceAccountName: - description: |- - ServiceAccountName is the name of the service account that the - access token should be associated with. - maxLength: 63 - type: string - x-kubernetes-validations: - - message: ServiceAccountName is immutable - rule: self == oldSelf - targetClusterProfile: - description: TargetClusterProfile is the cluster profile that the - access token is requested for. - properties: - apiGroup: - description: APIGroup is the API group of the referred cluster - profile object. - type: string - kind: - description: Kind is the kind of the referred cluster profile - object. - type: string - name: - description: Name is the name of the referred cluster profile - object. - type: string - namespace: - description: Namespace is the namespace of the referred cluster - profile object. - type: string - required: - - apiGroup - - kind - - name - - namespace - type: object - x-kubernetes-map-type: atomic - x-kubernetes-validations: - - message: TargetClusterProfile is immutable - rule: self == oldSelf - required: - - serviceAccountName - - targetClusterProfile - type: object - x-kubernetes-validations: - - message: Roles is required once set - rule: '!has(oldSelf.roles) || has(self.roles)' - - message: ClusterRoles is required once set - rule: '!has(oldSelf.clusterRoles) || has(self.clusterRoles)' - status: - description: AuthTokenRequestStatus specifies the status of an AuthTokenRequest - object. - properties: - conditions: - description: Conditions is an array of conditions for the token request. - items: - description: "Condition contains details for one aspect of the current - state of this API Resource.\n---\nThis struct is intended for - direct use as an array at the field path .status.conditions. For - example,\n\n\n\ttype FooStatus struct{\n\t // Represents the - observations of a foo's current state.\n\t // Known .status.conditions.type - are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // - +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t - \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" - patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t - \ // other fields\n\t}" - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: |- - type of condition in CamelCase or in foo.example.com/CamelCase. - --- - Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be - useful (see .node.status.conditions), the ability to deconflict is important. - The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - tokenResponse: - description: |- - ConfigMapRef points to a specific ConfigMap object. - - - Note that for security reasons, the token response object (i.e., the config map) is - always kept in the same namespace as the token request object. - properties: - apiGroup: - description: APIGroup is the API group of the referred config - map object. - type: string - kind: - description: Kind is the kind of the referred config map object. - type: string - name: - description: Name is the name of the referred config map object. - type: string - required: - - apiGroup - - kind - - name - type: object - x-kubernetes-map-type: atomic - type: object - required: - - spec - type: object - served: true - storage: true - subresources: - status: {} diff --git a/solutions/kueue-admission-check/env/patch-mg-sa-cma.json b/solutions/kueue-admission-check/env/patch-mg-sa-cma.json new file mode 100644 index 000000000..09cfb0367 --- /dev/null +++ b/solutions/kueue-admission-check/env/patch-mg-sa-cma.json @@ -0,0 +1,18 @@ +[ + { + "op": "replace", + "path": "/spec/installStrategy", + "value": { + "placements": [ + { + "name": "placement-spoke", + "namespace": "default", + "rolloutStrategy": { + "type": "All" + } + } + ], + "type": "Placements" + } + } +] diff --git a/solutions/kueue-admission-check/job-demo1.yaml b/solutions/kueue-admission-check/job-demo1.yaml index 598e81bed..e68cda738 100644 --- a/solutions/kueue-admission-check/job-demo1.yaml +++ b/solutions/kueue-admission-check/job-demo1.yaml @@ -17,9 +17,9 @@ spec: args: ["30s"] resources: requests: - cpu: 1 + cpu: "1" memory: "200Mi" limits: - cpu: 1 + cpu: "1" memory: "200Mi" restartPolicy: Never diff --git a/solutions/kueue-admission-check/job-demo2.yaml b/solutions/kueue-admission-check/job-demo2.yaml index b8c16b9c2..7b4aa845a 100644 --- a/solutions/kueue-admission-check/job-demo2.yaml +++ b/solutions/kueue-admission-check/job-demo2.yaml @@ -17,11 +17,11 @@ spec: args: ["600s"] resources: requests: - cpu: 1 + cpu: "1" memory: "200Mi" nvidia.com/gpu: "1" limits: - cpu: 1 + cpu: "1" memory: "200Mi" nvidia.com/gpu: "1" # This job requires one GPU. restartPolicy: Never diff --git a/solutions/kueue-admission-check/multikueue-setup-demo2.yaml b/solutions/kueue-admission-check/multikueue-setup-demo2.yaml index d6c2a19ff..ae4a2e525 100644 --- a/solutions/kueue-admission-check/multikueue-setup-demo2.yaml +++ b/solutions/kueue-admission-check/multikueue-setup-demo2.yaml @@ -1,7 +1,7 @@ apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: - name: "default-flavor" + name: "default-flavor-demo2" --- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue @@ -12,17 +12,17 @@ spec: resourceGroups: - coveredResources: ["cpu", "memory","nvidia.com/gpu"] flavors: - - name: "default-flavor" + - name: "default-flavor-demo2" resources: - name: "cpu" nominalQuota: 9 - name: "memory" nominalQuota: 36Gi - name: "nvidia.com/gpu" - nominalQuota: 6 + nominalQuota: 3 admissionChecks: - - multikueue - - ocm-multikueue + - multikueue-demo2 + - placement-demo2 --- apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue @@ -35,21 +35,23 @@ spec: apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: - name: multikueue + name: multikueue-demo2 spec: controllerName: kueue.x-k8s.io/multikueue parameters: apiGroup: kueue.x-k8s.io - kind: MultiKueueConfig # Automates the process of setting up `MultiKueueConfig` and `MultiKueueCluster`. - name: ocm-multikueue + kind: MultiKueueConfig + name: placement-demo2 --- +# OCM implements an admissioncheck controller to automate the MultiKueue setup process. +# MultiKueueConfigs and MultiKueueClusters are generated dynamically based on OCM placement decisions. apiVersion: kueue.x-k8s.io/v1beta1 kind: AdmissionCheck metadata: - name: ocm-multikueue + name: placement-demo2 spec: controllerName: open-cluster-management.io/placement parameters: apiGroup: cluster.open-cluster-management.io kind: Placement - name: placement-sample1 # An example placement to select clusters labeled with "nvidia-tesla-t4" GPU accelerator. + name: placement-demo2 diff --git a/solutions/kueue-admission-check/setup-env.sh b/solutions/kueue-admission-check/setup-env.sh index 1e7a7d437..42dc092fa 100755 --- a/solutions/kueue-admission-check/setup-env.sh +++ b/solutions/kueue-admission-check/setup-env.sh @@ -14,11 +14,6 @@ c1ctx="kind-${c1}" c2ctx="kind-${c2}" c3ctx="kind-${c3}" -#kind delete cluster --name ${hub} -#kind delete cluster --name ${c1} -#kind delete cluster --name ${c2} -#kind delete cluster --name ${c3} - kind create cluster --name "${hub}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 kind create cluster --name "${c1}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 kind create cluster --name "${c2}" --image kindest/node:v1.29.0@sha256:eaa1450915475849a73a9227b8f201df25e55e268e5d619312131292e324d570 @@ -43,13 +38,13 @@ clusteradm accept --context ${hubctx} --clusters ${c1},${c2},${c3} --wait kubectl get managedclusters --all-namespaces --context ${hubctx} -echo "Install Kueue" +echo "Install Kueue (this can be replaced with OCM Manifestwork in the future)" kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${hubctx} kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c1ctx} kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c2ctx} kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.7.1/manifests.yaml --context ${c3ctx} -echo "Install Jobset for MultiKueue" +echo "Install Jobset for MultiKueue (this can be replaced with OCM Manifestwork in the future)" kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${hubctx} kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c1ctx} kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml --context ${c2ctx} @@ -69,11 +64,11 @@ kubectl patch clustermanager cluster-manager --type=json -p='[{"op": "replace", kubectl patch clustermanager cluster-manager --type=json -p='[{"op": "replace", "path": "/spec/placementImagePullSpec", "value": "quay.io/haoqing/placement:latest"}]' echo "Install CRDs" -kubectl create -f env/multicluster.x-k8s.io_authtokenrequests.yaml kubectl create -f env/multicluster.x-k8s.io_clusterprofiles.yaml echo "Install managed-serviceaccount" -cd /path/to/managed-serviceaccount # TODO: Replace here with your actual path. +git clone git@github.com:open-cluster-management-io/managed-serviceaccount.git || true +cd managed-serviceaccount helm uninstall -n open-cluster-management-addon managed-serviceaccount || true helm install \ -n open-cluster-management-addon --create-namespace \ @@ -83,24 +78,30 @@ helm install \ --set enableAddOnDeploymentConfig=true \ --set hubDeployMode=AddOnTemplate cd - +rm -r managed-serviceaccount echo "Install managed-serviceaccount mca" clusteradm create clusterset spoke clusteradm clusterset set spoke --clusters ${c1},${c2},${c3} clusteradm clusterset bind spoke --namespace default kubectl apply -f env/placement.yaml || true -kubectl apply -f env/mg-sa-cma-0.6.0.yaml || true +kubectl patch clustermanagementaddon managed-serviceaccount --type='json' -p="$(cat env/patch-mg-sa-cma.json)" || true echo "Install cluster-permission" -cd /path/to/OCM/cluster-permission # TODO: Replace here with your actual path. -make install -make deploy +git clone git@github.com:open-cluster-management-io/cluster-permission.git || true +cd cluster-permission +kubectl apply -f config/crds +kubectl apply -f config/rbac +kubectl apply -f config/deploy cd - +rm -r cluster-permission echo "Install resource-usage-collect-addon" -cd /path/to/addon-contrib/resource-usage-collect-addon # TODO: Replace here with your actual path. -IMAGE_NAME=zheshen/resource-usage-collect-addon:latest make deploy +git clone git@github.com:open-cluster-management-io/addon-contrib.git || true +cd addon-contrib/resource-usage-collect-addon +make deploy cd - +rm -r addon-contrib echo "Enable MultiKueue on the hub" kubectl patch deployment kueue-controller-manager -n kueue-system --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args", "value": ["--config=/controller_manager_config.yaml", "--zap-log-level=2", "--feature-gates=MultiKueue=true"]}]' @@ -109,6 +110,17 @@ echo "Setup queue on the spoke" kubectl apply -f env/single-clusterqueue-setup-mwrs.yaml echo "Setup credentials for clusterprofile" -kubectl apply -f env/authtokenrequest-c1.yaml -kubectl apply -f env/authtokenrequest-c2.yaml -kubectl apply -f env/authtokenrequest-c3.yaml +kubectl apply -f env/cp-c1.yaml +kubectl apply -f env/cp-c2.yaml +kubectl apply -f env/cp-c3.yaml +kubectl apply -f env/msa-c1.yaml +kubectl apply -f env/msa-c2.yaml +kubectl apply -f env/msa-c3.yaml + +echo "Setup faked GPU on the spoke" +kubectl label managedcluster cluster2 accelerator=nvidia-tesla-t4 +kubectl label managedcluster cluster3 accelerator=nvidia-tesla-t4 + +echo "IMPORTANT: RUN BELOW COMMAND MANUALLY on cluster2 and cluster3 !!!" +echo "kubectl edit-status node cluster2-control-plane --context ${c2ctx}" with nvidia.com/gpu: "3" +echo "kubectl edit-status node cluster3-control-plane --context ${c3ctx}" with nvidia.com/gpu: "3"