From 97057e44c2e3aa487077020d096f1f07025c8a6a Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Mon, 12 Aug 2024 18:03:08 +0300 Subject: [PATCH 1/9] adding kwok fake gpu device plugin --- Dockerfile | 40 ++++--- Makefile | 3 +- cmd/kwok-gpu-device-plugin/main.go | 16 +++ .../kwok-gpu-device-plugin/clusterrole.yaml | 26 +++++ .../clusterrolebinding.yaml | 12 ++ .../kwok-gpu-device-plugin/deployment.yaml | 39 +++++++ .../kwok-gpu-device-plugin/role.yaml | 13 +++ .../kwok-gpu-device-plugin/rolebinding.yaml | 12 ++ .../serviceaccount.yaml | 4 + deploy/fake-gpu-operator/values.yaml | 15 ++- internal/kwok-gpu-device-plugin/app.go | 77 +++++++++++++ .../controllers/configmap/controller.go | 103 ++++++++++++++++++ .../handlers/configmap/handler.go | 59 ++++++++++ 13 files changed, 402 insertions(+), 17 deletions(-) create mode 100644 cmd/kwok-gpu-device-plugin/main.go create mode 100644 deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml create mode 100644 internal/kwok-gpu-device-plugin/app.go create mode 100644 internal/kwok-gpu-device-plugin/controllers/configmap/controller.go create mode 100644 internal/kwok-gpu-device-plugin/handlers/configmap/handler.go diff --git a/Dockerfile b/Dockerfile index dbc6162..b1053de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=$BUILDPLATFORM golang:1.22.1 as common-builder +FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator COPY go.mod . COPY go.sum . @@ -7,60 +7,70 @@ COPY Makefile . COPY internal/common ./internal/common ARG TARGETOS TARGETARCH -FROM common-builder as device-plugin-builder +FROM common-builder AS device-plugin-builder COPY ./cmd/device-plugin/ ./cmd/device-plugin/ COPY ./internal/deviceplugin/ ./internal/deviceplugin/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=device-plugin -FROM common-builder as status-updater-builder +FROM common-builder AS status-updater-builder COPY ./cmd/status-updater/ ./cmd/status-updater/ COPY ./internal/status-updater/ ./internal/status-updater/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-updater -FROM common-builder as status-exporter-builder +FROM common-builder AS kwok-gpu-device-plugin-builder +COPY ./cmd/kwok-gpu-device-plugin/ ./cmd/kwok-gpu-device-plugin/ +COPY ./internal/status-updater/ ./internal/status-updater/ +COPY ./internal/kwok-gpu-device-plugin/ ./internal/kwok-gpu-device-plugin/ +RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=kwok-gpu-device-plugin + +FROM common-builder AS status-exporter-builder COPY ./cmd/status-exporter/ ./cmd/status-exporter/ COPY ./internal/ ./internal/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-exporter -FROM common-builder as topology-server-builder +FROM common-builder AS topology-server-builder COPY ./cmd/topology-server/ ./cmd/topology-server/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=topology-server -FROM common-builder as nvidia-smi-builder +FROM common-builder AS nvidia-smi-builder COPY ./cmd/nvidia-smi/ ./cmd/nvidia-smi/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=nvidia-smi -FROM common-builder as mig-faker-builder +FROM common-builder AS mig-faker-builder COPY ./cmd/mig-faker/ ./cmd/mig-faker/ COPY ./internal/ ./internal/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=mig-faker -FROM common-builder as preloader-builder +FROM common-builder AS preloader-builder COPY ./cmd/preloader/ ./cmd/preloader/ RUN make build-preloader -FROM jupyter/minimal-notebook as jupyter-notebook +FROM jupyter/minimal-notebook AS jupyter-notebook COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/ -FROM ubuntu as device-plugin +FROM ubuntu AS device-plugin COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/ COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/ COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/memory/preloader.so COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/pid/preloader.so ENTRYPOINT ["/bin/device-plugin"] -FROM ubuntu as status-updater +FROM ubuntu AS status-updater COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-updater /bin/ ENTRYPOINT ["/bin/status-updater"] -FROM ubuntu as status-exporter +FROM ubuntu AS status-exporter COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/ ENTRYPOINT ["/bin/status-exporter"] -FROM ubuntu as topology-server +FROM ubuntu AS topology-server COPY --from=topology-server-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/topology-server /bin/ ENTRYPOINT ["/bin/topology-server"] -FROM ubuntu as mig-faker +FROM ubuntu AS mig-faker COPY --from=mig-faker-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/mig-faker /bin/ -ENTRYPOINT ["/bin/mig-faker"] \ No newline at end of file +ENTRYPOINT ["/bin/mig-faker"] + +FROM ubuntu AS kwok-gpu-device-plugin +COPY --from=kwok-gpu-device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-gpu-device-plugin /bin/ +ENTRYPOINT ["/bin/kwok-gpu-device-plugin"] diff --git a/Makefile b/Makefile index 151ca6c..f2951c9 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ COMPONENT="$1" DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator DOCKER_REPO_FULL=${DOCKER_REPO_BASE}/${COMPONENT} -DOCKER_TAG=0.0.0-dev +DOCKER_TAG?=0.0.0-dev DOCKER_IMAGE_NAME=${DOCKER_REPO_FULL}:${DOCKER_TAG} NAMESPACE=gpu-operator @@ -39,6 +39,7 @@ image: init-buildx images: make image COMPONENT=device-plugin make image COMPONENT=status-updater + make image COMPONENT=kwok-gpu-device-plugin make image COMPONENT=status-exporter make image COMPONENT=topology-server make image COMPONENT=mig-faker diff --git a/cmd/kwok-gpu-device-plugin/main.go b/cmd/kwok-gpu-device-plugin/main.go new file mode 100644 index 0000000..d781784 --- /dev/null +++ b/cmd/kwok-gpu-device-plugin/main.go @@ -0,0 +1,16 @@ +package main + +import ( + "github.com/run-ai/fake-gpu-operator/internal/common/app" + "github.com/run-ai/fake-gpu-operator/internal/common/config" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + status_updater "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin" +) + +func main() { + requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs} + config.ValidateConfig(requiredEnvVars) + + appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{}) + appRunner.Run() +} diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml new file mode 100644 index 0000000..a70546c --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml @@ -0,0 +1,26 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fake-kwok-gpu-device-plugin +rules: + - apiGroups: + - "" + resources: + - nodes + - nodes/status + verbs: + - update + - list + - get + - watch + - patch + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - update + - create + - list + - delete diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml new file mode 100644 index 0000000..a04d31a --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fake-kwok-gpu-device-plugin +roleRef: + kind: ClusterRole + apiGroup: rbac.authorization.k8s.io + name: fake-kwok-gpu-device-plugin +subjects: + - kind: ServiceAccount + name: kwok-gpu-device-plugin + namespace: "{{ .Release.Namespace }}" diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml new file mode 100644 index 0000000..e4844fb --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml @@ -0,0 +1,39 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kwok-gpu-device-plugin + annotations: + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + labels: + app: kwok-gpu-device-plugin +spec: + selector: + matchLabels: + app: kwok-gpu-device-plugin + component: kwok-gpu-device-plugin + replicas: 1 + template: + metadata: + annotations: + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + labels: + app: kwok-gpu-device-plugin + component: kwok-gpu-device-plugin + spec: + containers: + - name: kwok-gpu-device-plugin + image: "{{ .Values.kwokGpuDevicePlugin.image.repository }}:{{ .Values.kwokGpuDevicePlugin.image.tag }}" + imagePullPolicy: "{{ .Values.kwokGpuDevicePlugin.image.pullPolicy }}" + resources: + {{- toYaml .Values.kwokGpuDevicePlugin.resources | nindent 12 }} + env: + - name: TOPOLOGY_CM_NAME + value: topology + - name: TOPOLOGY_CM_NAMESPACE + value: "{{ .Release.Namespace }}" + - name: FAKE_GPU_OPERATOR_NAMESPACE + value: "{{ .Release.Namespace }}" + restartPolicy: Always + serviceAccountName: kwok-gpu-device-plugin + imagePullSecrets: + - name: gcr-secret diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml new file mode 100644 index 0000000..3ac8803 --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: plugin +rules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - list + - get + - watch diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml new file mode 100644 index 0000000..2d04e08 --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fake-kwok-gpu-device-plugin +roleRef: + kind: Role + apiGroup: rbac.authorization.k8s.io + name: fake-kwok-gpu-device-plugin +subjects: + - kind: ServiceAccount + name: kwok-gpu-device-plugin + namespace: "{{ .Release.Namespace }}" diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml new file mode 100644 index 0000000..01c0739 --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kwok-gpu-device-plugin diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index a740a22..d7d09bf 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -54,6 +54,19 @@ statusExporter: memory: "200Mi" topologyMaxExportInterval: 10s +kwokGpuDevicePlugin: + image: + pullPolicy: Always + repository: gcr.io/run-ai-lab/fake-gpu-operator/kwok-gpu-device-plugin + tag: 0.0.1 + resources: + requests: + cpu: "100m" + memory: "200Mi" + limits: + cpu: "200m" + memory: "400Mi" + migFaker: image: pullPolicy: Always @@ -72,4 +85,4 @@ topology: gpuCount: 2 gpuMemory: 11441 nodePoolLabelKey: run.ai/simulated-gpu-node-pool - migStrategy: mixed \ No newline at end of file + migStrategy: mixed diff --git a/internal/kwok-gpu-device-plugin/app.go b/internal/kwok-gpu-device-plugin/app.go new file mode 100644 index 0000000..f2487d8 --- /dev/null +++ b/internal/kwok-gpu-device-plugin/app.go @@ -0,0 +1,77 @@ +package kwokgdp + +import ( + "sync" + + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + "github.com/spf13/viper" + ctrl "sigs.k8s.io/controller-runtime" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + cmcontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/configmap" + "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" +) + +var InClusterConfigFn = ctrl.GetConfigOrDie +var KubeClientFn = func(c *rest.Config) kubernetes.Interface { + return kubernetes.NewForConfigOrDie(c) +} + +var DynamicClientFn = func(c *rest.Config) dynamic.Interface { + return dynamic.NewForConfigOrDie(c) +} + +type StatusUpdaterAppConfiguration struct { + TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"` + TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"` +} + +type StatusUpdaterApp struct { + Controllers []controllers.Interface + kubeClient kubernetes.Interface + stopCh chan struct{} + wg *sync.WaitGroup +} + +func (app *StatusUpdaterApp) Run() { + app.wg.Add(len(app.Controllers)) + for _, controller := range app.Controllers { + go func(controller controllers.Interface) { + defer app.wg.Done() + controller.Run(app.stopCh) + }(controller) + } + + app.wg.Wait() +} + +func (app *StatusUpdaterApp) Init(stopCh chan struct{}) { + app.stopCh = stopCh + + clusterConfig := InClusterConfigFn() + clusterConfig.QPS = 100 + clusterConfig.Burst = 200 + + app.wg = &sync.WaitGroup{} + + app.kubeClient = KubeClientFn(clusterConfig) + + app.Controllers = append( + app.Controllers, cmcontroller.NewConfigMapController( + app.kubeClient, viper.GetString(constants.EnvTopologyCmNamespace), + ), + ) +} + +func (app *StatusUpdaterApp) Name() string { + return "StatusUpdater" +} + +func (app *StatusUpdaterApp) GetConfig() interface{} { + var config StatusUpdaterAppConfiguration + + return config +} diff --git a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go new file mode 100644 index 0000000..3cd0296 --- /dev/null +++ b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go @@ -0,0 +1,103 @@ +package configmamp + +import ( + "log" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" + "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" + "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers/util" + + cmhandler "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/handlers/configmap" + + v1 "k8s.io/api/core/v1" + listersv1 "k8s.io/client-go/listers/core/v1" + + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" +) + +type ConfigMapController struct { + kubeClient kubernetes.Interface + cmInformer cache.SharedIndexInformer + nodeLister listersv1.NodeLister + informerFactory informers.SharedInformerFactory + handler cmhandler.Interface + + clusterTopology *topology.ClusterTopology +} + +var _ controllers.Interface = &ConfigMapController{} + +func NewConfigMapController( + kubeClient kubernetes.Interface, namespace string, +) *ConfigMapController { + clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient) + if err != nil { + log.Fatalf("Failed to get cluster topology: %v", err) + } + + informerFactory := informers.NewSharedInformerFactoryWithOptions( + kubeClient, 0, informers.WithNamespace(namespace)) + c := &ConfigMapController{ + kubeClient: kubeClient, + cmInformer: informerFactory.Core().V1().ConfigMaps().Informer(), + nodeLister: informerFactory.Core().V1().Nodes().Lister(), + handler: cmhandler.NewConfigMapHandler(kubeClient, clusterTopology), + clusterTopology: clusterTopology, + } + + _, err = c.cmInformer.AddEventHandler(cache.FilteringResourceEventHandler{ + FilterFunc: func(obj interface{}) bool { + switch cm := obj.(type) { + case *v1.ConfigMap: + return c.isFakeGpuKWOKNodeConfigMap(cm) + default: + return false + } + }, + Handler: cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + go func() { + c.callConfigMapHandler(obj.(*v1.ConfigMap)) + }() + }, + }, + }) + if err != nil { + log.Fatalf("Failed to add config map event handler: %v", err) + } + + return c +} + +func (c *ConfigMapController) Run(stopCh <-chan struct{}) { + log.Println("Starting config map controller") + c.informerFactory.Start(stopCh) +} + +func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool { + if cm == nil || cm.Labels == nil { + return false + } + nodeName := cm.Labels[constants.LabelTopologyCMNodeName] + + node, err := c.nodeLister.Get(nodeName) + if err != nil { + return false + } + + _, isNodeAssignedToNodePool := node.Labels[c.clusterTopology.NodePoolLabelKey] + return isNodeAssignedToNodePool && node.Annotations[constants.AnnotationKwokNode] == "fake" +} + +func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap) { + nodeName := cm.Labels[constants.LabelTopologyCMNodeName] + node, err := c.nodeLister.Get(nodeName) + if err != nil { + log.Printf("Failed to get node %s: %v", nodeName, err) + return + } + util.LogErrorIfExist(c.handler.HandleAdd(cm, node), "Failed to handle cm addition") +} diff --git a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go new file mode 100644 index 0000000..5cc5332 --- /dev/null +++ b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go @@ -0,0 +1,59 @@ +package configmap + +import ( + "context" + "fmt" + "log" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" +) + +type Interface interface { + HandleAdd(cm *v1.ConfigMap, node *v1.Node) error +} + +type ConfigMapHandler struct { + kubeClient kubernetes.Interface + + clusterTopology *topology.ClusterTopology +} + +var _ Interface = &ConfigMapHandler{} + +func NewConfigMapHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *ConfigMapHandler { + return &ConfigMapHandler{ + kubeClient: kubeClient, + clusterTopology: clusterTopology, + } +} + +func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap, node *v1.Node) error { + log.Printf("Handling node addition: %s\n", cm.Name) + + nodeTopology, err := topology.FromNodeTopologyCM(cm) + if err != nil { + return fmt.Errorf("failed to create node topology ConfigMap: %w", err) + } + + return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), node) +} + +func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, node *v1.Node) error { + patch := fmt.Sprintf( + `{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`, + constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount, + ) + _, err := p.kubeClient.CoreV1().Nodes().Patch( + context.TODO(), node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status", + ) + if err != nil { + return fmt.Errorf("failed to update node capacity and allocatable: %v", err) + } + + return nil +} From eeb1ecf625fc11c47984023df0c909b4b7bf9929 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Thu, 15 Aug 2024 00:58:00 +0300 Subject: [PATCH 2/9] adding kwok dp app test --- cmd/kwok-gpu-device-plugin/main.go | 2 +- internal/kwok-gpu-device-plugin/app.go | 28 +--- internal/kwok-gpu-device-plugin/app_test.go | 146 ++++++++++++++++++ .../controllers/configmap/controller.go | 1 + .../handlers/configmap/handler.go | 4 +- 5 files changed, 156 insertions(+), 25 deletions(-) create mode 100644 internal/kwok-gpu-device-plugin/app_test.go diff --git a/cmd/kwok-gpu-device-plugin/main.go b/cmd/kwok-gpu-device-plugin/main.go index d781784..822f3f2 100644 --- a/cmd/kwok-gpu-device-plugin/main.go +++ b/cmd/kwok-gpu-device-plugin/main.go @@ -11,6 +11,6 @@ func main() { requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs} config.ValidateConfig(requiredEnvVars) - appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{}) + appRunner := app.NewAppRunner(&status_updater.KWOKDevicePluginApp{}) appRunner.Run() } diff --git a/internal/kwok-gpu-device-plugin/app.go b/internal/kwok-gpu-device-plugin/app.go index f2487d8..bd39fca 100644 --- a/internal/kwok-gpu-device-plugin/app.go +++ b/internal/kwok-gpu-device-plugin/app.go @@ -1,8 +1,6 @@ package kwokgdp import ( - "sync" - "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -24,39 +22,27 @@ var DynamicClientFn = func(c *rest.Config) dynamic.Interface { return dynamic.NewForConfigOrDie(c) } -type StatusUpdaterAppConfiguration struct { - TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"` - TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"` -} - -type StatusUpdaterApp struct { +type KWOKDevicePluginApp struct { Controllers []controllers.Interface kubeClient kubernetes.Interface stopCh chan struct{} - wg *sync.WaitGroup } -func (app *StatusUpdaterApp) Run() { - app.wg.Add(len(app.Controllers)) +func (app *KWOKDevicePluginApp) Run() { for _, controller := range app.Controllers { go func(controller controllers.Interface) { - defer app.wg.Done() controller.Run(app.stopCh) }(controller) } - - app.wg.Wait() } -func (app *StatusUpdaterApp) Init(stopCh chan struct{}) { +func (app *KWOKDevicePluginApp) Init(stopCh chan struct{}) { app.stopCh = stopCh clusterConfig := InClusterConfigFn() clusterConfig.QPS = 100 clusterConfig.Burst = 200 - app.wg = &sync.WaitGroup{} - app.kubeClient = KubeClientFn(clusterConfig) app.Controllers = append( @@ -66,12 +52,10 @@ func (app *StatusUpdaterApp) Init(stopCh chan struct{}) { ) } -func (app *StatusUpdaterApp) Name() string { +func (app *KWOKDevicePluginApp) Name() string { return "StatusUpdater" } -func (app *StatusUpdaterApp) GetConfig() interface{} { - var config StatusUpdaterAppConfiguration - - return config +func (app *KWOKDevicePluginApp) GetConfig() interface{} { + return nil } diff --git a/internal/kwok-gpu-device-plugin/app_test.go b/internal/kwok-gpu-device-plugin/app_test.go new file mode 100644 index 0000000..781f6ee --- /dev/null +++ b/internal/kwok-gpu-device-plugin/app_test.go @@ -0,0 +1,146 @@ +package kwokgdp + +import ( + "context" + "sync" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/spf13/viper" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/fake" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" + cmcontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/configmap" + "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" +) + +const ( + gpuOperatorNamespace = "gpu-operator" + nodePoolLabelKey = "run.ai/node-pool" + defaultNodePoolName = "default" +) + +func TestKwokGpuDevicePlugin(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "KwokGpuDevicePlugin Suite") +} + +var _ = Describe("KwokGpuDevicePlugin", func() { + var ( + app *KWOKDevicePluginApp + kubeClient kubernetes.Interface + stopChan chan struct{} + wg *sync.WaitGroup + ) + + BeforeEach(func() { + clusterTopology := topology.ClusterTopology{ + NodePoolLabelKey: nodePoolLabelKey, + NodePools: map[string]topology.NodePoolTopology{ + defaultNodePoolName: { + GpuCount: 4, + GpuMemory: 1000, + GpuProduct: "nvidia-tesla-t4", + }, + }, + MigStrategy: "none", + } + clusterTopologyCM, err := topology.ToClusterTopologyCM(&clusterTopology) + Expect(err).ToNot(HaveOccurred()) + clusterTopologyCM.Name = "cluster-topology" + clusterTopologyCM.Namespace = gpuOperatorNamespace + + kubeClient = fake.NewSimpleClientset(clusterTopologyCM) + stopChan = make(chan struct{}) + + viper.SetDefault(constants.EnvTopologyCmName, clusterTopologyCM.Name) + viper.SetDefault(constants.EnvTopologyCmNamespace, gpuOperatorNamespace) + + app = &KWOKDevicePluginApp{ + Controllers: []controllers.Interface{ + cmcontroller.NewConfigMapController( + kubeClient, gpuOperatorNamespace, + ), + }, + kubeClient: kubeClient, + stopCh: stopChan, + } + wg = &sync.WaitGroup{} + wg.Add(1) + go func() { + app.Run() + wg.Done() + }() + }) + + AfterEach(func() { + close(stopChan) + wg.Wait() + }) + + Context("app", func() { + It("should run until channel is closed", func() {}) + + Context("ConfigMap", func() { + It("should handle new Config Map without node labels", func() { + _, err := kubeClient.CoreV1().ConfigMaps(gpuOperatorNamespace).Create(context.TODO(), &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "configmap1", + Namespace: gpuOperatorNamespace, + }, + }, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should add gpu devices to kwok nodes by configmap data", func() { + node1 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Labels: map[string]string{ + nodePoolLabelKey: defaultNodePoolName, + }, + Annotations: map[string]string{ + constants.AnnotationKwokNode: "fake", + }, + }, + } + _, err := kubeClient.CoreV1().Nodes().Create(context.TODO(), node1, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + nodeTopology := topology.NodeTopology{ + GpuMemory: 1000, + GpuProduct: "nvidia-tesla-t4", + Gpus: []topology.GpuDetails{ + {ID: "fake-gpu-id-1", Status: topology.GpuStatus{}}, + {ID: "fake-gpu-id-2", Status: topology.GpuStatus{}}, + {ID: "fake-gpu-id-3", Status: topology.GpuStatus{}}, + {ID: "fake-gpu-id-4", Status: topology.GpuStatus{}}, + }, + } + cm, err := topology.ToNodeTopologyCM(&nodeTopology, node1.Name) + Expect(err).ToNot(HaveOccurred()) + cm.Namespace = gpuOperatorNamespace + + _, err = kubeClient.CoreV1().ConfigMaps(gpuOperatorNamespace).Create(context.TODO(), cm, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + Eventually(func() bool { + node, err := kubeClient.CoreV1().Nodes().Get(context.TODO(), node1.Name, metav1.GetOptions{}) + if err != nil { + return false + } + + gpuQuantity := node.Status.Capacity[constants.GpuResourceName] + return gpuQuantity.Value() == int64(4) + }, 2*time.Second, 100*time.Millisecond).Should(BeTrue()) + }) + }) + }) +}) diff --git a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go index 3cd0296..6efd194 100644 --- a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go +++ b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go @@ -44,6 +44,7 @@ func NewConfigMapController( kubeClient: kubeClient, cmInformer: informerFactory.Core().V1().ConfigMaps().Informer(), nodeLister: informerFactory.Core().V1().Nodes().Lister(), + informerFactory: informerFactory, handler: cmhandler.NewConfigMapHandler(kubeClient, clusterTopology), clusterTopology: clusterTopology, } diff --git a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go index 5cc5332..92683d6 100644 --- a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go +++ b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go @@ -33,11 +33,11 @@ func NewConfigMapHandler(kubeClient kubernetes.Interface, clusterTopology *topol } func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap, node *v1.Node) error { - log.Printf("Handling node addition: %s\n", cm.Name) + log.Printf("Handling config map addition: %s\n", cm.Name) nodeTopology, err := topology.FromNodeTopologyCM(cm) if err != nil { - return fmt.Errorf("failed to create node topology ConfigMap: %w", err) + return fmt.Errorf("failed to read node topology ConfigMap: %w", err) } return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), node) From 62b10f0ba57eb6b1d143df3b4697bd7627bd97fa Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Thu, 15 Aug 2024 14:19:55 +0300 Subject: [PATCH 3/9] delete device plugin deployment template --- .../device-plugin/deployment-template.yaml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml diff --git a/deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml b/deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml deleted file mode 100644 index 751b421..0000000 --- a/deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }} - labels: - {{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }} - run.ai/fake-node-deployment-template: "true" -spec: - replicas: 0 - selector: - {{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }} - template: - metadata: - {{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }} - spec: - {{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }} \ No newline at end of file From 1df0f55ad39086134ae79ae7d15f0e00824fabb7 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Thu, 15 Aug 2024 14:42:32 +0300 Subject: [PATCH 4/9] fix kwok device plugin app config --- internal/kwok-gpu-device-plugin/app.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/internal/kwok-gpu-device-plugin/app.go b/internal/kwok-gpu-device-plugin/app.go index bd39fca..f32eb50 100644 --- a/internal/kwok-gpu-device-plugin/app.go +++ b/internal/kwok-gpu-device-plugin/app.go @@ -1,7 +1,6 @@ package kwokgdp import ( - "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -18,8 +17,9 @@ var KubeClientFn = func(c *rest.Config) kubernetes.Interface { return kubernetes.NewForConfigOrDie(c) } -var DynamicClientFn = func(c *rest.Config) dynamic.Interface { - return dynamic.NewForConfigOrDie(c) +type StatusUpdaterAppConfiguration struct { + TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"` + TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"` } type KWOKDevicePluginApp struct { @@ -57,5 +57,7 @@ func (app *KWOKDevicePluginApp) Name() string { } func (app *KWOKDevicePluginApp) GetConfig() interface{} { - return nil + var config StatusUpdaterAppConfiguration + + return config } From 2d750a970d443526160f83dac7b62a36ef31d714 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Thu, 15 Aug 2024 14:46:53 +0300 Subject: [PATCH 5/9] fix kwok device plugin role name --- .../templates/kwok-gpu-device-plugin/role.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml index 3ac8803..9c8d2cc 100644 --- a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml +++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml @@ -1,7 +1,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: plugin + name: fake-kwok-gpu-device-plugin rules: - apiGroups: - "" From d2f27b5eda6627242421683a97a73a829e219212 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Sun, 18 Aug 2024 03:01:50 +0300 Subject: [PATCH 6/9] adding kwok cm controller retries --- .../controllers/configmap/controller.go | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go index 6efd194..d2e57e8 100644 --- a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go +++ b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go @@ -1,7 +1,9 @@ package configmamp import ( + "context" "log" + "time" "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" @@ -11,6 +13,7 @@ import ( cmhandler "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/handlers/configmap" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" listersv1 "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/informers" @@ -18,6 +21,11 @@ import ( "k8s.io/client-go/tools/cache" ) +const ( + maxRetryCount = 10 + baseRetryDelay = time.Millisecond * 100 +) + type ConfigMapController struct { kubeClient kubernetes.Interface cmInformer cache.SharedIndexInformer @@ -61,7 +69,7 @@ func NewConfigMapController( Handler: cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { go func() { - c.callConfigMapHandler(obj.(*v1.ConfigMap)) + c.callConfigMapHandler(obj.(*v1.ConfigMap), 0) }() }, }, @@ -82,22 +90,33 @@ func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool if cm == nil || cm.Labels == nil { return false } - nodeName := cm.Labels[constants.LabelTopologyCMNodeName] + nodeName, foundNodeName := cm.Labels[constants.LabelTopologyCMNodeName] + if !foundNodeName { + return false + } node, err := c.nodeLister.Get(nodeName) if err != nil { - return false + node, err = c.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + if err != nil { + log.Printf("Failed to get node %s: %v", nodeName, err) + return false + } } - _, isNodeAssignedToNodePool := node.Labels[c.clusterTopology.NodePoolLabelKey] - return isNodeAssignedToNodePool && node.Annotations[constants.AnnotationKwokNode] == "fake" + return node.Annotations[constants.AnnotationKwokNode] == "fake" } -func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap) { +func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap, retryCount int) { nodeName := cm.Labels[constants.LabelTopologyCMNodeName] node, err := c.nodeLister.Get(nodeName) if err != nil { - log.Printf("Failed to get node %s: %v", nodeName, err) + delay := baseRetryDelay * (1 << retryCount) + log.Printf("Failed to get node %s: %v. retry in %v", nodeName, err, delay) + time.Sleep(delay) + if retryCount < maxRetryCount { + c.callConfigMapHandler(cm, retryCount+1) + } return } util.LogErrorIfExist(c.handler.HandleAdd(cm, node), "Failed to handle cm addition") From 4e370cb7fec7103d51d97c4693228e90062cae8c Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Tue, 20 Aug 2024 16:18:10 +0300 Subject: [PATCH 7/9] mark configmap with kwok status to simplify kwok gdp --- internal/common/topology/kubernetes.go | 24 +++++++---- internal/kwok-gpu-device-plugin/app_test.go | 5 ++- .../controllers/configmap/controller.go | 40 ++----------------- .../handlers/configmap/handler.go | 11 ++--- internal/status-exporter/app_test.go | 4 +- .../handlers/node/topology_cm.go | 2 +- 6 files changed, 34 insertions(+), 52 deletions(-) diff --git a/internal/common/topology/kubernetes.go b/internal/common/topology/kubernetes.go index de612fa..b40603c 100644 --- a/internal/common/topology/kubernetes.go +++ b/internal/common/topology/kubernetes.go @@ -10,6 +10,7 @@ import ( "github.com/spf13/viper" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + apcorev1 "k8s.io/client-go/applyconfigurations/core/v1" "k8s.io/client-go/kubernetes" ) @@ -25,11 +26,17 @@ func GetNodeTopologyFromCM(kubeclient kubernetes.Interface, nodeName string) (*N return FromNodeTopologyCM(cm) } -func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error { - cm, err := ToNodeTopologyCM(nodeTopology, nodeName) +func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, node *corev1.Node) error { + cm, _, err := ToNodeTopologyCM(nodeTopology, node.Name) if err != nil { return err } + if value, found := node.Annotations[constants.AnnotationKwokNode]; found { + if cm.Annotations == nil { + cm.Annotations = make(map[string]string) + } + cm.Annotations[constants.AnnotationKwokNode] = value + } _, err = kubeclient.CoreV1().ConfigMaps( viper.GetString(constants.EnvTopologyCmNamespace)).Create(context.TODO(), cm, metav1.CreateOptions{}) @@ -37,13 +44,13 @@ func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTop } func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error { - cm, err := ToNodeTopologyCM(nodeTopology, nodeName) + _, cm, err := ToNodeTopologyCM(nodeTopology, nodeName) if err != nil { return err } _, err = kubeclient.CoreV1().ConfigMaps( - viper.GetString(constants.EnvTopologyCmNamespace)).Update(context.TODO(), cm, metav1.UpdateOptions{}) + viper.GetString(constants.EnvTopologyCmNamespace)).Apply(context.TODO(), cm, metav1.ApplyOptions{}) return err } @@ -108,7 +115,7 @@ func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, e return cm, nil } -func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, error) { +func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, *apcorev1.ConfigMapApplyConfiguration, error) { cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: GetNodeTopologyCMName(nodeName), @@ -120,15 +127,18 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf }, Data: make(map[string]string), } + cmApplyConfig := apcorev1.ConfigMap(cm.Name, cm.Namespace).WithLabels(cm.Labels) topologyData, err := yaml.Marshal(nodeTopology) if err != nil { - return nil, err + return nil, nil, err } cm.Data[cmTopologyKey] = string(topologyData) - return cm, nil + cmApplyConfig = cmApplyConfig.WithData(cm.Data) + + return cm, cmApplyConfig, nil } func GetNodeTopologyCMName(nodeName string) string { diff --git a/internal/kwok-gpu-device-plugin/app_test.go b/internal/kwok-gpu-device-plugin/app_test.go index 781f6ee..46b9c3b 100644 --- a/internal/kwok-gpu-device-plugin/app_test.go +++ b/internal/kwok-gpu-device-plugin/app_test.go @@ -124,9 +124,12 @@ var _ = Describe("KwokGpuDevicePlugin", func() { {ID: "fake-gpu-id-4", Status: topology.GpuStatus{}}, }, } - cm, err := topology.ToNodeTopologyCM(&nodeTopology, node1.Name) + cm, _, err := topology.ToNodeTopologyCM(&nodeTopology, node1.Name) Expect(err).ToNot(HaveOccurred()) cm.Namespace = gpuOperatorNamespace + cm.Annotations = map[string]string{ + constants.AnnotationKwokNode: "fake", + } _, err = kubeClient.CoreV1().ConfigMaps(gpuOperatorNamespace).Create(context.TODO(), cm, metav1.CreateOptions{}) Expect(err).ToNot(HaveOccurred()) diff --git a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go index d2e57e8..cec9719 100644 --- a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go +++ b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go @@ -1,9 +1,7 @@ package configmamp import ( - "context" "log" - "time" "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" @@ -13,7 +11,6 @@ import ( cmhandler "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/handlers/configmap" v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" listersv1 "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/informers" @@ -21,11 +18,6 @@ import ( "k8s.io/client-go/tools/cache" ) -const ( - maxRetryCount = 10 - baseRetryDelay = time.Millisecond * 100 -) - type ConfigMapController struct { kubeClient kubernetes.Interface cmInformer cache.SharedIndexInformer @@ -69,7 +61,7 @@ func NewConfigMapController( Handler: cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { go func() { - c.callConfigMapHandler(obj.(*v1.ConfigMap), 0) + util.LogErrorIfExist(c.handler.HandleAdd(obj.(*v1.ConfigMap)), "Failed to handle cm addition") }() }, }, @@ -87,37 +79,13 @@ func (c *ConfigMapController) Run(stopCh <-chan struct{}) { } func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool { - if cm == nil || cm.Labels == nil { + if cm == nil || cm.Labels == nil || cm.Annotations == nil { return false } - nodeName, foundNodeName := cm.Labels[constants.LabelTopologyCMNodeName] + _, foundNodeName := cm.Labels[constants.LabelTopologyCMNodeName] if !foundNodeName { return false } - node, err := c.nodeLister.Get(nodeName) - if err != nil { - node, err = c.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) - if err != nil { - log.Printf("Failed to get node %s: %v", nodeName, err) - return false - } - } - - return node.Annotations[constants.AnnotationKwokNode] == "fake" -} - -func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap, retryCount int) { - nodeName := cm.Labels[constants.LabelTopologyCMNodeName] - node, err := c.nodeLister.Get(nodeName) - if err != nil { - delay := baseRetryDelay * (1 << retryCount) - log.Printf("Failed to get node %s: %v. retry in %v", nodeName, err, delay) - time.Sleep(delay) - if retryCount < maxRetryCount { - c.callConfigMapHandler(cm, retryCount+1) - } - return - } - util.LogErrorIfExist(c.handler.HandleAdd(cm, node), "Failed to handle cm addition") + return cm.Annotations[constants.AnnotationKwokNode] == "fake" } diff --git a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go index 92683d6..3c342f2 100644 --- a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go +++ b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go @@ -14,7 +14,7 @@ import ( ) type Interface interface { - HandleAdd(cm *v1.ConfigMap, node *v1.Node) error + HandleAdd(cm *v1.ConfigMap) error } type ConfigMapHandler struct { @@ -32,24 +32,25 @@ func NewConfigMapHandler(kubeClient kubernetes.Interface, clusterTopology *topol } } -func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap, node *v1.Node) error { +func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap) error { log.Printf("Handling config map addition: %s\n", cm.Name) nodeTopology, err := topology.FromNodeTopologyCM(cm) if err != nil { return fmt.Errorf("failed to read node topology ConfigMap: %w", err) } + nodeName := cm.Labels[constants.LabelTopologyCMNodeName] - return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), node) + return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), nodeName) } -func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, node *v1.Node) error { +func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, nodeName string) error { patch := fmt.Sprintf( `{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`, constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount, ) _, err := p.kubeClient.CoreV1().Nodes().Patch( - context.TODO(), node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status", + context.TODO(), nodeName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status", ) if err != nil { return fmt.Errorf("failed to update node capacity and allocatable: %v", err) diff --git a/internal/status-exporter/app_test.go b/internal/status-exporter/app_test.go index 7c5039f..7a6a43f 100644 --- a/internal/status-exporter/app_test.go +++ b/internal/status-exporter/app_test.go @@ -74,7 +74,7 @@ var _ = Describe("StatusExporter", func() { time.Sleep(1000 * time.Millisecond) initialTopology := createNodeTopology() - cm, err := topology.ToNodeTopologyCM(initialTopology, nodeName) + cm, _, err := topology.ToNodeTopologyCM(initialTopology, nodeName) Expect(err).To(Not(HaveOccurred())) _, err = clientset.CoreV1().ConfigMaps(topologyCmNamespace).Create(context.TODO(), cm, metav1.CreateOptions{}) Expect(err).To(Not(HaveOccurred())) @@ -86,7 +86,7 @@ var _ = Describe("StatusExporter", func() { caseDetails := caseDetails It(caseName, func() { - cm, err := topology.ToNodeTopologyCM(caseDetails.nodeTopologies[nodeName], nodeName) + cm, _, err := topology.ToNodeTopologyCM(caseDetails.nodeTopologies[nodeName], nodeName) Expect(err).ToNot(HaveOccurred()) _, err = clientset.CoreV1().ConfigMaps(topologyCmNamespace).Update(context.TODO(), cm, metav1.UpdateOptions{}) diff --git a/internal/status-updater/handlers/node/topology_cm.go b/internal/status-updater/handlers/node/topology_cm.go index e6a4067..44c707c 100644 --- a/internal/status-updater/handlers/node/topology_cm.go +++ b/internal/status-updater/handlers/node/topology_cm.go @@ -31,7 +31,7 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error { MigStrategy: p.clusterTopology.MigStrategy, } - err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name) + err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node) if err != nil { return fmt.Errorf("failed to create node topology: %w", err) } From cb76c824f094d7aa729afec6642d0b472f67a608 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Tue, 20 Aug 2024 17:48:53 +0300 Subject: [PATCH 8/9] cr 1 --- .../controllers/configmap/controller.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go index cec9719..1981ae0 100644 --- a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go +++ b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go @@ -11,7 +11,6 @@ import ( cmhandler "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/handlers/configmap" v1 "k8s.io/api/core/v1" - listersv1 "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" @@ -19,11 +18,9 @@ import ( ) type ConfigMapController struct { - kubeClient kubernetes.Interface - cmInformer cache.SharedIndexInformer - nodeLister listersv1.NodeLister - informerFactory informers.SharedInformerFactory - handler cmhandler.Interface + kubeClient kubernetes.Interface + cmInformer cache.SharedIndexInformer + handler cmhandler.Interface clusterTopology *topology.ClusterTopology } @@ -43,8 +40,6 @@ func NewConfigMapController( c := &ConfigMapController{ kubeClient: kubeClient, cmInformer: informerFactory.Core().V1().ConfigMaps().Informer(), - nodeLister: informerFactory.Core().V1().Nodes().Lister(), - informerFactory: informerFactory, handler: cmhandler.NewConfigMapHandler(kubeClient, clusterTopology), clusterTopology: clusterTopology, } @@ -75,7 +70,7 @@ func NewConfigMapController( func (c *ConfigMapController) Run(stopCh <-chan struct{}) { log.Println("Starting config map controller") - c.informerFactory.Start(stopCh) + c.cmInformer.Run(stopCh) } func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool { From 63ee0bb55edf3ecb571a35d98a307a0defa00d60 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Tue, 20 Aug 2024 18:03:52 +0300 Subject: [PATCH 9/9] increase test timeout --- internal/kwok-gpu-device-plugin/app_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/kwok-gpu-device-plugin/app_test.go b/internal/kwok-gpu-device-plugin/app_test.go index 46b9c3b..c633f49 100644 --- a/internal/kwok-gpu-device-plugin/app_test.go +++ b/internal/kwok-gpu-device-plugin/app_test.go @@ -142,7 +142,7 @@ var _ = Describe("KwokGpuDevicePlugin", func() { gpuQuantity := node.Status.Capacity[constants.GpuResourceName] return gpuQuantity.Value() == int64(4) - }, 2*time.Second, 100*time.Millisecond).Should(BeTrue()) + }, 3*time.Second, 100*time.Millisecond).Should(BeTrue()) }) }) })