diff --git a/Dockerfile b/Dockerfile
index dbc6162..b1053de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM --platform=$BUILDPLATFORM golang:1.22.1 as common-builder
+FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder
 WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
 COPY go.mod .
 COPY go.sum .
@@ -7,60 +7,70 @@ COPY Makefile .
 COPY internal/common ./internal/common
 ARG TARGETOS TARGETARCH
 
-FROM common-builder as device-plugin-builder
+FROM common-builder AS device-plugin-builder
 COPY ./cmd/device-plugin/ ./cmd/device-plugin/
 COPY ./internal/deviceplugin/ ./internal/deviceplugin/
 RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=device-plugin
 
-FROM common-builder as status-updater-builder
+FROM common-builder AS status-updater-builder
 COPY ./cmd/status-updater/ ./cmd/status-updater/
 COPY ./internal/status-updater/ ./internal/status-updater/
 RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-updater
 
-FROM common-builder as status-exporter-builder
+FROM common-builder AS kwok-gpu-device-plugin-builder
+COPY ./cmd/kwok-gpu-device-plugin/ ./cmd/kwok-gpu-device-plugin/
+COPY ./internal/status-updater/ ./internal/status-updater/
+COPY ./internal/kwok-gpu-device-plugin/ ./internal/kwok-gpu-device-plugin/
+RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=kwok-gpu-device-plugin
+
+FROM common-builder AS status-exporter-builder
 COPY ./cmd/status-exporter/ ./cmd/status-exporter/
 COPY ./internal/ ./internal/
 RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-exporter
 
-FROM common-builder as topology-server-builder
+FROM common-builder AS topology-server-builder
 COPY ./cmd/topology-server/ ./cmd/topology-server/
 RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=topology-server
 
-FROM common-builder as nvidia-smi-builder
+FROM common-builder AS nvidia-smi-builder
 COPY ./cmd/nvidia-smi/ ./cmd/nvidia-smi/
 RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=nvidia-smi
 
-FROM common-builder as mig-faker-builder
+FROM common-builder AS mig-faker-builder
 COPY ./cmd/mig-faker/ ./cmd/mig-faker/
 COPY ./internal/ ./internal/
 RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=mig-faker
 
-FROM common-builder as preloader-builder
+FROM common-builder AS preloader-builder
 COPY ./cmd/preloader/ ./cmd/preloader/
 RUN make build-preloader
 
-FROM jupyter/minimal-notebook as jupyter-notebook
+FROM jupyter/minimal-notebook AS jupyter-notebook
 COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
 
-FROM ubuntu as device-plugin
+FROM ubuntu AS device-plugin
 COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/
 COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
 COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/memory/preloader.so
 COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/pid/preloader.so
 ENTRYPOINT ["/bin/device-plugin"]
 
-FROM ubuntu as status-updater
+FROM ubuntu AS status-updater
 COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-updater /bin/
 ENTRYPOINT ["/bin/status-updater"]
 
["/bin/status-updater"] -FROM ubuntu as status-exporter +FROM ubuntu AS status-exporter COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/ ENTRYPOINT ["/bin/status-exporter"] -FROM ubuntu as topology-server +FROM ubuntu AS topology-server COPY --from=topology-server-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/topology-server /bin/ ENTRYPOINT ["/bin/topology-server"] -FROM ubuntu as mig-faker +FROM ubuntu AS mig-faker COPY --from=mig-faker-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/mig-faker /bin/ -ENTRYPOINT ["/bin/mig-faker"] \ No newline at end of file +ENTRYPOINT ["/bin/mig-faker"] + +FROM ubuntu AS kwok-gpu-device-plugin +COPY --from=kwok-gpu-device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-gpu-device-plugin /bin/ +ENTRYPOINT ["/bin/kwok-gpu-device-plugin"] diff --git a/Makefile b/Makefile index 151ca6c..f2951c9 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ COMPONENT="$1" DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator DOCKER_REPO_FULL=${DOCKER_REPO_BASE}/${COMPONENT} -DOCKER_TAG=0.0.0-dev +DOCKER_TAG?=0.0.0-dev DOCKER_IMAGE_NAME=${DOCKER_REPO_FULL}:${DOCKER_TAG} NAMESPACE=gpu-operator @@ -39,6 +39,7 @@ image: init-buildx images: make image COMPONENT=device-plugin make image COMPONENT=status-updater + make image COMPONENT=kwok-gpu-device-plugin make image COMPONENT=status-exporter make image COMPONENT=topology-server make image COMPONENT=mig-faker diff --git a/cmd/kwok-gpu-device-plugin/main.go b/cmd/kwok-gpu-device-plugin/main.go new file mode 100644 index 0000000..822f3f2 --- /dev/null +++ b/cmd/kwok-gpu-device-plugin/main.go @@ -0,0 +1,16 @@ +package main + +import ( + "github.com/run-ai/fake-gpu-operator/internal/common/app" + "github.com/run-ai/fake-gpu-operator/internal/common/config" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + status_updater "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin" +) + +func main() { + requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs} + config.ValidateConfig(requiredEnvVars) + + appRunner := app.NewAppRunner(&status_updater.KWOKDevicePluginApp{}) + appRunner.Run() +} diff --git a/deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml b/deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml deleted file mode 100644 index 751b421..0000000 --- a/deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }} - labels: - {{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }} - run.ai/fake-node-deployment-template: "true" -spec: - replicas: 0 - selector: - {{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }} - template: - metadata: - {{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }} - spec: - {{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . 
diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml
new file mode 100644
index 0000000..a70546c
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrole.yaml
@@ -0,0 +1,26 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: fake-kwok-gpu-device-plugin
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+      - nodes/status
+    verbs:
+      - update
+      - list
+      - get
+      - watch
+      - patch
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - get
+      - update
+      - create
+      - list
+      - delete
diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml
new file mode 100644
index 0000000..a04d31a
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/clusterrolebinding.yaml
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: fake-kwok-gpu-device-plugin
+roleRef:
+  kind: ClusterRole
+  apiGroup: rbac.authorization.k8s.io
+  name: fake-kwok-gpu-device-plugin
+subjects:
+  - kind: ServiceAccount
+    name: kwok-gpu-device-plugin
+    namespace: "{{ .Release.Namespace }}"
diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml
new file mode 100644
index 0000000..e4844fb
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/deployment.yaml
@@ -0,0 +1,39 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kwok-gpu-device-plugin
+  annotations:
+    checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
+  labels:
+    app: kwok-gpu-device-plugin
+spec:
+  selector:
+    matchLabels:
+      app: kwok-gpu-device-plugin
+      component: kwok-gpu-device-plugin
+  replicas: 1
+  template:
+    metadata:
+      annotations:
+        checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
+      labels:
+        app: kwok-gpu-device-plugin
+        component: kwok-gpu-device-plugin
+    spec:
+      containers:
+        - name: kwok-gpu-device-plugin
+          image: "{{ .Values.kwokGpuDevicePlugin.image.repository }}:{{ .Values.kwokGpuDevicePlugin.image.tag }}"
+          imagePullPolicy: "{{ .Values.kwokGpuDevicePlugin.image.pullPolicy }}"
+          resources:
+            {{- toYaml .Values.kwokGpuDevicePlugin.resources | nindent 12 }}
+          env:
+            - name: TOPOLOGY_CM_NAME
+              value: topology
+            - name: TOPOLOGY_CM_NAMESPACE
+              value: "{{ .Release.Namespace }}"
+            - name: FAKE_GPU_OPERATOR_NAMESPACE
+              value: "{{ .Release.Namespace }}"
+      restartPolicy: Always
+      serviceAccountName: kwok-gpu-device-plugin
+      imagePullSecrets:
+        - name: gcr-secret
diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml
new file mode 100644
index 0000000..9c8d2cc
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/role.yaml
@@ -0,0 +1,13 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: fake-kwok-gpu-device-plugin
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - list
+      - get
+      - watch
diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml
new file mode 100644
index 0000000..2d04e08
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/rolebinding.yaml
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: fake-kwok-gpu-device-plugin
+roleRef:
+  kind: Role
+  apiGroup: rbac.authorization.k8s.io
+  name: fake-kwok-gpu-device-plugin
+subjects:
+  - kind: ServiceAccount
+    name: kwok-gpu-device-plugin
+    namespace: "{{ .Release.Namespace }}"
diff --git a/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml
new file mode 100644
index 0000000..01c0739
--- /dev/null
+++ b/deploy/fake-gpu-operator/templates/kwok-gpu-device-plugin/serviceaccount.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kwok-gpu-device-plugin
diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml
index a740a22..d7d09bf 100644
--- a/deploy/fake-gpu-operator/values.yaml
+++ b/deploy/fake-gpu-operator/values.yaml
@@ -54,6 +54,19 @@ statusExporter:
       memory: "200Mi"
   topologyMaxExportInterval: 10s
 
+kwokGpuDevicePlugin:
+  image:
+    pullPolicy: Always
+    repository: gcr.io/run-ai-lab/fake-gpu-operator/kwok-gpu-device-plugin
+    tag: 0.0.1
+  resources:
+    requests:
+      cpu: "100m"
+      memory: "200Mi"
+    limits:
+      cpu: "200m"
+      memory: "400Mi"
+
 migFaker:
   image:
     pullPolicy: Always
@@ -72,4 +85,4 @@ topology:
    gpuCount: 2
    gpuMemory: 11441
    nodePoolLabelKey: run.ai/simulated-gpu-node-pool
-  migStrategy: mixed
\ No newline at end of file
+  migStrategy: mixed
diff --git a/internal/common/topology/kubernetes.go b/internal/common/topology/kubernetes.go
index de612fa..b40603c 100644
--- a/internal/common/topology/kubernetes.go
+++ b/internal/common/topology/kubernetes.go
@@ -10,6 +10,7 @@ import (
 	"github.com/spf13/viper"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	apcorev1 "k8s.io/client-go/applyconfigurations/core/v1"
 	"k8s.io/client-go/kubernetes"
 )
 
@@ -25,11 +26,17 @@ func GetNodeTopologyFromCM(kubeclient kubernetes.Interface, nodeName string) (*N
 	return FromNodeTopologyCM(cm)
 }
 
-func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error {
-	cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
+func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, node *corev1.Node) error {
+	cm, _, err := ToNodeTopologyCM(nodeTopology, node.Name)
 	if err != nil {
 		return err
 	}
+	if value, found := node.Annotations[constants.AnnotationKwokNode]; found {
+		if cm.Annotations == nil {
+			cm.Annotations = make(map[string]string)
+		}
+		cm.Annotations[constants.AnnotationKwokNode] = value
+	}
 
 	_, err = kubeclient.CoreV1().ConfigMaps(
 		viper.GetString(constants.EnvTopologyCmNamespace)).Create(context.TODO(), cm, metav1.CreateOptions{})
@@ -37,13 +44,13 @@ func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTop
 }
 
 func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error {
-	cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
+	_, cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
 	if err != nil {
 		return err
 	}
 
 	_, err = kubeclient.CoreV1().ConfigMaps(
-		viper.GetString(constants.EnvTopologyCmNamespace)).Update(context.TODO(), cm, metav1.UpdateOptions{})
+		viper.GetString(constants.EnvTopologyCmNamespace)).Apply(context.TODO(), cm, metav1.ApplyOptions{})
 
 	return err
 }
@@ -108,7 +115,7 @@ func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, e
 	return cm, nil
 }
 
-func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, error) {
+func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, *apcorev1.ConfigMapApplyConfiguration, error) {
 	cm := &corev1.ConfigMap{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      GetNodeTopologyCMName(nodeName),
@@ -120,15 +127,18 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf
 		},
 		Data: make(map[string]string),
 	}
+	cmApplyConfig := apcorev1.ConfigMap(cm.Name, cm.Namespace).WithLabels(cm.Labels)
 
 	topologyData, err := yaml.Marshal(nodeTopology)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
 	cm.Data[cmTopologyKey] = string(topologyData)
 
-	return cm, nil
+	cmApplyConfig = cmApplyConfig.WithData(cm.Data)
+
+	return cm, cmApplyConfig, nil
 }
 
 func GetNodeTopologyCMName(nodeName string) string {
diff --git a/internal/kwok-gpu-device-plugin/app.go b/internal/kwok-gpu-device-plugin/app.go
new file mode 100644
index 0000000..f32eb50
--- /dev/null
+++ b/internal/kwok-gpu-device-plugin/app.go
@@ -0,0 +1,63 @@
+package kwokgdp
+
+import (
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	"github.com/spf13/viper"
+	ctrl "sigs.k8s.io/controller-runtime"
+
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
+	cmcontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/configmap"
+	"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers"
+)
+
+var InClusterConfigFn = ctrl.GetConfigOrDie
+var KubeClientFn = func(c *rest.Config) kubernetes.Interface {
+	return kubernetes.NewForConfigOrDie(c)
+}
+
+type StatusUpdaterAppConfiguration struct {
+	TopologyCmName      string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"`
+	TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"`
+}
+
+type KWOKDevicePluginApp struct {
+	Controllers []controllers.Interface
+	kubeClient  kubernetes.Interface
+	stopCh      chan struct{}
+}
+
+func (app *KWOKDevicePluginApp) Run() {
+	for _, controller := range app.Controllers {
+		go func(controller controllers.Interface) {
+			controller.Run(app.stopCh)
+		}(controller)
+	}
+}
+
+func (app *KWOKDevicePluginApp) Init(stopCh chan struct{}) {
+	app.stopCh = stopCh
+
+	clusterConfig := InClusterConfigFn()
+	clusterConfig.QPS = 100
+	clusterConfig.Burst = 200
+
+	app.kubeClient = KubeClientFn(clusterConfig)
+
+	app.Controllers = append(
+		app.Controllers, cmcontroller.NewConfigMapController(
+			app.kubeClient, viper.GetString(constants.EnvTopologyCmNamespace),
+		),
+	)
+}
+
+func (app *KWOKDevicePluginApp) Name() string {
+	return "KWOKDevicePlugin"
+}
+
+func (app *KWOKDevicePluginApp) GetConfig() interface{} {
+	var config StatusUpdaterAppConfiguration
+
+	return config
+}
diff --git a/internal/kwok-gpu-device-plugin/app_test.go b/internal/kwok-gpu-device-plugin/app_test.go
new file mode 100644
index 0000000..c633f49
--- /dev/null
+++ b/internal/kwok-gpu-device-plugin/app_test.go
@@ -0,0 +1,149 @@
+package kwokgdp
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/spf13/viper"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/kubernetes/fake"
+
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
+	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
+	cmcontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/configmap"
+	"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers"
+)
+
+const (
+	gpuOperatorNamespace = "gpu-operator"
+	nodePoolLabelKey     = "run.ai/node-pool"
+	defaultNodePoolName  = "default"
+)
+
+func TestKwokGpuDevicePlugin(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "KwokGpuDevicePlugin Suite")
+}
+
+var _ = Describe("KwokGpuDevicePlugin", func() {
+	var (
+		app        *KWOKDevicePluginApp
+		kubeClient kubernetes.Interface
+		stopChan   chan struct{}
+		wg         *sync.WaitGroup
+	)
+
+	BeforeEach(func() {
+		clusterTopology := topology.ClusterTopology{
+			NodePoolLabelKey: nodePoolLabelKey,
+			NodePools: map[string]topology.NodePoolTopology{
+				defaultNodePoolName: {
+					GpuCount:   4,
+					GpuMemory:  1000,
+					GpuProduct: "nvidia-tesla-t4",
+				},
+			},
+			MigStrategy: "none",
+		}
+		clusterTopologyCM, err := topology.ToClusterTopologyCM(&clusterTopology)
+		Expect(err).ToNot(HaveOccurred())
+		clusterTopologyCM.Name = "cluster-topology"
+		clusterTopologyCM.Namespace = gpuOperatorNamespace
+
+		kubeClient = fake.NewSimpleClientset(clusterTopologyCM)
+		stopChan = make(chan struct{})
+
+		viper.SetDefault(constants.EnvTopologyCmName, clusterTopologyCM.Name)
+		viper.SetDefault(constants.EnvTopologyCmNamespace, gpuOperatorNamespace)
+
+		app = &KWOKDevicePluginApp{
+			Controllers: []controllers.Interface{
+				cmcontroller.NewConfigMapController(
+					kubeClient, gpuOperatorNamespace,
+				),
+			},
+			kubeClient: kubeClient,
+			stopCh:     stopChan,
+		}
+		wg = &sync.WaitGroup{}
+		wg.Add(1)
+		go func() {
+			app.Run()
+			wg.Done()
+		}()
+	})
+
+	AfterEach(func() {
+		close(stopChan)
+		wg.Wait()
+	})
+
+	Context("app", func() {
+		It("should run until channel is closed", func() {})
+
+		Context("ConfigMap", func() {
+			It("should handle new Config Map without node labels", func() {
+				_, err := kubeClient.CoreV1().ConfigMaps(gpuOperatorNamespace).Create(context.TODO(), &v1.ConfigMap{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "configmap1",
+						Namespace: gpuOperatorNamespace,
+					},
+				}, metav1.CreateOptions{})
+				Expect(err).ToNot(HaveOccurred())
+			})
+
+			It("should add gpu devices to kwok nodes by configmap data", func() {
data", func() { + node1 := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + Labels: map[string]string{ + nodePoolLabelKey: defaultNodePoolName, + }, + Annotations: map[string]string{ + constants.AnnotationKwokNode: "fake", + }, + }, + } + _, err := kubeClient.CoreV1().Nodes().Create(context.TODO(), node1, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + nodeTopology := topology.NodeTopology{ + GpuMemory: 1000, + GpuProduct: "nvidia-tesla-t4", + Gpus: []topology.GpuDetails{ + {ID: "fake-gpu-id-1", Status: topology.GpuStatus{}}, + {ID: "fake-gpu-id-2", Status: topology.GpuStatus{}}, + {ID: "fake-gpu-id-3", Status: topology.GpuStatus{}}, + {ID: "fake-gpu-id-4", Status: topology.GpuStatus{}}, + }, + } + cm, _, err := topology.ToNodeTopologyCM(&nodeTopology, node1.Name) + Expect(err).ToNot(HaveOccurred()) + cm.Namespace = gpuOperatorNamespace + cm.Annotations = map[string]string{ + constants.AnnotationKwokNode: "fake", + } + + _, err = kubeClient.CoreV1().ConfigMaps(gpuOperatorNamespace).Create(context.TODO(), cm, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + Eventually(func() bool { + node, err := kubeClient.CoreV1().Nodes().Get(context.TODO(), node1.Name, metav1.GetOptions{}) + if err != nil { + return false + } + + gpuQuantity := node.Status.Capacity[constants.GpuResourceName] + return gpuQuantity.Value() == int64(4) + }, 3*time.Second, 100*time.Millisecond).Should(BeTrue()) + }) + }) + }) +}) diff --git a/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go new file mode 100644 index 0000000..1981ae0 --- /dev/null +++ b/internal/kwok-gpu-device-plugin/controllers/configmap/controller.go @@ -0,0 +1,86 @@ +package configmamp + +import ( + "log" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" + "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" + "github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers/util" + + cmhandler "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/handlers/configmap" + + v1 "k8s.io/api/core/v1" + + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" +) + +type ConfigMapController struct { + kubeClient kubernetes.Interface + cmInformer cache.SharedIndexInformer + handler cmhandler.Interface + + clusterTopology *topology.ClusterTopology +} + +var _ controllers.Interface = &ConfigMapController{} + +func NewConfigMapController( + kubeClient kubernetes.Interface, namespace string, +) *ConfigMapController { + clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient) + if err != nil { + log.Fatalf("Failed to get cluster topology: %v", err) + } + + informerFactory := informers.NewSharedInformerFactoryWithOptions( + kubeClient, 0, informers.WithNamespace(namespace)) + c := &ConfigMapController{ + kubeClient: kubeClient, + cmInformer: informerFactory.Core().V1().ConfigMaps().Informer(), + handler: cmhandler.NewConfigMapHandler(kubeClient, clusterTopology), + clusterTopology: clusterTopology, + } + + _, err = c.cmInformer.AddEventHandler(cache.FilteringResourceEventHandler{ + FilterFunc: func(obj interface{}) bool { + switch cm := obj.(type) { + case *v1.ConfigMap: + return c.isFakeGpuKWOKNodeConfigMap(cm) + default: + return false + } + }, + Handler: cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + go func() { + 
+				}()
+			},
+		},
+	})
+	if err != nil {
+		log.Fatalf("Failed to add config map event handler: %v", err)
+	}
+
+	return c
+}
+
+func (c *ConfigMapController) Run(stopCh <-chan struct{}) {
+	log.Println("Starting config map controller")
+	c.cmInformer.Run(stopCh)
+}
+
+func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool {
+	if cm == nil || cm.Labels == nil || cm.Annotations == nil {
+		return false
+	}
+	_, foundNodeName := cm.Labels[constants.LabelTopologyCMNodeName]
+	if !foundNodeName {
+		return false
+	}
+
+	return cm.Annotations[constants.AnnotationKwokNode] == "fake"
+}
diff --git a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go
new file mode 100644
index 0000000..3c342f2
--- /dev/null
+++ b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go
@@ -0,0 +1,60 @@
+package configmap
+
+import (
+	"context"
+	"fmt"
+	"log"
+
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
+	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/kubernetes"
+)
+
+type Interface interface {
+	HandleAdd(cm *v1.ConfigMap) error
+}
+
+type ConfigMapHandler struct {
+	kubeClient kubernetes.Interface
+
+	clusterTopology *topology.ClusterTopology
+}
+
+var _ Interface = &ConfigMapHandler{}
+
+func NewConfigMapHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *ConfigMapHandler {
+	return &ConfigMapHandler{
+		kubeClient:      kubeClient,
+		clusterTopology: clusterTopology,
+	}
+}
+
+func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap) error {
+	log.Printf("Handling config map addition: %s\n", cm.Name)
+
+	nodeTopology, err := topology.FromNodeTopologyCM(cm)
+	if err != nil {
+		return fmt.Errorf("failed to read node topology ConfigMap: %w", err)
+	}
+	nodeName := cm.Labels[constants.LabelTopologyCMNodeName]
+
+	return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), nodeName)
+}
+
+func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, nodeName string) error {
+	patch := fmt.Sprintf(
+		`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`,
+		constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount,
+	)
+	_, err := p.kubeClient.CoreV1().Nodes().Patch(
+		context.TODO(), nodeName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status",
+	)
+	if err != nil {
+		return fmt.Errorf("failed to update node capacity and allocatable: %v", err)
+	}
+
+	return nil
+}
diff --git a/internal/status-exporter/app_test.go b/internal/status-exporter/app_test.go
index 7c5039f..7a6a43f 100644
--- a/internal/status-exporter/app_test.go
+++ b/internal/status-exporter/app_test.go
@@ -74,7 +74,7 @@ var _ = Describe("StatusExporter", func() {
 		time.Sleep(1000 * time.Millisecond)
 
 		initialTopology := createNodeTopology()
-		cm, err := topology.ToNodeTopologyCM(initialTopology, nodeName)
+		cm, _, err := topology.ToNodeTopologyCM(initialTopology, nodeName)
 		Expect(err).To(Not(HaveOccurred()))
 		_, err = clientset.CoreV1().ConfigMaps(topologyCmNamespace).Create(context.TODO(), cm, metav1.CreateOptions{})
 		Expect(err).To(Not(HaveOccurred()))
@@ -86,7 +86,7 @@ var _ = Describe("StatusExporter", func() {
 
 			caseDetails := caseDetails
 			It(caseName, func() {
-				cm, err := topology.ToNodeTopologyCM(caseDetails.nodeTopologies[nodeName], nodeName)
+				cm, _, err := topology.ToNodeTopologyCM(caseDetails.nodeTopologies[nodeName], nodeName)
 				Expect(err).ToNot(HaveOccurred())
 
 				_, err = clientset.CoreV1().ConfigMaps(topologyCmNamespace).Update(context.TODO(), cm, metav1.UpdateOptions{})
diff --git a/internal/status-updater/handlers/node/topology_cm.go b/internal/status-updater/handlers/node/topology_cm.go
index e6a4067..44c707c 100644
--- a/internal/status-updater/handlers/node/topology_cm.go
+++ b/internal/status-updater/handlers/node/topology_cm.go
@@ -31,7 +31,7 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
 		MigStrategy: p.clusterTopology.MigStrategy,
 	}
 
-	err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
+	err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node)
 	if err != nil {
 		return fmt.Errorf("failed to create node topology: %w", err)
 	}