Skip to content

Commit

Permalink
implement config map controller instead of node controller
Browse files Browse the repository at this point in the history
  • Loading branch information
enoodle committed Aug 14, 2024
1 parent 3f7c4fe commit 234abe6
Show file tree
Hide file tree
Showing 11 changed files with 339 additions and 174 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: plugin
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- list
- get
- watch
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: Role
apiGroup: rbac.authorization.k8s.io
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
namespace: "{{ .Release.Namespace }}"
10 changes: 8 additions & 2 deletions internal/kwok-gpu-device-plugin/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/spf13/viper"
ctrl "sigs.k8s.io/controller-runtime"

nodecontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/node"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
cmcontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/configmap"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers"
)

Expand Down Expand Up @@ -57,7 +59,11 @@ func (app *StatusUpdaterApp) Init(stopCh chan struct{}) {

app.kubeClient = KubeClientFn(clusterConfig)

app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg))
app.Controllers = append(
app.Controllers, cmcontroller.NewConfigMapController(
app.kubeClient, app.wg, viper.GetString(constants.EnvTopologyCmNamespace),
),
)
}

func (app *StatusUpdaterApp) Name() string {
Expand Down
102 changes: 102 additions & 0 deletions internal/kwok-gpu-device-plugin/controllers/configmap/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package configmamp

import (
"log"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers/util"

cmhandler "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/handlers/configmap"

v1 "k8s.io/api/core/v1"
listersv1 "k8s.io/client-go/listers/core/v1"

"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
)

type ConfigMapController struct {
kubeClient kubernetes.Interface
cmInformer cache.SharedIndexInformer
nodeLister listersv1.NodeLister
informerFactory informers.SharedInformerFactory
handler cmhandler.Interface

clusterTopology *topology.ClusterTopology
}

var _ controllers.Interface = &ConfigMapController{}

func NewConfigMapController(
kubeClient kubernetes.Interface, namespace string,
) *ConfigMapController {
clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient)
if err != nil {
log.Fatalf("Failed to get cluster topology: %v", err)
}

informerFactory := informers.NewSharedInformerFactoryWithOptions(
kubeClient, 0, informers.WithNamespace(namespace))
c := &ConfigMapController{
kubeClient: kubeClient,
cmInformer: informerFactory.Core().V1().ConfigMaps().Informer(),
nodeLister: informerFactory.Core().V1().Nodes().Lister(),
handler: cmhandler.NewConfigMapHandler(kubeClient, clusterTopology),
clusterTopology: clusterTopology,
}

_, err = c.cmInformer.AddEventHandler(cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch cm := obj.(type) {
case *v1.ConfigMap:
return c.isFakeGpuKWOKNodeConfigMap(cm)
default:
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
go func() {
c.callConfigMapHandler(obj.(*v1.ConfigMap))
}()
},
},
})
if err != nil {
log.Fatalf("Failed to add config map event handler: %v", err)
}

return c
}

func (c *ConfigMapController) Run(stopCh <-chan struct{}) {
log.Println("Starting config map controller")
c.informerFactory.Start(stopCh)
}

func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool {
if cm == nil || cm.Labels == nil {
return false
}
nodeName := cm.Labels[constants.LabelTopologyCMNodeName]

node, err := c.nodeLister.Get(nodeName)
if err != nil {
return false
}

_, isNodeAssignedToNodePool := node.Labels[c.clusterTopology.NodePoolLabelKey]
return isNodeAssignedToNodePool && node.Annotations[constants.AnnotationKwokNode] == "fake"
}

func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap) {
nodeName := cm.Labels[constants.LabelTopologyCMNodeName]
node, err := c.nodeLister.Get(nodeName)
if err != nil {

}
util.LogErrorIfExist(c.handler.HandleAdd(cm, node), "Failed to handle cm addition")
}
86 changes: 0 additions & 86 deletions internal/kwok-gpu-device-plugin/controllers/node/controller.go

This file was deleted.

59 changes: 59 additions & 0 deletions internal/kwok-gpu-device-plugin/handlers/configmap/handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package configmap

import (
"context"
"fmt"
"log"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
)

type Interface interface {
HandleAdd(cm *v1.ConfigMap, node *v1.Node) error
}

type ConfigMapHandler struct {
kubeClient kubernetes.Interface

clusterTopology *topology.ClusterTopology
}

var _ Interface = &ConfigMapHandler{}

func NewConfigMapHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *ConfigMapHandler {
return &ConfigMapHandler{
kubeClient: kubeClient,
clusterTopology: clusterTopology,
}
}

func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap, node *v1.Node) error {
log.Printf("Handling node addition: %s\n", cm.Name)

nodeTopology, err := topology.FromNodeTopologyCM(cm)
if err != nil {
return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
}

return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), node)
}

func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, node *v1.Node) error {
patch := fmt.Sprintf(
`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`,
constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount,
)
_, err := p.kubeClient.CoreV1().Nodes().Patch(
context.TODO(), node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status",
)
if err != nil {
return fmt.Errorf("failed to update node capacity and allocatable: %v", err)
}

return nil
}
76 changes: 0 additions & 76 deletions internal/kwok-gpu-device-plugin/handlers/node/handler.go

This file was deleted.

Loading

0 comments on commit 234abe6

Please sign in to comment.