diff --git a/deploy/fake-gpu-operator/templates/status-updater/clusterrole.yaml b/deploy/fake-gpu-operator/templates/status-updater/clusterrole.yaml
index dcbf705..7ddc90f 100644
--- a/deploy/fake-gpu-operator/templates/status-updater/clusterrole.yaml
+++ b/deploy/fake-gpu-operator/templates/status-updater/clusterrole.yaml
@@ -8,6 +8,7 @@ rules:
     resources:
       - pods
      - nodes
+      - nodes/status
     verbs:
       - get
       - list
@@ -28,4 +29,4 @@ rules:
     resources:
       - podgroups
     verbs:
-      - get
\ No newline at end of file
+      - get
diff --git a/internal/common/topology/types.go b/internal/common/topology/types.go
index e70801c..06f5e06 100644
--- a/internal/common/topology/types.go
+++ b/internal/common/topology/types.go
@@ -48,7 +48,8 @@ type Range struct {
 }
 
 type Config struct {
-	NodeAutofill NodeAutofillSettings `yaml:"node-autofill"`
+	NodeAutofill     NodeAutofillSettings `yaml:"node-autofill"`
+	FakeNodeHandling bool                 `yaml:"fake-node-handling"`
 }
 
 type NodeAutofillSettings struct {
diff --git a/internal/status-updater/controllers/node/controller.go b/internal/status-updater/controllers/node/controller.go
index bbe0cf1..b41bba9 100644
--- a/internal/status-updater/controllers/node/controller.go
+++ b/internal/status-updater/controllers/node/controller.go
@@ -53,6 +53,10 @@ func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *Nod
 			node := obj.(*v1.Node)
 			util.LogErrorIfExist(c.handler.HandleAdd(node), "Failed to handle node addition")
 		},
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			newNode := newObj.(*v1.Node)
+			util.LogErrorIfExist(c.handler.HandleUpdate(newNode), "Failed to handle node update")
+		},
 		DeleteFunc: func(obj interface{}) {
 			node := obj.(*v1.Node)
 			util.LogErrorIfExist(c.handler.HandleDelete(node), "Failed to handle node deletion")
diff --git a/internal/status-updater/handlers/node/fake_node_deployments.go b/internal/status-updater/handlers/node/fake_node_deployments.go
index f75ab9e..848143e 100644
--- a/internal/status-updater/handlers/node/fake_node_deployments.go
+++ b/internal/status-updater/handlers/node/fake_node_deployments.go
@@ -11,9 +11,27 @@ import (
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/utils/ptr"
 )
 
+func (p *NodeHandler) applyFakeDevicePlugin(gpuCount int, node *v1.Node) error {
+	if !isFakeNode(node) {
+		return nil
+	}
+
+	patch := fmt.Sprintf(
+		`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`,
+		constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount,
+	)
+	_, err := p.kubeClient.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
+	if err != nil {
+		return fmt.Errorf("failed to update node capacity and allocatable: %w", err)
+	}
+
+	return nil
+}
+
 func (p *NodeHandler) applyFakeNodeDeployments(node *v1.Node) error {
 	if !isFakeNode(node) {
 		return nil
diff --git a/internal/status-updater/handlers/node/handler.go b/internal/status-updater/handlers/node/handler.go
index 4c66501..8f3fd4e 100644
--- a/internal/status-updater/handlers/node/handler.go
+++ b/internal/status-updater/handlers/node/handler.go
@@ -13,6 +13,7 @@ import (
 type Interface interface {
 	HandleAdd(node *v1.Node) error
 	HandleDelete(node *v1.Node) error
+	HandleUpdate(node *v1.Node) error
 }
 
 type NodeHandler struct {
@@ -30,14 +31,26 @@ func NewNodeHandler(kubeClient kubernetes.Interface) *NodeHandler {
 func (p *NodeHandler) HandleAdd(node *v1.Node) error {
 	log.Printf("Handling node addition: %s\n", node.Name)
 
-	err := p.createNodeTopologyCM(node)
+	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
 	if err != nil {
-		return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
+		return fmt.Errorf("failed to get base topology: %w", err)
 	}
 
-	err = p.applyFakeNodeDeployments(node)
+	err = p.createNodeTopologyCM(node, baseTopology)
 	if err != nil {
-		return fmt.Errorf("failed to apply fake node deployments: %w", err)
+		return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
+	}
+
+	if baseTopology.Config.FakeNodeHandling {
+		err = p.applyFakeDevicePlugin(baseTopology.Config.NodeAutofill.GpuCount, node)
+		if err != nil {
+			return fmt.Errorf("failed to apply fake device plugin: %w", err)
+		}
+	} else {
+		err = p.applyFakeNodeDeployments(node)
+		if err != nil {
+			return fmt.Errorf("failed to apply fake node deployments: %w", err)
+		}
 	}
 
 	return nil
@@ -58,3 +71,21 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error {
 
 	return nil
 }
+
+func (p *NodeHandler) HandleUpdate(node *v1.Node) error {
+	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
+	if err != nil {
+		return fmt.Errorf("failed to get base topology: %w", err)
+	}
+
+	if !baseTopology.Config.FakeNodeHandling {
+		return nil
+	}
+
+	gpuCount := baseTopology.Config.NodeAutofill.GpuCount
+	err = p.applyFakeDevicePlugin(gpuCount, node)
+	if err != nil {
+		return fmt.Errorf("failed to apply fake device plugin: %w", err)
+	}
+	return nil
+}
diff --git a/internal/status-updater/handlers/node/topology_cm.go b/internal/status-updater/handlers/node/topology_cm.go
index cff8b26..cab793b 100644
--- a/internal/status-updater/handlers/node/topology_cm.go
+++ b/internal/status-updater/handlers/node/topology_cm.go
@@ -8,17 +8,14 @@ import (
 	v1 "k8s.io/api/core/v1"
 )
 
-func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
+func (p *NodeHandler) createNodeTopologyCM(
+	node *v1.Node, baseTopology *topology.BaseTopology,
+) error {
 	nodeTopology, _ := topology.GetNodeTopologyFromCM(p.kubeClient, node.Name)
 	if nodeTopology != nil {
 		return nil
 	}
 
-	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
-	if err != nil {
-		return fmt.Errorf("failed to get base topology: %w", err)
-	}
-
 	nodeAutofillSettings := baseTopology.Config.NodeAutofill
 
 	nodeTopology = &topology.NodeTopology{
@@ -28,7 +25,7 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
 		MigStrategy: nodeAutofillSettings.MigStrategy,
 	}
 
-	err = topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
+	err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
 	if err != nil {
 		return fmt.Errorf("failed to create node topology: %w", err)
 	}