Skip to content

Commit

Permalink
add flag to skip deployments for fake nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
enoodle committed Jul 3, 2024
1 parent 73d1f1b commit bcfc523
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 12 deletions.
3 changes: 2 additions & 1 deletion internal/common/topology/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ type Range struct {
}

// Config groups the topology settings that are decoded from the YAML keys
// named in the struct tags below.
type Config struct {
NodeAutofill NodeAutofillSettings `yaml:"node-autofill"`
NodeAutofill NodeAutofillSettings `yaml:"node-autofill"`
// FakeNodeHandling switches how the node handler treats nodes: when true,
// it calls applyFakeDevicePlugin (a node status patch) on node add and
// update; when false, it calls applyFakeNodeDeployments on node add and
// does nothing on update.
FakeNodeHandling bool `yaml:"fake-node-handling"`
}

type NodeAutofillSettings struct {
Expand Down
4 changes: 4 additions & 0 deletions internal/status-updater/controllers/node/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *Nod
node := obj.(*v1.Node)
util.LogErrorIfExist(c.handler.HandleAdd(node), "Failed to handle node addition")
},
UpdateFunc: func(oldObj, newObj interface{}) {
	// Only the new state of the node is forwarded; oldObj is unused.
	newNode := newObj.(*v1.Node)
	// The log message names the update path so failures here are not
	// mistaken for failures of the add handler.
	util.LogErrorIfExist(c.handler.HandleUpdate(newNode), "Failed to handle node update")
},
DeleteFunc: func(obj interface{}) {
node := obj.(*v1.Node)
util.LogErrorIfExist(c.handler.HandleDelete(node), "Failed to handle node deletion")
Expand Down
18 changes: 18 additions & 0 deletions internal/status-updater/handlers/node/fake_node_deployments.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,27 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
)

// applyFakeDevicePlugin advertises gpuCount GPUs on a fake node by patching
// the node's "status" subresource so that both capacity and allocatable carry
// gpuCount for constants.GpuResourceName.
//
// Nodes for which isFakeNode reports false are left untouched and nil is
// returned. On failure the returned error wraps the client error with %w, so
// callers can still unwrap and inspect the underlying cause.
func (p *NodeHandler) applyFakeDevicePlugin(gpuCount int, node *v1.Node) error {
	if !isFakeNode(node) {
		return nil
	}

	// Sent as a JSON merge patch (types.MergePatchType) against the status
	// subresource; the same resource name and count are written to both maps.
	patch := fmt.Sprintf(
		`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`,
		constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount,
	)

	if _, err := p.kubeClient.CoreV1().Nodes().Patch(
		context.TODO(), node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status",
	); err != nil {
		return fmt.Errorf("failed to update node capacity and allocatable: %w", err)
	}

	return nil
}

func (p *NodeHandler) applyFakeNodeDeployments(node *v1.Node) error {
if !isFakeNode(node) {
return nil
Expand Down
39 changes: 35 additions & 4 deletions internal/status-updater/handlers/node/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
// Interface is the set of node event callbacks implemented by NodeHandler.
// Each method receives the node the event refers to and returns an error
// describing why handling failed, or nil on success.
type Interface interface {
// HandleAdd processes a node that was added.
HandleAdd(node *v1.Node) error
// HandleDelete processes a node that was deleted.
HandleDelete(node *v1.Node) error
// HandleUpdate processes the new state of a node that was updated.
HandleUpdate(node *v1.Node) error
}

type NodeHandler struct {
Expand All @@ -30,14 +31,26 @@ func NewNodeHandler(kubeClient kubernetes.Interface) *NodeHandler {
func (p *NodeHandler) HandleAdd(node *v1.Node) error {
log.Printf("Handling node addition: %s\n", node.Name)

err := p.createNodeTopologyCM(node)
baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
if err != nil {
return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
return fmt.Errorf("failed to get base topology: %w", err)
}

err = p.applyFakeNodeDeployments(node)
err = p.createNodeTopologyCM(node, baseTopology)
if err != nil {
return fmt.Errorf("failed to apply fake node deployments: %w", err)
return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
}

if baseTopology.Config.FakeNodeHandling {
err = p.applyFakeDevicePlugin(baseTopology.Config.NodeAutofill.GpuCount, node)
if err != nil {
return fmt.Errorf("failed to apply fake node deployments: %w", err)
}
} else {
err = p.applyFakeNodeDeployments(node)
if err != nil {
return fmt.Errorf("failed to apply fake node deployments: %w", err)
}
}

return nil
Expand All @@ -58,3 +71,21 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error {

return nil
}

// HandleUpdate reacts to a node update. It loads the base topology and, when
// Config.FakeNodeHandling is enabled, re-applies the fake device plugin patch
// to the node using the GPU count from Config.NodeAutofill. When the flag is
// disabled it does nothing and returns nil.
//
// It returns an error if the base topology cannot be read or if applying the
// fake device plugin fails; the underlying error is wrapped in both cases.
func (p *NodeHandler) HandleUpdate(node *v1.Node) error {
	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
	if err != nil {
		return fmt.Errorf("failed to get base topology: %w", err)
	}

	if !baseTopology.Config.FakeNodeHandling {
		return nil
	}

	gpuCount := baseTopology.Config.NodeAutofill.GpuCount
	if err := p.applyFakeDevicePlugin(gpuCount, node); err != nil {
		// Name the operation that actually failed; this path never touches
		// the fake node deployments.
		return fmt.Errorf("failed to apply fake device plugin: %w", err)
	}

	return nil
}
11 changes: 4 additions & 7 deletions internal/status-updater/handlers/node/topology_cm.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,14 @@ import (
v1 "k8s.io/api/core/v1"
)

func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
func (p *NodeHandler) createNodeTopologyCM(
node *v1.Node, baseTopology *topology.BaseTopology,
) error {
nodeTopology, _ := topology.GetNodeTopologyFromCM(p.kubeClient, node.Name)
if nodeTopology != nil {
return nil
}

baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
if err != nil {
return fmt.Errorf("failed to get base topology: %w", err)
}

nodeAutofillSettings := baseTopology.Config.NodeAutofill

nodeTopology = &topology.NodeTopology{
Expand All @@ -28,7 +25,7 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
MigStrategy: nodeAutofillSettings.MigStrategy,
}

err = topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
if err != nil {
return fmt.Errorf("failed to create node topology: %w", err)
}
Expand Down

0 comments on commit bcfc523

Please sign in to comment.