From ff909eefe04c3c0c01f32120e246d282ed6e267b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 16 Oct 2024 20:50:06 +0800 Subject: [PATCH] set node status before remove node (#645) * set node status before remove node * fix metric help message * fix UT issue --- cluster/calcium/node.go | 6 ++++++ cluster/calcium/node_test.go | 4 +++- metrics/metrics.go | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cluster/calcium/node.go b/cluster/calcium/node.go index af2f7d99..ef5d3c47 100644 --- a/cluster/calcium/node.go +++ b/cluster/calcium/node.go @@ -87,6 +87,12 @@ func (c *Calcium) RemoveNode(ctx context.Context, nodename string) error { return utils.Txn(ctx, // if: remove node metadata func(ctx context.Context) error { + // we need set node status here, consider the following scenery: + // the node is down, so the node status doesn't exist in ETCD, + // if we don't set node status here, other core instances will not be notified when the node is removed + if err = c.store.SetNodeStatus(ctx, node, 90); err != nil { + logger.Warnf(ctx, "failed to set node status: %s", err) + } if err := c.store.RemoveNode(ctx, node); err != nil { return err } diff --git a/cluster/calcium/node_test.go b/cluster/calcium/node_test.go index 1fde82c3..503841d8 100644 --- a/cluster/calcium/node_test.go +++ b/cluster/calcium/node_test.go @@ -90,12 +90,14 @@ func TestRemoveNode(t *testing.T) { // fail by store.RemoveNode store.On("ListNodeWorkloads", mock.Anything, mock.Anything, mock.Anything).Return([]*types.Workload{}, nil) + store.On("SetNodeStatus", mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() store.On("RemoveNode", mock.Anything, mock.Anything).Return(types.ErrMockError).Once() assert.Error(t, c.RemoveNode(ctx, name)) // success + store.On("SetNodeStatus", mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() store.On("RemoveNode", mock.Anything, mock.Anything).Return(nil) - store.On("SetNodeStatus", mock.Anything, mock.Anything, int64(-1)).Return(nil) + store.On("SetNodeStatus", mock.Anything, mock.Anything, int64(-1)).Return(nil).Once() rmgr := c.rmgr.(*resourcemocks.Manager) rmgr.On("RemoveNode", mock.Anything, mock.Anything).Return(nil) assert.NoError(t, c.RemoveNode(ctx, name)) diff --git a/metrics/metrics.go b/metrics/metrics.go index 374ec091..11bdfb01 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -225,7 +225,7 @@ func InitMetrics(ctx context.Context, config types.Config, metricsDescriptions [ Client.Collectors[podNodeStatusName] = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: podNodeStatusName, - Help: "number of up nodes", + Help: "node status", }, []string{"hostname", "podname", "nodename"}) once.Do(func() {