ceph: evict a mon if colocated with another mon
In a production cluster we should never have two mons running on
the same node. However, if two mons have ended up on the same node,
whether from a bug or some other unintentional event, the operator
will evict one of them and fail over to a new mon. This only applies
when allowMultiplePerNode is false.

Signed-off-by: Travis Nielsen <[email protected]>
travisn committed Jun 23, 2021
1 parent 5e3d502 commit b1c1e29
Showing 2 changed files with 109 additions and 2 deletions.
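
As context for the change below: the new check is gated by the mon settings on the CephCluster spec. A minimal Go sketch (illustration only, not part of this commit) of the fields involved, assuming the cephv1 API types from this repository:

package main

import (
	"fmt"

	cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
)

func main() {
	// With AllowMultiplePerNode left false (the production default), the mon
	// health check added in this commit evicts one of any two mons found on
	// the same node and fails over to a new mon.
	spec := cephv1.ClusterSpec{
		Mon: cephv1.MonSpec{
			Count:                3,
			AllowMultiplePerNode: false,
		},
	}
	fmt.Printf("evict colocated mons: %v\n", !spec.Mon.AllowMultiplePerNode)
}

In the CephCluster CRD this corresponds to the allowMultiplePerNode field under spec.mon.
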
54 changes: 52 additions & 2 deletions pkg/operator/ceph/cluster/mon/health.go
@@ -40,6 +40,8 @@ var (

	retriesBeforeNodeDrainFailover = 1
	timeZero = time.Duration(0)
	// Check whether mons are on the same node once per operator restart since it's a rare scheduling condition
	needToCheckMonsOnSameNode = true
)

// HealthChecker aggregates the mon/cluster info needed to check the health of the monitors
@@ -208,7 +210,6 @@ func (c *Cluster) checkHealth() error {
			if time.Since(c.monTimeoutList[mon.Name]) <= MonOutTimeout {
				timeToFailover := int(MonOutTimeout.Seconds() - time.Since(c.monTimeoutList[mon.Name]).Seconds())
				logger.Warningf("mon %q not found in quorum, waiting for timeout (%d seconds left) before failover", mon.Name, timeToFailover)

				continue
			}

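To make the countdown in the hunk above concrete, here is a standalone sketch of the arithmetic, assuming a MonOutTimeout of 10 minutes (the timeout is configurable, so the numbers are illustrative only):

package main

import (
	"fmt"
	"time"
)

func main() {
	// A mon that fell out of quorum 7 minutes ago with a 10-minute timeout
	// has roughly 180 seconds left before the operator starts a failover.
	monOutTimeout := 10 * time.Minute
	outOfQuorumSince := time.Now().Add(-7 * time.Minute)

	timeToFailover := int(monOutTimeout.Seconds() - time.Since(outOfQuorumSince).Seconds())
	fmt.Printf("%d seconds left before failover\n", timeToFailover)
}
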
@@ -257,10 +258,18 @@ func (c *Cluster) checkHealth() error {
		}
	}

	// remove any pending/not needed mon canary deployment if everything is ok
	if allMonsInQuorum && len(quorumStatus.MonMap.Mons) == desiredMonCount {
		// remove any pending/not needed mon canary deployment if everything is ok
		logger.Debug("mon cluster is healthy, removing any existing canary deployment")
		c.removeCanaryDeployments()

		// Check whether two healthy mons are on the same node when they should not be.
		// This should be a rare event to find them on the same node, so we just need to check
		// once per operator restart.
		if needToCheckMonsOnSameNode {
			needToCheckMonsOnSameNode = false
			return c.evictMonIfMultipleOnSameNode()
		}
	}

	return nil
@@ -601,3 +610,44 @@ func (c *Cluster) addOrRemoveExternalMonitor(status cephclient.MonStatusResponse
	logger.Debugf("ClusterInfo.Monitors is %+v", c.ClusterInfo.Monitors)
	return changed, nil
}

func (c *Cluster) evictMonIfMultipleOnSameNode() error {
	if c.spec.Mon.AllowMultiplePerNode {
		logger.Debug("skipping check for multiple mons on same node since multiple mons are allowed")
		return nil
	}

	logger.Info("checking if multiple mons are on the same node")

	// Get all the mon pods
	label := fmt.Sprintf("app=%s", AppName)
	pods, err := c.context.Clientset.CoreV1().Pods(c.Namespace).List(context.TODO(), metav1.ListOptions{LabelSelector: label})
	if err != nil {
		return errors.Wrap(err, "failed to list mon pods")
	}

	nodesToMons := map[string]string{}
	for _, pod := range pods.Items {
		logger.Debugf("analyzing mon pod %q on node %q", pod.Name, pod.Spec.NodeName)
		if _, ok := pod.Labels["mon_canary"]; ok {
			logger.Debugf("skipping mon canary pod %q", pod.Name)
			continue
		}
		if pod.Spec.NodeName == "" {
			logger.Warningf("mon %q is not assigned to a node", pod.Name)
			continue
		}
		monName := pod.Labels["mon"]
		previousMonName, ok := nodesToMons[pod.Spec.NodeName]
		if !ok {
			// remember this node is taken by this mon
			nodesToMons[pod.Spec.NodeName] = monName
			continue
		}

		logger.Warningf("Both mons %q and %q are on node %q. Evicting mon %q", monName, previousMonName, pod.Spec.NodeName, monName)
		return c.failoverMon(monName)
	}

	return nil
}
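
The heart of the new function is a first-writer-wins map from node name to mon name: the first mon seen on a node claims it, and a later mon found on the same node is the one passed to failoverMon. A simplified, standalone sketch of that pattern (hypothetical helper for illustration, not the operator code):

package main

import "fmt"

// findMonToEvict reports a mon that shares a node with another mon, using the
// same first-writer-wins map pattern as evictMonIfMultipleOnSameNode above.
// Map iteration order is randomized in Go, so either of two colocated mons may
// be reported, which mirrors the "evict either mon a or mon c" expectation in
// the test below.
func findMonToEvict(monToNode map[string]string) (string, bool) {
	nodesToMons := map[string]string{}
	for mon, node := range monToNode {
		if _, taken := nodesToMons[node]; taken {
			// a second mon landed on a node that is already claimed
			return mon, true
		}
		nodesToMons[node] = mon
	}
	return "", false
}

func main() {
	mons := map[string]string{"a": "node1", "b": "node2", "c": "node1"}
	if mon, found := findMonToEvict(mons); found {
		fmt.Printf("would evict mon %q\n", mon)
	}
}
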
57 changes: 57 additions & 0 deletions pkg/operator/ceph/cluster/mon/health_test.go
@@ -40,6 +40,7 @@ import (
	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func TestCheckHealth(t *testing.T) {
@@ -149,6 +150,62 @@ func TestCheckHealth(t *testing.T) {
	}
}

func TestEvictMonOnSameNode(t *testing.T) {
	ctx := context.TODO()
	clientset := test.New(t, 1)
	configDir, _ := ioutil.TempDir("", "")
	defer os.RemoveAll(configDir)
	context := &clusterd.Context{Clientset: clientset, ConfigDir: configDir, Executor: &exectest.MockExecutor{}, RequestCancelOrchestration: abool.New()}
	ownerInfo := cephclient.NewMinimumOwnerInfoWithOwnerRef()
	c := New(context, "ns", cephv1.ClusterSpec{}, ownerInfo, &sync.Mutex{})
	setCommonMonProperties(c, 1, cephv1.MonSpec{Count: 0}, "myversion")
	c.maxMonID = 2
	c.waitForStart = false
	waitForMonitorScheduling = func(c *Cluster, d *apps.Deployment) (SchedulingResult, error) {
		node, _ := clientset.CoreV1().Nodes().Get(ctx, "node0", metav1.GetOptions{})
		return SchedulingResult{Node: node}, nil
	}

	c.spec.Mon.Count = 3
	createTestMonPod(t, clientset, c, "a", "node1")

	// Nothing to evict with a single mon
	err := c.evictMonIfMultipleOnSameNode()
	assert.NoError(t, err)

	// Create a second mon on a different node
	createTestMonPod(t, clientset, c, "b", "node2")

	// Nothing to evict when mons are on different nodes
	err = c.evictMonIfMultipleOnSameNode()
	assert.NoError(t, err)

	// Create a third mon on the same node as mon a
	createTestMonPod(t, clientset, c, "c", "node1")
	assert.Equal(t, 2, c.maxMonID)

	// Should evict either mon a or mon c since they are on the same node and failover to mon d
	err = c.evictMonIfMultipleOnSameNode()
	assert.NoError(t, err)
	_, err = clientset.AppsV1().Deployments(c.Namespace).Get(ctx, "rook-ceph-mon-d", metav1.GetOptions{})
	assert.NoError(t, err)
	assert.Equal(t, 3, c.maxMonID)
}
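
The assertions above lean on Rook's mon naming scheme: daemon names are derived from an incrementing ID (0 maps to "a", 1 to "b", and so on), so with maxMonID at 2 the failover creates mon "d" and bumps maxMonID to 3. A simplified sketch of that mapping for single-letter names (Rook's real helper covers larger IDs as well):

package main

import "fmt"

// monNameFromID maps a mon ID to a daemon name for IDs 0-25.
// Illustrative sketch only, not the operator's actual naming helper.
func monNameFromID(id int) string {
	return string(rune('a' + id))
}

func main() {
	// maxMonID was 2 (mon "c"); the failover creates the next mon, "d".
	fmt.Println(monNameFromID(2 + 1))
}
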

func createTestMonPod(t *testing.T, clientset kubernetes.Interface, c *Cluster, name, node string) {
	m := &monConfig{ResourceName: resourceName(name), DaemonName: name, DataPathMap: &config.DataPathMap{}}
	d, err := c.makeDeployment(m, false)
	assert.NoError(t, err)
	monPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "mon-pod-" + name, Namespace: c.Namespace, Labels: d.Labels},
		Spec:       d.Spec.Template.Spec,
	}
	monPod.Spec.NodeName = node
	monPod.Status.Phase = v1.PodRunning
	_, err = clientset.CoreV1().Pods(c.Namespace).Create(context.TODO(), monPod, metav1.CreateOptions{})
	assert.NoError(t, err)
}

func TestScaleMonDeployment(t *testing.T) {
	ctx := context.TODO()
	clientset := test.New(t, 1)
