Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Print logs when the cluster state changes to fail or the fail reason changes #1188

Merged
merged 10 commits into from
Dec 2, 2024
6 changes: 6 additions & 0 deletions src/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
#define CLUSTER_FAIL 1 /* The cluster can't work */
#define CLUSTER_NAMELEN 40 /* sha1 hex length */

/* Reason why the cluster state changes to fail. When adding new reasons,
* make sure to update clusterLogFailReason. */
#define CLUSTER_FAIL_NONE 0
#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
#define CLUSTER_FAIL_MINORITY_PARTITION 2

/* Redirection errors returned by getNodeByQuery(). */
#define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */
Expand Down
39 changes: 37 additions & 2 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,7 @@ void clusterInit(void) {
server.cluster->myself = NULL;
server.cluster->currentEpoch = 0;
server.cluster->state = CLUSTER_FAIL;
server.cluster->fail_reason = CLUSTER_FAIL_NONE;
server.cluster->size = 0;
server.cluster->todo_before_sleep = 0;
server.cluster->nodes = dictCreate(&clusterNodesDictType);
Expand Down Expand Up @@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) {
case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
default: msg = "Unknown reason code."; break;
default: serverPanic("Unknown cant failover reason code.");
}
lastlog_time = time(NULL);
serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
Expand Down Expand Up @@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) {
* Cluster state evaluation function
* -------------------------------------------------------------------------- */

void clusterLogFailReason(int reason) {
if (reason == CLUSTER_FAIL_NONE) return;

char *msg;
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
switch (reason) {
case CLUSTER_FAIL_NOT_FULL_COVERAGE:
msg = "At least one hash slot is not served by any available node. "
"Please check the 'cluster-require-full-coverage' configuration.";
break;
case CLUSTER_FAIL_MINORITY_PARTITION:
msg = "I am part of a minority partition.";
break;
default: serverPanic("Unknown fail reason code.");
}
serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
}

/* The following are defines that are only used in the evaluation function
* and are based on heuristics. Actually the main point about the rejoin and
* writable delay is that they should be a few orders of magnitude larger
Expand All @@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) {
#define CLUSTER_WRITABLE_DELAY 2000

void clusterUpdateState(void) {
int j, new_state;
int j, new_state, new_reason;
int reachable_primaries = 0;
static mstime_t among_minority_time;
static mstime_t first_call_time = 0;
Expand All @@ -5392,12 +5410,14 @@ void clusterUpdateState(void) {
/* Start assuming the state is OK. We'll turn it into FAIL if there
* are the right conditions. */
new_state = CLUSTER_OK;
new_reason = CLUSTER_FAIL_NONE;

/* Check if all the slots are covered. */
if (server.cluster_require_full_coverage) {
for (j = 0; j < CLUSTER_SLOTS; j++) {
if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) {
new_state = CLUSTER_FAIL;
new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE;
break;
}
}
Expand Down Expand Up @@ -5432,6 +5452,7 @@ void clusterUpdateState(void) {

if (reachable_primaries < needed_quorum) {
new_state = CLUSTER_FAIL;
new_reason = CLUSTER_FAIL_MINORITY_PARTITION;
among_minority_time = mstime();
}
}
Expand All @@ -5455,7 +5476,21 @@ void clusterUpdateState(void) {
serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s",
new_state == CLUSTER_OK ? "ok" : "fail");
server.cluster->state = new_state;

/* Cluster state changes from ok to fail, print a log. */
if (new_state == CLUSTER_FAIL) {
clusterLogFailReason(new_reason);
server.cluster->fail_reason = new_reason;
}
}

/* Cluster state is still fail, but the reason has changed, print a log. */
if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) {
clusterLogFailReason(new_reason);
server.cluster->fail_reason = new_reason;
}

if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
}

/* This function is called after the node startup in order to verify that data
Expand Down
1 change: 1 addition & 0 deletions src/cluster_legacy.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ struct clusterState {
clusterNode *myself; /* This node */
uint64_t currentEpoch;
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
int fail_reason; /* Why the cluster state changes to fail. */
int size; /* Num of primary nodes with at least one slot */
dict *nodes; /* Hash table of name -> clusterNode structures */
dict *shards; /* Hash table of shard_id -> list (of nodes) structures */
Expand Down
23 changes: 23 additions & 0 deletions tests/unit/cluster/info.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" {
}

} ;# start_cluster

# Verify that the cluster logs the fail reason on the ok->fail transition,
# and logs again when the reason changes while the state is still fail.
# Uses a short cluster-node-timeout so failure detection is fast.
start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
    test "fail reason changed" {
        # Kill one primary, so the cluster fail with not-full-coverage.
        pause_process [srv 0 pid]
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} &&
            [CI 2 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }
        # Both surviving nodes must have logged the not-full-coverage reason.
        verify_log_message -1 "*At least one hash slot is not served by any available node*" 0
        verify_log_message -2 "*At least one hash slot is not served by any available node*" 0

        # Kill one more primary, so the cluster fail with minority-partition.
        # The last node is now alone, so the reason changes and a new log
        # line is expected even though the state is still fail.
        pause_process [srv -1 pid]
        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50

        # Restore both primaries so the cluster converges back to ok and
        # later tests start from a healthy state.
        resume_process [srv 0 pid]
        resume_process [srv -1 pid]
        wait_for_cluster_state ok
    }
}
Loading