Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Print logs when the cluster state changes to fail or the fail reason changes #1188

Merged
merged 10 commits into from
Dec 2, 2024
6 changes: 6 additions & 0 deletions src/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
#define CLUSTER_FAIL 1 /* The cluster can't work */
#define CLUSTER_NAMELEN 40 /* sha1 hex length */

/* Reason why the cluster state changes to fail. When adding new reasons,
* make sure to update clusterLogFailReason. */
#define CLUSTER_FAIL_NONE 0
#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1
#define CLUSTER_FAIL_MINORITY_PARTITION 2

/* Redirection errors returned by getNodeByQuery(). */
#define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */
Expand Down
39 changes: 37 additions & 2 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,7 @@ void clusterInit(void) {
server.cluster->myself = NULL;
server.cluster->currentEpoch = 0;
server.cluster->state = CLUSTER_FAIL;
server.cluster->fail_reason = CLUSTER_FAIL_NONE;
server.cluster->size = 0;
server.cluster->todo_before_sleep = 0;
server.cluster->nodes = dictCreate(&clusterNodesDictType);
Expand Down Expand Up @@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) {
case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break;
case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break;
case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break;
default: msg = "Unknown reason code."; break;
default: serverPanic("Unknown cant failover reason code.");
}
lastlog_time = time(NULL);
serverLog(LL_NOTICE, "Currently unable to failover: %s", msg);
Expand Down Expand Up @@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) {
* Cluster state evaluation function
* -------------------------------------------------------------------------- */

void clusterLogFailReason(int reason) {
if (reason == CLUSTER_FAIL_NONE) return;

char *msg;
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
switch (reason) {
case CLUSTER_FAIL_NOT_FULL_COVERAGE:
msg = "At least one hash slot is not served by any available node. "
"Please check the 'cluster-require-full-coverage' configuration.";
break;
case CLUSTER_FAIL_MINORITY_PARTITION:
msg = "I am part of a minority partition.";
break;
default: serverPanic("Unknown fail reason code.");
}
serverLog(LL_WARNING, "Cluster is currently down: %s", msg);
}

/* The following are defines that are only used in the evaluation function
* and are based on heuristics. Actually the main point about the rejoin and
* writable delay is that they should be a few orders of magnitude larger
Expand All @@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) {
#define CLUSTER_WRITABLE_DELAY 2000

void clusterUpdateState(void) {
int j, new_state;
int j, new_state, new_reason;
int reachable_primaries = 0;
static mstime_t among_minority_time;
static mstime_t first_call_time = 0;
Expand All @@ -5392,12 +5410,14 @@ void clusterUpdateState(void) {
/* Start assuming the state is OK. We'll turn it into FAIL if there
* are the right conditions. */
new_state = CLUSTER_OK;
new_reason = CLUSTER_FAIL_NONE;

/* Check if all the slots are covered. */
if (server.cluster_require_full_coverage) {
for (j = 0; j < CLUSTER_SLOTS; j++) {
if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) {
new_state = CLUSTER_FAIL;
new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE;
break;
}
}
Expand Down Expand Up @@ -5432,6 +5452,7 @@ void clusterUpdateState(void) {

if (reachable_primaries < needed_quorum) {
new_state = CLUSTER_FAIL;
new_reason = CLUSTER_FAIL_MINORITY_PARTITION;
among_minority_time = mstime();
}
}
Expand All @@ -5455,7 +5476,21 @@ void clusterUpdateState(void) {
serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s",
new_state == CLUSTER_OK ? "ok" : "fail");
server.cluster->state = new_state;

/* Cluster state changes from ok to fail, print a log. */
if (new_state == CLUSTER_FAIL) {
clusterLogFailReason(new_reason);
server.cluster->fail_reason = new_reason;
}
}

/* Cluster state is still fail, but the reason has changed, print a log. */
if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) {
clusterLogFailReason(new_reason);
server.cluster->fail_reason = new_reason;
}

if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE;
}

/* This function is called after the node startup in order to verify that data
Expand Down
1 change: 1 addition & 0 deletions src/cluster_legacy.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ struct clusterState {
clusterNode *myself; /* This node */
uint64_t currentEpoch;
int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */
int fail_reason; /* Why the cluster state changes to fail. */
int size; /* Num of primary nodes with at least one slot */
dict *nodes; /* Hash table of name -> clusterNode structures */
dict *shards; /* Hash table of shard_id -> list (of nodes) structures */
Expand Down
23 changes: 23 additions & 0 deletions tests/unit/cluster/info.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" {
}

} ;# start_cluster

# Verify that the cluster logs the fail reason on the ok->fail transition,
# and logs again when the reason changes while the state is still fail.
# Uses a short cluster-node-timeout so failure detection is fast.
start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
    test "fail reason changed" {
        # Kill one primary, so the cluster fail with not-full-coverage.
        pause_process [srv 0 pid]
        wait_for_condition 1000 50 {
            [CI 1 cluster_state] eq {fail} &&
            [CI 2 cluster_state] eq {fail}
        } else {
            fail "Cluster doesn't fail"
        }
        # Both surviving nodes must have logged the not-full-coverage reason.
        verify_log_message -1 "*At least one hash slot is not served by any available node*" 0
        verify_log_message -2 "*At least one hash slot is not served by any available node*" 0

        # Kill one more primary, so the cluster fail with minority-partition.
        # The last node is now alone, so the reason changes and a new log
        # line is expected even though the state is still fail.
        pause_process [srv -1 pid]
        wait_for_log_messages -2 {"*minority partition*"} 0 1000 50

        # Restore both primaries so the cluster converges back to ok and
        # later tests start from a healthy state.
        resume_process [srv 0 pid]
        resume_process [srv -1 pid]
        wait_for_cluster_state ok
    }
}
Loading