diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c index 96e02094f4d..db488ffb34a 100644 --- a/daemons/pacemakerd/pacemakerd.c +++ b/daemons/pacemakerd/pacemakerd.c @@ -48,8 +48,14 @@ static bool global_keep_tracking = false; static const char *local_name = NULL; static uint32_t local_nodeid = 0; static crm_trigger_t *shutdown_trigger = NULL; +static crm_trigger_t *startup_trigger = NULL; static const char *pid_file = PCMK_RUN_DIR "/pacemaker.pid"; +static const char *pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_INIT; +static gboolean running_with_sbd = FALSE; +static gboolean first_state_query_seen = FALSE; +static gboolean shutdown_complete_state_reported = FALSE; + typedef struct pcmk_child_s { int pid; long flag; @@ -444,6 +450,7 @@ pcmk_shutdown_worker(gpointer user_data) if (phase == 0) { crm_notice("Shutting down Pacemaker"); phase = max; + pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN; } for (; phase > 0; phase--) { @@ -497,6 +504,10 @@ pcmk_shutdown_worker(gpointer user_data) /* send_cluster_id(); */ crm_notice("Shutdown complete"); + pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE; + if (running_with_sbd && !shutdown_complete_state_reported) { + return TRUE; + } { const char *delay = daemon_option("shutdown_delay"); @@ -563,35 +574,86 @@ pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) crm_client_t *c = crm_client_get(qbc); xmlNode *msg = crm_ipcs_recv(c, data, size, &id, &flags); - crm_ipcs_send_ack(c, id, flags, "ack", __FUNCTION__, __LINE__); - if (msg == NULL) { - return 0; + if (msg != NULL) { + task = crm_element_value(msg, F_CRM_TASK); } - task = crm_element_value(msg, F_CRM_TASK); - if (crm_str_eq(task, CRM_OP_QUIT, TRUE)) { - /* Time to quit */ - crm_notice("Shutting down in response to ticket %s (%s)", - crm_element_value(msg, F_CRM_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN)); - pcmk_shutdown(15); + if (crm_str_eq(task, CRM_OP_PING, TRUE)) { + const char *value = NULL; + xmlNode *ping = NULL; + xmlNode *reply = NULL; + + /* Pinged for status */ + crm_trace("Pinged from %s.%s", + crm_element_value(msg, F_CRM_ORIGIN), + crm_element_value(msg, F_CRM_SYS_FROM)); + first_state_query_seen = TRUE; + ping = create_xml_node(NULL, XML_CRM_TAG_PING); + value = crm_element_value(msg, F_CRM_SYS_TO); + crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value); + crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state); + crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); + reply = create_reply(msg, ping); + free_xml(ping); + if (reply) { + const char *local_name = get_local_node_name(); + + if ((crm_element_value(reply, F_CRM_HOST_FROM) == NULL) && + local_name) { + crm_xml_add(reply, F_CRM_HOST_FROM, local_name); + } + if (crm_ipcs_send(c, id, reply, crm_ipc_server_event) <= 0) { + crm_err("Failed sending ping-reply"); + } + free_xml(reply); + } else { + crm_err("Failed building ping-reply"); + } + if (crm_str_eq(pacemakerd_state, + XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE, TRUE)) { + sleep(5); /* get out message - less ugly alternative? */ + shutdown_complete_state_reported = TRUE; + if (shutdown_trigger) { + mainloop_set_trigger(shutdown_trigger); + } + } else if (crm_str_eq(pacemakerd_state, + XML_PING_ATTR_PACEMAKERDSTATE_WAITPING, + TRUE)) { + pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS; + mainloop_set_trigger(startup_trigger); + } + } else { + crm_ipcs_send_ack(c, id, flags, "ack", __FUNCTION__, __LINE__); - } else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) { - /* Send to everyone */ - struct iovec *iov; - int id = 0; - const char *name = NULL; + if (msg == NULL) { + return 0; + } - crm_element_value_int(msg, XML_ATTR_ID, &id); - name = crm_element_value(msg, XML_ATTR_UNAME); - crm_notice("Instructing peers to remove references to node %s/%u", name, id); + if (crm_str_eq(task, CRM_OP_QUIT, TRUE)) { + /* Time to quit */ + crm_notice("Shutting down in response to ticket %s (%s)", + crm_element_value(msg, F_CRM_REFERENCE), + crm_element_value(msg, F_CRM_ORIGIN)); + pcmk_shutdown(15); - iov = calloc(1, sizeof(struct iovec)); - iov->iov_base = dump_xml_unformatted(msg); - iov->iov_len = 1 + strlen(iov->iov_base); - send_cpg_iov(iov); + } else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) { + /* Send to everyone */ + struct iovec *iov; + int id = 0; + const char *name = NULL; - } else { - update_process_clients(c); + crm_element_value_int(msg, XML_ATTR_ID, &id); + name = crm_element_value(msg, XML_ATTR_UNAME); + crm_notice("Instructing peers to remove references to node %s/%u", name, id); + + iov = calloc(1, sizeof(struct iovec)); + iov->iov_base = dump_xml_unformatted(msg); + iov->iov_len = 1 + strlen(iov->iov_base); + send_cpg_iov(iov); + + } else { + update_process_clients(c); + } } free_xml(msg); @@ -1051,8 +1113,8 @@ find_and_track_existing_processes(void) return (tracking > INT_MAX) ? INT_MAX : tracking; } -static void -init_children_processes(void) +static gboolean +init_children_processes(gpointer user_data) { int start_seq = 1, lpc = 0; static int max = SIZEOF(pcmk_children); @@ -1078,6 +1140,8 @@ init_children_processes(void) * This may be useful for the daemons to know */ setenv("PCMK_respawned", "true", 1); + pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_RUNNING; + return TRUE; } static void @@ -1356,6 +1420,7 @@ main(int argc, char **argv) if(pcmk_locate_sbd() > 0) { setenv("PCMK_watchdog", "true", 1); + running_with_sbd = TRUE; } else { setenv("PCMK_watchdog", "false", 1); } @@ -1394,7 +1459,13 @@ main(int argc, char **argv) mainloop_add_signal(SIGTERM, pcmk_shutdown); mainloop_add_signal(SIGINT, pcmk_shutdown); - init_children_processes(); + if (running_with_sbd) { + pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_WAITPING; + startup_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, init_children_processes, NULL); + } else { + pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS; + init_children_processes(NULL); + } crm_notice("Pacemaker daemon successfully started and accepting connections"); g_main_loop_run(mainloop); diff --git a/include/crm/msg_xml.h b/include/crm/msg_xml.h index d56e40c6379..24696016dd7 100644 --- a/include/crm/msg_xml.h +++ b/include/crm/msg_xml.h @@ -123,6 +123,13 @@ extern "C" { # define XML_PING_ATTR_STATUS "result" # define XML_PING_ATTR_SYSFROM "crm_subsystem" # define XML_PING_ATTR_CRMDSTATE "crmd_state" +# define XML_PING_ATTR_PACEMAKERDSTATE "pacemakerd_state" +# define XML_PING_ATTR_PACEMAKERDSTATE_INIT "init" +# define XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS "starting_daemons" +# define XML_PING_ATTR_PACEMAKERDSTATE_WAITPING "wait_for_ping" +# define XML_PING_ATTR_PACEMAKERDSTATE_RUNNING "running" +# define XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN "shutting_down" +# define XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE "shutdown_complete" # define XML_TAG_FRAGMENT "cib_fragment" diff --git a/tools/crmadmin.c b/tools/crmadmin.c index 41bbe24f4ba..d4b6964afbc 100644 --- a/tools/crmadmin.c +++ b/tools/crmadmin.c @@ -33,6 +33,7 @@ static int message_timeout_ms = 30 * 1000; static GMainLoop *mainloop = NULL; static crm_ipc_t *crmd_channel = NULL; +static crm_ipc_t *pacemakerd_channel = NULL; static char *admin_uuid = NULL; gboolean do_init(void); @@ -46,6 +47,7 @@ static gboolean BE_VERBOSE = FALSE; static int expected_responses = 1; static gboolean BASH_EXPORT = FALSE; static gboolean DO_HEALTH = FALSE; +static gboolean DO_PACEMAKERD_HEALTH = FALSE; static gboolean DO_RESET = FALSE; static gboolean DO_RESOURCE = FALSE; static gboolean DO_ELECT_DC = FALSE; @@ -70,6 +72,8 @@ static struct crm_option long_options[] = { /* daemon options */ {"status", 1, 0, 'S', "Display the status of the specified node." }, {"-spacer-", 1, 0, '-', "\n\tResult is the node's internal FSM state which can be useful for debugging\n"}, + {"pacemakerd",0, 0, 'P', "Display the status of local pacemakerd."}, + {"-spacer-", 1, 0, '-', "\n\tResult is the state of the sub-daemons watched by pacemakerd\n"}, {"dc_lookup", 0, 0, 'D', "Display the uname of the node co-ordinating the cluster."}, {"-spacer-", 1, 0, '-', "\n\tThis is an internal detail and is rarely useful to administrators except when deciding on which node to examine the logs.\n"}, {"nodes", 0, 0, 'N', "\tDisplay the uname of all member nodes"}, @@ -142,6 +146,9 @@ main(int argc, char **argv) case 'q': BE_SILENT = TRUE; break; + case 'P': + DO_PACEMAKERD_HEALTH = TRUE; + break; case 'S': DO_HEALTH = TRUE; crm_trace("Option %c => %s", flag, optarg); @@ -215,12 +222,12 @@ do_work(void) xmlNode *msg_data = NULL; gboolean all_is_good = TRUE; - if (DO_HEALTH == TRUE) { + if ((DO_HEALTH == TRUE) || (DO_PACEMAKERD_HEALTH == TRUE)) { crm_trace("Querying the system"); sys_to = CRM_SYSTEM_DC; - if (dest_node != NULL) { + if ((DO_HEALTH == TRUE) && (dest_node != NULL)) { sys_to = CRM_SYSTEM_CRMD; crmd_operation = CRM_OP_PING; @@ -228,6 +235,13 @@ do_work(void) expected_responses = 1; } + } else if (DO_PACEMAKERD_HEALTH == TRUE) { + sys_to = CRM_SYSTEM_MCP; + crmd_operation = CRM_OP_PING; + + if (BE_VERBOSE) { + expected_responses = 1; + } } else { crm_info("Cluster-wide health not available yet"); all_is_good = FALSE; @@ -286,7 +300,7 @@ do_work(void) } /* send it */ - if (crmd_channel == NULL) { + if ((DO_PACEMAKERD_HEALTH?pacemakerd_channel:crmd_channel) == NULL) { crm_err("The IPC connection is not valid, cannot send anything"); return -1; } @@ -303,7 +317,8 @@ do_work(void) xmlNode *cmd = create_request(crmd_operation, msg_data, dest_node, sys_to, crm_system_name, admin_uuid); - crm_ipc_send(crmd_channel, cmd, 0, 0, NULL); + crm_ipc_send(DO_PACEMAKERD_HEALTH?pacemakerd_channel:crmd_channel, + cmd, 0, 0, NULL); free_xml(cmd); } @@ -329,21 +344,39 @@ struct ipc_client_callbacks crm_callbacks = { gboolean do_init(void) { - mainloop_io_t *source = + mainloop_io_t *crmd_source = mainloop_add_ipc_client(CRM_SYSTEM_CRMD, G_PRIORITY_DEFAULT, 0, NULL, &crm_callbacks); + mainloop_io_t *pacemakerd_source = + mainloop_add_ipc_client(CRM_SYSTEM_MCP, G_PRIORITY_DEFAULT, 0, NULL, &crm_callbacks); admin_uuid = crm_getpid_s(); - crmd_channel = mainloop_get_ipc_client(source); + crmd_channel = mainloop_get_ipc_client(crmd_source); + pacemakerd_channel = mainloop_get_ipc_client(pacemakerd_source); - if (DO_RESOURCE || DO_RESOURCE_LIST || DO_NODE_LIST) { + if (DO_RESOURCE || DO_RESOURCE_LIST || DO_NODE_LIST || DO_PACEMAKERD_HEALTH) { return TRUE; - } else if (crmd_channel != NULL) { - xmlNode *xml = create_hello_message(admin_uuid, crm_system_name, "0", "1"); + } else { + int hellos = 0; - crm_ipc_send(crmd_channel, xml, 0, 0, NULL); - return TRUE; + if (crmd_channel != NULL) { + xmlNode *xml = create_hello_message(admin_uuid, crm_system_name, "0", "1"); + + crm_ipc_send(crmd_channel, xml, 0, 0, NULL); + hellos++; + } +#if 0 + if (pacemakerd_channel != NULL) { + xmlNode *xml = create_hello_message(admin_uuid, crm_system_name, "0", "1"); + + crm_ipc_send(pacemakerd_channel, xml, 0, 0, NULL); + hellos++; + } +#endif + if (hellos == 1) { + return TRUE; + } } return FALSE; } @@ -394,14 +427,18 @@ admin_msg_callback(const char *buffer, ssize_t length, gpointer userdata) } else if (validate_crm_message(xml, crm_system_name, admin_uuid, XML_ATTR_RESPONSE) == FALSE) { crm_trace("Message was not a CRM response. Discarding."); + printf("Validation of response failed\n"); - } else if (DO_HEALTH) { + } else if (DO_HEALTH || DO_PACEMAKERD_HEALTH) { xmlNode *data = get_message_xml(xml, F_CRM_DATA); - const char *state = crm_element_value(data, XML_PING_ATTR_CRMDSTATE); + const char *state = DO_PACEMAKERD_HEALTH? + crm_element_value(data, XML_PING_ATTR_PACEMAKERDSTATE): + crm_element_value(data, XML_PING_ATTR_CRMDSTATE); + const char *host_from = crm_element_value(xml, F_CRM_HOST_FROM); printf("Status of %s@%s: %s (%s)\n", crm_element_value(data, XML_PING_ATTR_SYSFROM), - crm_element_value(xml, F_CRM_HOST_FROM), + host_from?host_from:"local", state, crm_element_value(data, XML_PING_ATTR_STATUS)); if (BE_SILENT && state != NULL) {