From 3a04de02312c4271373aa6fae3af9ff5a82156e0 Mon Sep 17 00:00:00 2001 From: Cai Chen Date: Mon, 17 Jun 2024 10:58:40 -0400 Subject: [PATCH] Sync from server repo (7057bbc327d) --- commands/cmd_add_node.go | 6 ++ commands/cmd_create_db.go | 2 +- commands/cmd_list_all_nodes.go | 5 + commands/cmd_restart_node.go | 8 +- commands/cmd_start_db.go | 23 +++-- commands/cmd_start_subcluster.go | 6 ++ commands/vcluster_config.go | 8 +- vclusterops/add_node.go | 12 ++- vclusterops/add_subcluster.go | 2 +- vclusterops/alter_subcluster_type.go | 4 +- vclusterops/cluster_op.go | 1 + vclusterops/cluster_op_engine.go | 6 ++ vclusterops/cmd_type.go | 86 +++++++++++++++++ vclusterops/create_db.go | 13 +-- vclusterops/fetch_database.go | 2 +- vclusterops/fetch_node_state.go | 6 -- vclusterops/fetch_nodes_details.go | 2 +- vclusterops/get_config_parameter.go | 6 +- vclusterops/helpers.go | 19 ++++ vclusterops/https_find_subcluster_op.go | 17 +--- vclusterops/https_get_up_nodes_op.go | 42 +++------ vclusterops/https_poll_node_state_op.go | 62 +++--------- vclusterops/https_poll_node_state_op_test.go | 10 +- vclusterops/https_startup_command_op.go | 6 +- vclusterops/https_sync_catalog_op.go | 18 +--- vclusterops/install_packages.go | 4 +- vclusterops/manage_connection_draining.go | 4 +- vclusterops/nma_vertica_version_op.go | 20 ++-- vclusterops/promote_sandbox_to_main.go | 2 +- vclusterops/re_ip.go | 2 +- vclusterops/remove_node.go | 25 ++++- vclusterops/remove_subcluster.go | 10 +- vclusterops/rename_subcluster.go | 4 +- vclusterops/replication.go | 4 +- vclusterops/restore_points.go | 2 +- vclusterops/sandbox.go | 4 +- vclusterops/scrutinize.go | 5 +- vclusterops/set_config_parameter.go | 6 +- vclusterops/start_db.go | 49 ++++++++-- vclusterops/start_node.go | 99 +++++++++++++------- vclusterops/start_subcluster.go | 13 ++- vclusterops/state_poller.go | 3 +- vclusterops/stop_db.go | 2 +- vclusterops/stop_node.go | 6 +- vclusterops/stop_subcluster.go | 2 +- vclusterops/unsandbox.go | 
8 +- vclusterops/util/util.go | 17 ++++ vclusterops/util/util_test.go | 20 ++++ vclusterops/vcluster_database_options.go | 47 ++-------- vclusterops/vlog/printer.go | 4 + 50 files changed, 463 insertions(+), 271 deletions(-) create mode 100644 vclusterops/cmd_type.go diff --git a/commands/cmd_add_node.go b/commands/cmd_add_node.go index c75be62..ea49ecb 100644 --- a/commands/cmd_add_node.go +++ b/commands/cmd_add_node.go @@ -123,6 +123,12 @@ func (c *CmdAddNode) setLocalFlags(cmd *cobra.Command) { "", "Comma-separated list of node names that exist in the cluster", ) + cmd.Flags().IntVar( + &c.addNodeOptions.TimeOut, + "add-node-timeout", + util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", util.DefaultTimeoutSeconds), + "The timeout to wait for the nodes to add", + ) } func (c *CmdAddNode) Parse(inputArgv []string, logger vlog.Printer) error { diff --git a/commands/cmd_create_db.go b/commands/cmd_create_db.go index 2313d23..cc84dcd 100644 --- a/commands/cmd_create_db.go +++ b/commands/cmd_create_db.go @@ -208,7 +208,7 @@ func (c *CmdCreateDB) setLocalFlags(cmd *cobra.Command) { cmd.Flags().IntVar( &c.createDBOptions.TimeoutNodeStartupSeconds, "startup-timeout", - util.DefaultTimeoutSeconds, + util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", util.DefaultTimeoutSeconds), "The timeout in seconds to wait for the nodes to start", ) } diff --git a/commands/cmd_list_all_nodes.go b/commands/cmd_list_all_nodes.go index f8a2fb2..7ae6215 100644 --- a/commands/cmd_list_all_nodes.go +++ b/commands/cmd_list_all_nodes.go @@ -119,6 +119,11 @@ func (c *CmdListAllNodes) Run(vcc vclusterops.ClusterCommands) error { c.writeCmdOutputToFile(globals.file, bytes, vcc.GetLog()) vcc.LogInfo("Node states: ", "nodeStates", string(bytes)) + // if writing into stdout, add a new line + // otherwise, the successful message may be wrapped into the same line of the node state output + if c.output == "" { + fmt.Println("") + } vcc.DisplayInfo("Successfully listed all nodes") return nil } diff --git 
a/commands/cmd_restart_node.go b/commands/cmd_restart_node.go index adc4893..ff2f90f 100644 --- a/commands/cmd_restart_node.go +++ b/commands/cmd_restart_node.go @@ -104,7 +104,7 @@ func (c *CmdStartNodes) setLocalFlags(cmd *cobra.Command) { cmd.Flags().IntVar( &c.startNodesOptions.StatePollingTimeout, "timeout", - util.DefaultTimeoutSeconds, + util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", util.DefaultTimeoutSeconds), "The timeout (in seconds) to wait for polling node state operation", ) @@ -167,6 +167,12 @@ func (c *CmdStartNodes) Run(vcc vclusterops.ClusterCommands) error { return err } + // all nodes unreachable, nothing need to be done. + if len(options.Nodes) == 0 { + vcc.DisplayInfo("No reachable nodes to start") + return nil + } + var hostToStart []string for _, ip := range options.Nodes { hostToStart = append(hostToStart, ip) diff --git a/commands/cmd_start_db.go b/commands/cmd_start_db.go index f44dfbe..5a88a06 100644 --- a/commands/cmd_start_db.go +++ b/commands/cmd_start_db.go @@ -221,6 +221,7 @@ func (c *CmdStartDB) Run(vcc vclusterops.ClusterCommands) error { } dbConfig, readConfigErr := readConfig() if readConfigErr == nil { + options.ReadFromConfig = true if options.Sandbox != util.MainClusterSandbox || options.MainCluster { options.RawHosts = filterInputHosts(options, dbConfig) } @@ -238,6 +239,12 @@ func (c *CmdStartDB) Run(vcc vclusterops.ClusterCommands) error { return err } + // all nodes unreachable + if len(options.Hosts) == 0 { + vcc.DisplayInfo("No reachable nodes to start database %s", options.DBName) + return nil + } + msg := fmt.Sprintf("Started database %s", options.DBName) if options.Sandbox != "" { sandboxMsg := fmt.Sprintf(" on sandbox %s", options.Sandbox) @@ -253,12 +260,7 @@ func (c *CmdStartDB) Run(vcc vclusterops.ClusterCommands) error { // for Eon database, update config file to fill nodes' subcluster information if readConfigErr == nil && options.IsEon { - // write db info to vcluster config file - vdb.FirstStartAfterRevive = 
false - err = writeConfig(vdb, true /*forceOverwrite*/) - if err != nil { - vcc.DisplayWarning("fail to update config file, details: %s", err) - } + c.UpdateConfigFileForEon(vdb, vcc) } // write config parameters to vcluster config param file @@ -270,6 +272,15 @@ func (c *CmdStartDB) Run(vcc vclusterops.ClusterCommands) error { return nil } +func (c *CmdStartDB) UpdateConfigFileForEon(vdb *vclusterops.VCoordinationDatabase, vcc vclusterops.ClusterCommands) { + // write db info to vcluster config file + vdb.FirstStartAfterRevive = false + err := writeConfig(vdb, true /*forceOverwrite*/) + if err != nil { + vcc.DisplayWarning("fail to update config file, details: %s", err) + } +} + // SetDatabaseOptions will assign a vclusterops.DatabaseOptions instance to the one in CmdStartDB func (c *CmdStartDB) SetDatabaseOptions(opt *vclusterops.DatabaseOptions) { c.startDBOptions.DatabaseOptions = *opt diff --git a/commands/cmd_start_subcluster.go b/commands/cmd_start_subcluster.go index 15f9f82..2d7419c 100644 --- a/commands/cmd_start_subcluster.go +++ b/commands/cmd_start_subcluster.go @@ -133,6 +133,12 @@ func (c *CmdStartSubcluster) Run(vcc vclusterops.ClusterCommands) error { return err } + // all nodes unreachable, nothing need to be done. 
+ if len(options.Nodes) == 0 { + vcc.DisplayInfo("No reachable nodes to start in subcluster %s", options.SCName) + return nil + } + vcc.DisplayInfo("Successfully started subcluster %s for database %s", options.SCName, options.DBName) diff --git a/commands/vcluster_config.go b/commands/vcluster_config.go index 0cb5d36..fbeb894 100644 --- a/commands/vcluster_config.go +++ b/commands/vcluster_config.go @@ -193,13 +193,19 @@ func writeConfig(vdb *vclusterops.VCoordinationDatabase, forceOverwrite bool) er return fmt.Errorf("configuration file path is empty") } + return WriteConfigToPath(vdb, dbOptions.ConfigPath, forceOverwrite) +} + +func WriteConfigToPath(vdb *vclusterops.VCoordinationDatabase, + configPath string, + forceOverwrite bool) error { dbConfig, err := readVDBToDBConfig(vdb) if err != nil { return err } // update db config with the given database info - err = dbConfig.write(dbOptions.ConfigPath, forceOverwrite) + err = dbConfig.write(configPath, forceOverwrite) if err != nil { return err } diff --git a/vclusterops/add_node.go b/vclusterops/add_node.go index 614ff68..be2b0f8 100644 --- a/vclusterops/add_node.go +++ b/vclusterops/add_node.go @@ -46,6 +46,9 @@ type VAddNodeOptions struct { // Names of the existing nodes in the cluster. This option can be // used to remove partially added nodes from catalog. 
ExpectedNodeNames []string + + // timeout for polling nodes in seconds when we add Nodes + TimeOut int } func VAddNodeOptionsFactory() VAddNodeOptions { @@ -60,6 +63,10 @@ func (options *VAddNodeOptions) setDefaultValues() { options.DatabaseOptions.setDefaultValues() options.SkipRebalanceShards = new(bool) + + // try to retrieve the timeout from the environment variable + // otherwise, set the default value (300 seconds) to the timeout + options.TimeOut = util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", util.DefaultTimeoutSeconds) } func (options *VAddNodeOptions) validateEonOptions() error { @@ -70,7 +77,7 @@ func (options *VAddNodeOptions) validateEonOptions() error { } func (options *VAddNodeOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandAddNode, logger) + err := options.validateBaseOptions(AddNodeCmd, logger) if err != nil { return err } @@ -398,10 +405,11 @@ func (vcc VClusterCommands) produceAddNodeInstructions(vdb *VCoordinationDatabas vdb /*db configurations retrieved from a running db*/) nmaStartNewNodesOp := makeNMAStartNodeOpWithVDB(newHosts, options.StartUpConf, vdb) - httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(newHosts, usePassword, username, password) + httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(newHosts, usePassword, username, password, options.TimeOut) if err != nil { return instructions, err } + httpsPollNodeStateOp.cmdType = AddNodeCmd instructions = append(instructions, &nmaStartNewNodesOp, &httpsPollNodeStateOp, diff --git a/vclusterops/add_subcluster.go b/vclusterops/add_subcluster.go index b750be6..f6be071 100644 --- a/vclusterops/add_subcluster.go +++ b/vclusterops/add_subcluster.go @@ -67,7 +67,7 @@ func (options *VAddSubclusterOptions) setDefaultValues() { } func (options *VAddSubclusterOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandAddSubcluster, logger) + err := 
options.validateBaseOptions(AddSubclusterCmd, logger) if err != nil { return err } diff --git a/vclusterops/alter_subcluster_type.go b/vclusterops/alter_subcluster_type.go index 95fdb08..fa80de6 100644 --- a/vclusterops/alter_subcluster_type.go +++ b/vclusterops/alter_subcluster_type.go @@ -72,7 +72,7 @@ func (options *VAlterSubclusterTypeOptions) validateParseOptions(logger vlog.Pri return err } - err = options.validateAuthOptions(commandAlterSubclusterType, logger) + err = options.validateAuthOptions(AlterSubclusterTypeCmd.CmdString(), logger) if err != nil { return err } @@ -89,7 +89,7 @@ func (options *VAlterSubclusterTypeOptions) validateParseOptions(logger vlog.Pri if !options.SCType.IsValid() { return fmt.Errorf("invalid subcluster type: must be 'primary' or 'secondary'") } - return options.validateBaseOptions(commandAlterSubclusterType, logger) + return options.validateBaseOptions(AlterSubclusterTypeCmd, logger) } // analyzeOptions will modify some options based on what is chosen diff --git a/vclusterops/cluster_op.go b/vclusterops/cluster_op.go index 5de3837..7182a2d 100644 --- a/vclusterops/cluster_op.go +++ b/vclusterops/cluster_op.go @@ -277,6 +277,7 @@ func (op *opBase) setupSpinner() { StopFailCharacter: "✘", StopFailMessage: "failed", StopFailColors: []string{"fgRed"}, + Writer: op.logger.Writer, // if nil, writing to stdout } spinner, err := yacspin.New(cfg) if err != nil { diff --git a/vclusterops/cluster_op_engine.go b/vclusterops/cluster_op_engine.go index 4fa26e5..1c37283 100644 --- a/vclusterops/cluster_op_engine.go +++ b/vclusterops/cluster_op_engine.go @@ -55,6 +55,12 @@ func (opEngine *VClusterOpEngine) runWithExecContext(logger vlog.Printer, execCo } } + // display warning if any unreachable hosts detected + if len(opEngine.execContext.unreachableHosts) > 0 { + logger.DisplayWarning("Unreachable host(s) detected, please check the NMA connectivity in %v", + opEngine.execContext.unreachableHosts) + } + return nil } diff --git 
a/vclusterops/cmd_type.go b/vclusterops/cmd_type.go new file mode 100644 index 0000000..96fedf3 --- /dev/null +++ b/vclusterops/cmd_type.go @@ -0,0 +1,86 @@ +package vclusterops + +type CmdType int + +const ( + // command types + CreateDBCmd CmdType = iota + DropDBCmd + StopDBCmd + StartDBCmd + AddNodeCmd + RemoveNodeCmd + StartNodeCmd + StopNodeCmd + RestartNodeCmd + AddSubclusterCmd + RemoveSubclusterCmd + StopSubclusterCmd + StartSubclusterCmd + SandboxSCCmd + UnsandboxSCCmd + ShowRestorePointsCmd + InstallPackagesCmd + ConfigRecoverCmd + ManageConnectionDrainingCmd + SetConfigurationParameterCmd + GetConfigurationParameterCmd + ReplicationStartCmd + PromoteSandboxToMainCmd + FetchNodesDetailsCmd + AlterSubclusterTypeCmd + RenameScCmd + ReIPCmd + ScrutinizeCmd + CreateDBSyncCat + StartDBSyncCat + StopDBSyncCat + StopSCSyncCat + AddNodeSyncCat + StartNodeSyncCat + RemoveNodeSyncCat +) + +var cmdStringMap = map[CmdType]string{ + CreateDBCmd: "create_db", + DropDBCmd: "drop_db", + StopDBCmd: "stop_db", + StartDBCmd: "start_db", + AddNodeCmd: "add_node", + RemoveNodeCmd: "remove_node", + StartNodeCmd: "start_node", + StopNodeCmd: "stop_node", + RestartNodeCmd: "restart_node", + AddSubclusterCmd: "add_subcluster", + RemoveSubclusterCmd: "remove_subcluster", + StopSubclusterCmd: "stop_subcluster", + StartSubclusterCmd: "start_subcluster", + SandboxSCCmd: "sandbox_subcluster", + UnsandboxSCCmd: "unsandbox_subcluster", + ShowRestorePointsCmd: "show_restore_points", + InstallPackagesCmd: "install_packages", + ConfigRecoverCmd: "manage_config_recover", + ManageConnectionDrainingCmd: "manage_connection_draining", + SetConfigurationParameterCmd: "set_configuration_parameter", + ReplicationStartCmd: "replication_start", + PromoteSandboxToMainCmd: "promote_sandbox_to_main", + FetchNodesDetailsCmd: "fetch_nodes_details", + AlterSubclusterTypeCmd: "alter_subcluster_type", + RenameScCmd: "rename_subcluster", + ReIPCmd: "re_ip", + ScrutinizeCmd: "scrutinize", + CreateDBSyncCat: 
"create_db_sync_cat", + StartDBSyncCat: "start_db_sync_cat", + StopDBSyncCat: "stop_db_sync_cat", + StopSCSyncCat: "stop_sc_sync_cat", + AddNodeSyncCat: "add_node_sync_cat", + StartNodeSyncCat: "start_node_sync_cat", + RemoveNodeSyncCat: "remove_node_sync_cat", +} + +func (cmd CmdType) CmdString() string { + if str, ok := cmdStringMap[cmd]; ok { + return str + } + return "unknown_operation" +} diff --git a/vclusterops/create_db.go b/vclusterops/create_db.go index 8859d08..e79f2fa 100644 --- a/vclusterops/create_db.go +++ b/vclusterops/create_db.go @@ -98,7 +98,7 @@ func (options *VCreateDatabaseOptions) setDefaultValues() { func (options *VCreateDatabaseOptions) validateRequiredOptions(logger vlog.Printer) error { // validate base options - err := options.validateBaseOptions(commandCreateDB, logger) + err := options.validateBaseOptions(CreateDBCmd, logger) if err != nil { return err } @@ -106,7 +106,6 @@ func (options *VCreateDatabaseOptions) validateRequiredOptions(logger vlog.Print // validate required parameters with default values if options.Password == nil { options.Password = new(string) - *options.Password = "" logger.Info("no password specified, using none") } @@ -431,11 +430,12 @@ func (vcc VClusterCommands) produceCreateDBBootstrapInstructions( nmaStartNodeOp := makeNMAStartNodeOp(bootstrapHost, options.StartUpConf) - httpsPollBootstrapNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeoutAndCommand(bootstrapHost, true, /* useHTTPPassword */ - options.UserName, options.Password, options.TimeoutNodeStartupSeconds, CreateDBCmd) + httpsPollBootstrapNodeStateOp, err := makeHTTPSPollNodeStateOp(bootstrapHost, true, /* useHTTPPassword */ + options.UserName, options.Password, options.TimeoutNodeStartupSeconds) if err != nil { return instructions, err } + httpsPollBootstrapNodeStateOp.cmdType = CreateDBCmd instructions = append(instructions, &nmaStartNodeOp, @@ -508,11 +508,12 @@ func (vcc VClusterCommands) produceAdditionalCreateDBInstructions(vdb *VCoordina 
username := options.UserName if !options.SkipStartupPolling { - httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeoutAndCommand(hosts, true, username, options.Password, - options.TimeoutNodeStartupSeconds, CreateDBCmd) + httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(hosts, true, username, options.Password, + options.TimeoutNodeStartupSeconds) if err != nil { return instructions, err } + httpsPollNodeStateOp.cmdType = CreateDBCmd instructions = append(instructions, &httpsPollNodeStateOp) } diff --git a/vclusterops/fetch_database.go b/vclusterops/fetch_database.go index 984b1fd..c2a544a 100644 --- a/vclusterops/fetch_database.go +++ b/vclusterops/fetch_database.go @@ -39,7 +39,7 @@ func VRecoverConfigOptionsFactory() VFetchCoordinationDatabaseOptions { } func (options *VFetchCoordinationDatabaseOptions) validateParseOptions(logger vlog.Printer) error { - return options.validateBaseOptions(commandConfigRecover, logger) + return options.validateBaseOptions(ConfigRecoverCmd, logger) } func (options *VFetchCoordinationDatabaseOptions) analyzeOptions() error { diff --git a/vclusterops/fetch_node_state.go b/vclusterops/fetch_node_state.go index d8a5649..d7617a7 100644 --- a/vclusterops/fetch_node_state.go +++ b/vclusterops/fetch_node_state.go @@ -112,12 +112,6 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] } } - // display warning if any unreachable hosts detected - if len(clusterOpEngine.execContext.unreachableHosts) > 0 { - vcc.DisplayWarning("hosts %v are unreachable, please check the NMA connectivity in the hosts", - clusterOpEngine.execContext.unreachableHosts) - } - return nodeStates, nil } diff --git a/vclusterops/fetch_nodes_details.go b/vclusterops/fetch_nodes_details.go index f1c10b8..a055c46 100644 --- a/vclusterops/fetch_nodes_details.go +++ b/vclusterops/fetch_nodes_details.go @@ -85,7 +85,7 @@ func (options *VFetchNodesDetailsOptions) setDefaultValues() { } func (options *VFetchNodesDetailsOptions) 
validateParseOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandFetchNodesDetails, logger) + err := options.validateBaseOptions(FetchNodesDetailsCmd, logger) if err != nil { return err } diff --git a/vclusterops/get_config_parameter.go b/vclusterops/get_config_parameter.go index 7d0440d..8620282 100644 --- a/vclusterops/get_config_parameter.go +++ b/vclusterops/get_config_parameter.go @@ -42,12 +42,12 @@ func VGetConfigurationParameterOptionsFactory() VGetConfigurationParameterOption } func (opt *VGetConfigurationParameterOptions) validateParseOptions(logger vlog.Printer) error { - err := opt.validateBaseOptions(commandGetConfigurationParameter, logger) + err := opt.validateBaseOptions(GetConfigurationParameterCmd, logger) if err != nil { return err } - err = opt.validateAuthOptions(commandGetConfigurationParameter, logger) + err = opt.validateAuthOptions(GetConfigurationParameterCmd.CmdString(), logger) if err != nil { return err } @@ -138,7 +138,7 @@ func (vcc VClusterCommands) produceGetConfigurationParameterInstructions( // up hosts will be filtered by sandbox name in prepare stage of nmaGetConfigurationParameterOp httpsGetUpNodesOp, err := makeHTTPSGetUpNodesWithSandboxOp(options.DBName, options.Hosts, options.usePassword, options.UserName, options.Password, - GetConfigurationParametersCmd, options.Sandbox, assertMainClusterUpNodes) + GetConfigurationParameterCmd, options.Sandbox, assertMainClusterUpNodes) if err != nil { return instructions, err } diff --git a/vclusterops/helpers.go b/vclusterops/helpers.go index 8af5a80..49d918d 100644 --- a/vclusterops/helpers.go +++ b/vclusterops/helpers.go @@ -186,6 +186,12 @@ func getInitiatorHostInCluster(name, sandbox, scname string, vdb *VCoordinationD // getInitiatorHostForReplication returns an initiator that is the first up source host in the main cluster // or a sandbox func getInitiatorHostForReplication(name, sandbox string, hosts []string, vdb *VCoordinationDatabase) ([]string, 
error) { + // the k8s operator uses a service hostname, not an ip address of a node in the cluster + // since the hostname will not match any node in the cluster we need to skip the below logic + // this is ok since the operator has already chosen an appropriate "initiator" + if util.IsK8sEnvironment() { + return hosts, nil + } // source hosts will be : // 1. up hosts from the main subcluster if the sandbox is empty // 2. up hosts from the sandbox if the sandbox is specified @@ -462,3 +468,16 @@ func (vcc *VClusterCommands) doReIP(options *DatabaseOptions, scName string, return nil } + +func (vcc *VClusterCommands) getUnreachableHosts(options *DatabaseOptions) ([]string, error) { + var nmaHealthInstructions []clusterOp + nmaHealthOp := makeNMAHealthOpSkipUnreachable(options.Hosts) + nmaHealthInstructions = []clusterOp{&nmaHealthOp} + certs := httpsCerts{key: options.Key, cert: options.Cert, caCert: options.CaCert} + opEng := makeClusterOpEngine(nmaHealthInstructions, &certs) + err := opEng.run(vcc.Log) + if err != nil { + return nil, err + } + return opEng.execContext.unreachableHosts, nil +} diff --git a/vclusterops/https_find_subcluster_op.go b/vclusterops/https_find_subcluster_op.go index e73a265..8909edc 100644 --- a/vclusterops/https_find_subcluster_op.go +++ b/vclusterops/https_find_subcluster_op.go @@ -20,17 +20,12 @@ import ( "fmt" ) -const ( - AddNodeCmd CommandType = iota - RemoveSubclusterCmd -) - type httpsFindSubclusterOp struct { opBase opHTTPSBase scName string ignoreNotFound bool - cmdType CommandType + cmdType CmdType } // makeHTTPSFindSubclusterOp initializes an op to find @@ -39,7 +34,7 @@ type httpsFindSubclusterOp struct { // the given cluster name is not found. 
func makeHTTPSFindSubclusterOp(hosts []string, useHTTPPassword bool, userName string, httpsPassword *string, scName string, - ignoreNotFound bool, cmdType CommandType, + ignoreNotFound bool, cmdType CmdType, ) (httpsFindSubclusterOp, error) { op := httpsFindSubclusterOp{} op.name = "HTTPSFindSubclusterOp" @@ -193,14 +188,12 @@ func (op *httpsFindSubclusterOp) processSubclusters(subclusterResp scResp, execC } if isSandboxed { - switch op.cmdType { - case AddNodeCmd: + if op.cmdType == AddNodeCmd { return fmt.Errorf(`[%s] cannot add node into a sandboxed subcluster`, op.name) - case RemoveSubclusterCmd: + } else if op.cmdType == RemoveSubclusterCmd { return fmt.Errorf(`[%s] cannot remove a sandboxed subcluster, must unsandbox the subcluster first`, op.name) - default: - return fmt.Errorf(`[%s] sandbox handling in the operation is not implemented`, op.name) } + return fmt.Errorf(`[%s] sandbox handling in the operation is not implemented`, op.name) } return nil diff --git a/vclusterops/https_get_up_nodes_op.go b/vclusterops/https_get_up_nodes_op.go index 869130d..9b6b1c1 100644 --- a/vclusterops/https_get_up_nodes_op.go +++ b/vclusterops/https_get_up_nodes_op.go @@ -24,35 +24,19 @@ import ( "github.com/vertica/vcluster/vclusterops/util" ) -const ( - SandboxCmd = iota - StartNodeCommand - StopDBCmd - ScrutinizeCmd - AddSubclusterCmd - StopSubclusterCmd - InstallPackageCmd - UnsandboxCmd - ManageConnectionDrainingCmd - SetConfigurationParametersCmd - GetConfigurationParametersCmd -) - -type CommandType int - type httpsGetUpNodesOp struct { opBase opHTTPSBase DBName string noUpHostsOk bool - cmdType CommandType + cmdType CmdType sandbox string mainCluster bool scName string } func makeHTTPSGetUpNodesOp(dbName string, hosts []string, - useHTTPPassword bool, userName string, httpsPassword *string, cmdType CommandType, + useHTTPPassword bool, userName string, httpsPassword *string, cmdType CmdType, ) (httpsGetUpNodesOp, error) { op := httpsGetUpNodesOp{} op.name = 
"HTTPSGetUpNodesOp" @@ -76,7 +60,7 @@ func makeHTTPSGetUpNodesOp(dbName string, hosts []string, } func makeHTTPSGetUpNodesWithSandboxOp(dbName string, hosts []string, - useHTTPPassword bool, userName string, httpsPassword *string, cmdType CommandType, + useHTTPPassword bool, userName string, httpsPassword *string, cmdType CmdType, sandbox string, mainCluster bool) (httpsGetUpNodesOp, error) { op, err := makeHTTPSGetUpNodesOp(dbName, hosts, useHTTPPassword, userName, httpsPassword, cmdType) op.sandbox = sandbox @@ -85,7 +69,7 @@ func makeHTTPSGetUpNodesWithSandboxOp(dbName string, hosts []string, } func makeHTTPSGetUpScNodesOp(dbName string, hosts []string, - useHTTPPassword bool, userName string, httpsPassword *string, cmdType CommandType, + useHTTPPassword bool, userName string, httpsPassword *string, cmdType CmdType, scName string) (httpsGetUpNodesOp, error) { op, err := makeHTTPSGetUpNodesOp(dbName, hosts, useHTTPPassword, userName, httpsPassword, cmdType) op.scName = scName @@ -196,7 +180,7 @@ func (op *httpsGetUpNodesOp) processResult(execContext *opEngineExecContext) err return allErrs } - if op.cmdType == UnsandboxCmd { + if op.cmdType == UnsandboxSCCmd { op.collectUnsandboxingHosts(nodesStates, sandboxInfo) } @@ -218,12 +202,12 @@ func (op *httpsGetUpNodesOp) processResult(execContext *opEngineExecContext) err } // Return true if all the results need to be scanned to figure out UP hosts -func isCompleteScanRequired(cmdType CommandType) bool { - return cmdType == SandboxCmd || cmdType == StopDBCmd || - cmdType == UnsandboxCmd || cmdType == StopSubclusterCmd || +func isCompleteScanRequired(cmdType CmdType) bool { + return cmdType == SandboxSCCmd || cmdType == StopDBCmd || + cmdType == UnsandboxSCCmd || cmdType == StopSubclusterCmd || cmdType == ManageConnectionDrainingCmd || - cmdType == SetConfigurationParametersCmd || - cmdType == GetConfigurationParametersCmd + cmdType == SetConfigurationParameterCmd || + cmdType == GetConfigurationParameterCmd } func (op 
*httpsGetUpNodesOp) finalize(_ *opEngineExecContext) error { @@ -251,7 +235,7 @@ func (op *httpsGetUpNodesOp) processHostLists(upHosts mapset.Set[string], upScIn op.logger.PrintError(`[%s] There are no UP nodes in subcluster %s. The subcluster is already down`, op.name, op.scName) return false, nil } - if op.sandbox != "" && op.cmdType != UnsandboxCmd { + if op.sandbox != "" && op.cmdType != UnsandboxSCCmd { upSandbox := op.checkSandboxUp(sandboxInfo, op.sandbox) if !upSandbox { op.logger.PrintError(`[%s] There are no UP nodes in the sandbox %s. The db %s is already down`, op.name, op.sandbox, op.DBName) @@ -362,8 +346,8 @@ func (op *httpsGetUpNodesOp) collectUpHosts(nodesStates nodesStateInfo, host str func (op *httpsGetUpNodesOp) requiresSandboxInfo() bool { return op.cmdType == ManageConnectionDrainingCmd || - op.cmdType == SetConfigurationParametersCmd || - op.cmdType == GetConfigurationParametersCmd || + op.cmdType == SetConfigurationParameterCmd || + op.cmdType == GetConfigurationParameterCmd || op.cmdType == StopDBCmd } diff --git a/vclusterops/https_poll_node_state_op.go b/vclusterops/https_poll_node_state_op.go index 1eeb267..707a74b 100644 --- a/vclusterops/https_poll_node_state_op.go +++ b/vclusterops/https_poll_node_state_op.go @@ -18,7 +18,6 @@ package vclusterops import ( "errors" "fmt" - "strconv" "github.com/vertica/vcluster/vclusterops/util" ) @@ -27,31 +26,12 @@ import ( // 30 seconds is long enough for normal http request. 
// If this timeout is reached, it might imply that the target IP is unreachable const defaultHTTPSRequestTimeoutSeconds = 30 -const ( - StartDBCmd CmdType = iota - StartNodeCmd - CreateDBCmd -) - -type CmdType int - -func (cmd CmdType) String() string { - switch cmd { - case StartDBCmd: - return "start_db" - case StartNodeCmd: - return "start_node" - case CreateDBCmd: - return "create_db" - } - return "unknown_operation" -} type httpsPollNodeStateOp struct { opBase opHTTPSBase currentHost string - // The timeout for the entire operation + // The timeout for the entire operation (polling) timeout int // The timeout for each http request. Requests will be repeated if timeout hasn't been exceeded. httpRequestTimeout int @@ -75,21 +55,9 @@ func makeHTTPSPollNodeStateOpHelper(hosts []string, } op.userName = userName op.httpsPassword = httpsPassword - return op, nil } -func makeHTTPSPollNodeStateOpWithTimeoutAndCommand(hosts []string, - useHTTPPassword bool, userName string, httpsPassword *string, - timeout int, cmdType CmdType) (httpsPollNodeStateOp, error) { - op, err := makeHTTPSPollNodeStateOpHelper(hosts, useHTTPPassword, userName, httpsPassword) - if err != nil { - return op, err - } - op.timeout = timeout - op.cmdType = cmdType - return op, nil -} func makeHTTPSPollNodeStateDownOp(hosts []string, useHTTPPassword bool, userName string, httpsPassword *string) (httpsPollNodeStateOp, error) { @@ -97,30 +65,27 @@ func makeHTTPSPollNodeStateDownOp(hosts []string, if err != nil { return op, err } - timeoutSecondStr := util.GetEnv("NODE_STATE_POLLING_TIMEOUT", strconv.Itoa(StartupPollingTimeout)) - timeoutSecond, err := strconv.Atoi(timeoutSecondStr) - if err != nil { - return httpsPollNodeStateOp{}, err - } - op.timeout = timeoutSecond + op.timeout = util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", StartupPollingTimeout) op.checkDown = true op.description = fmt.Sprintf("Wait for %d node(s) to go DOWN", len(hosts)) return op, nil } + func makeHTTPSPollNodeStateOp(hosts 
[]string, useHTTPPassword bool, userName string, - httpsPassword *string) (httpsPollNodeStateOp, error) { + httpsPassword *string, timeout int) (httpsPollNodeStateOp, error) { op, err := makeHTTPSPollNodeStateOpHelper(hosts, useHTTPPassword, userName, httpsPassword) if err != nil { return op, err } - timeoutSecondStr := util.GetEnv("NODE_STATE_POLLING_TIMEOUT", strconv.Itoa(StartupPollingTimeout)) - timeoutSecond, err := strconv.Atoi(timeoutSecondStr) - if err != nil { - return httpsPollNodeStateOp{}, err + + if timeout == 0 { + // using default value + op.timeout = util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", StartupPollingTimeout) + } else { + op.timeout = timeout } - op.timeout = timeoutSecond - return op, nil + return op, err } func (op *httpsPollNodeStateOp) getPollingTimeout() int { @@ -195,13 +160,12 @@ func (op *httpsPollNodeStateOp) shouldStopPolling() (bool, error) { // If we find the wrong password for the HTTPS service on any hosts, we should fail immediately. // We also need to let user know to wait until all nodes are up if result.isPasswordAndCertificateError(op.logger) { - switch op.cmdType { - case StartDBCmd, StartNodeCmd: + if op.cmdType == StartDBCmd || op.cmdType == StartNodeCmd { op.logger.PrintError("[%s] The credentials are incorrect. 
'Catalog Sync' will not be executed.", op.name) return false, fmt.Errorf("[%s] wrong password/certificate for https service on host %s, but the nodes' startup have been in progress."+ "Please use vsql to check the nodes' status and manually run sync_catalog vsql command 'select sync_catalog()'", op.name, host) - case CreateDBCmd: + } else if op.cmdType == CreateDBCmd { return true, fmt.Errorf("[%s] wrong password/certificate for https service on host %s", op.name, host) } diff --git a/vclusterops/https_poll_node_state_op_test.go b/vclusterops/https_poll_node_state_op_test.go index 9c1089d..443f09a 100644 --- a/vclusterops/https_poll_node_state_op_test.go +++ b/vclusterops/https_poll_node_state_op_test.go @@ -31,7 +31,7 @@ func TestTimeoutErrorCase(t *testing.T) { password := "testPwd" // Intentionally pick a low http request timeout to speed up the test. const httpRequestTimeoutForTest = 3 - httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(hosts, true, username, &password) + httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(hosts, true, username, &password, 0) assert.Nil(t, err) httpsPollNodeStateOp.httpRequestTimeout = httpRequestTimeoutForTest instructions = append(instructions, &httpsPollNodeStateOp) @@ -45,13 +45,13 @@ func TestTimeoutErrorCase(t *testing.T) { // negative timeout value for the op (treated as 0, means no polling) instructions = make([]clusterOp, 0) - httpsPollNodeStateOp, err = makeHTTPSPollNodeStateOpWithTimeoutAndCommand(hosts, true, username, &password, - -100, CreateDBCmd) + httpsPollNodeStateOp, err = makeHTTPSPollNodeStateOp(hosts, true, username, &password, -100) + httpsPollNodeStateOp.cmdType = CreateDBCmd assert.Nil(t, err) httpsPollNodeStateOp.httpRequestTimeout = httpRequestTimeoutForTest instructions = append(instructions, &httpsPollNodeStateOp) clusterOpEngine = makeClusterOpEngine(instructions, &certs) err = clusterOpEngine.run(vlog.Printer{}) - // no polling is done, directly error out - assert.ErrorContains(t, err, 
"reached polling timeout of 0 seconds") + // negative value means no polling timeout + assert.ErrorContains(t, err, "[HTTPSPollNodeStateOp] cannot connect to host") } diff --git a/vclusterops/https_startup_command_op.go b/vclusterops/https_startup_command_op.go index 0fd1835..2a9c432 100644 --- a/vclusterops/https_startup_command_op.go +++ b/vclusterops/https_startup_command_op.go @@ -30,7 +30,7 @@ type httpsStartUpCommandOp struct { opBase opHTTPSBase vdb *VCoordinationDatabase - cmdType CommandType + cmdType CmdType sandbox string } @@ -62,7 +62,7 @@ func makeHTTPSStartUpCommandOpAfterUnsandbox(useHTTPPassword bool, userName stri op.name = startupOp op.description = startNodeAfterUnsandboxDesc op.useHTTPPassword = useHTTPPassword - op.cmdType = UnsandboxCmd + op.cmdType = UnsandboxSCCmd op.sandbox = util.MainClusterSandbox if useHTTPPassword { @@ -120,7 +120,7 @@ func (op *httpsStartUpCommandOp) setupClusterHTTPRequest(hosts []string) error { func (op *httpsStartUpCommandOp) prepare(execContext *opEngineExecContext) error { // Use the /v1/startup/command endpoint for a primary Up host to view every start command of existing nodes // With sandboxes in a cluster, we need to ensure that we pick a main cluster UP host - if op.cmdType == UnsandboxCmd { + if op.cmdType == UnsandboxSCCmd { for h, sb := range execContext.upHostsToSandboxes { if sb == "" { op.hosts = append(op.hosts, h) diff --git a/vclusterops/https_sync_catalog_op.go b/vclusterops/https_sync_catalog_op.go index 9f1909d..548a424 100644 --- a/vclusterops/https_sync_catalog_op.go +++ b/vclusterops/https_sync_catalog_op.go @@ -23,26 +23,14 @@ import ( "github.com/vertica/vcluster/vclusterops/util" ) -type SyncCatCmdType int - -const ( - CreateDBSyncCat SyncCatCmdType = iota - StartDBSyncCat - StopDBSyncCat - StopSCSyncCat - AddNodeSyncCat - StartNodeSyncCat - RemoveNodeSyncCat -) - type httpsSyncCatalogOp struct { opBase opHTTPSBase - cmdType SyncCatCmdType + cmdType CmdType } func 
makeHTTPSSyncCatalogOp(hosts []string, useHTTPPassword bool, - userName string, httpsPassword *string, cmdType SyncCatCmdType) (httpsSyncCatalogOp, error) { + userName string, httpsPassword *string, cmdType CmdType) (httpsSyncCatalogOp, error) { op := httpsSyncCatalogOp{} op.name = "HTTPSSyncCatalogOp" op.description = "Synchronize catalog with communal storage" @@ -61,7 +49,7 @@ func makeHTTPSSyncCatalogOp(hosts []string, useHTTPPassword bool, } func makeHTTPSSyncCatalogOpWithoutHosts(useHTTPPassword bool, - userName string, httpsPassword *string, cmdType SyncCatCmdType) (httpsSyncCatalogOp, error) { + userName string, httpsPassword *string, cmdType CmdType) (httpsSyncCatalogOp, error) { return makeHTTPSSyncCatalogOp(nil, useHTTPPassword, userName, httpsPassword, cmdType) } diff --git a/vclusterops/install_packages.go b/vclusterops/install_packages.go index 60902f0..6502a70 100644 --- a/vclusterops/install_packages.go +++ b/vclusterops/install_packages.go @@ -37,7 +37,7 @@ func VInstallPackagesOptionsFactory() VInstallPackagesOptions { } func (options *VInstallPackagesOptions) validateParseOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandInstallPackages, logger) + err := options.validateBaseOptions(InstallPackagesCmd, logger) if err != nil { return err } @@ -121,7 +121,7 @@ func (vcc *VClusterCommands) produceInstallPackagesInstructions(opts *VInstallPa } httpsGetUpNodesOp, err := makeHTTPSGetUpNodesOp(opts.DBName, opts.Hosts, - usePassword, opts.UserName, opts.Password, InstallPackageCmd) + usePassword, opts.UserName, opts.Password, InstallPackagesCmd) if err != nil { return nil, nil, err } diff --git a/vclusterops/manage_connection_draining.go b/vclusterops/manage_connection_draining.go index 232f842..12de6d9 100644 --- a/vclusterops/manage_connection_draining.go +++ b/vclusterops/manage_connection_draining.go @@ -70,12 +70,12 @@ func (opt *VManageConnectionDrainingOptions) validateParseOptions(logger vlog.Pr return err } - err = 
opt.validateBaseOptions(commandManageConnectionDraining, logger) + err = opt.validateBaseOptions(ManageConnectionDrainingCmd, logger) if err != nil { return err } - err = opt.validateAuthOptions(commandManageConnectionDraining, logger) + err = opt.validateAuthOptions(ManageConnectionDrainingCmd.CmdString(), logger) if err != nil { return err } diff --git a/vclusterops/nma_vertica_version_op.go b/vclusterops/nma_vertica_version_op.go index 36a821e..b38bd8d 100644 --- a/vclusterops/nma_vertica_version_op.go +++ b/vclusterops/nma_vertica_version_op.go @@ -41,6 +41,7 @@ type nmaVerticaVersionOp struct { scName string readOnly bool targetNodeIPs []string // used to filter desired nodes' info + reachableHosts []string // hosts that are reachable through NMA } func makeHostVersionMap() hostVersionMap { @@ -78,10 +79,12 @@ func makeNMAReadVerticaVersionOp(vdb *VCoordinationDatabase) nmaVerticaVersionOp // makeNMAVerticaVersionOpWithTargetHosts is used in start_db, VCluster will check Vertica // version for the subclusters which contain target hosts -func makeNMAVerticaVersionOpWithTargetHosts(sameVersion bool, hosts []string) nmaVerticaVersionOp { +func makeNMAVerticaVersionOpWithTargetHosts(sameVersion bool, targetNodeIPs []string) nmaVerticaVersionOp { // We set hosts to nil and isEon to false temporarily, and they will get the correct value from execute context in prepare() op := makeNMACheckVerticaVersionOp(nil /*hosts*/, sameVersion, false /*isEon*/) - op.targetNodeIPs = hosts + op.targetNodeIPs = targetNodeIPs + // start_db target all reachable input hosts + op.reachableHosts = targetNodeIPs return op } @@ -104,9 +107,10 @@ func makeNMAVerticaVersionOpWithVDB(sameVersion bool, vdb *VCoordinationDatabase // makeNMAVerticaVersionOpBeforeStartNode is used in start_node, VCluster will check Vertica // version for the nodes which are in the same cluster(main cluster or sandbox) as the target hosts -func makeNMAVerticaVersionOpBeforeStartNode(vdb *VCoordinationDatabase, 
hosts []string) nmaVerticaVersionOp { - op := makeNMACheckVerticaVersionOp(nil /*hosts*/, true /*sameVersion*/, vdb.IsEon) - op.targetNodeIPs = hosts +func makeNMAVerticaVersionOpBeforeStartNode(vdb *VCoordinationDatabase, reachableHosts, targetNodeIPs []string) nmaVerticaVersionOp { + op := makeNMACheckVerticaVersionOp(nil, true /*sameVersion*/, vdb.IsEon) + op.reachableHosts = reachableHosts + op.targetNodeIPs = targetNodeIPs op.vdb = vdb return op } @@ -342,8 +346,9 @@ func (op *nmaVerticaVersionOp) prepareHostNodeMap(execContext *opEngineExecConte if err != nil { return hostNodeMap, err } + allReachableHostsInTargetSCs := util.SliceCommon(allHostsInTargetSCs, op.reachableHosts) // get host-node map for all hosts in target subclusters - hostNodeMap = util.FilterMapByKey(execContext.nmaVDatabase.HostNodeMap, allHostsInTargetSCs) + hostNodeMap = util.FilterMapByKey(execContext.nmaVDatabase.HostNodeMap, allReachableHostsInTargetSCs) } return hostNodeMap, nil } @@ -365,8 +370,9 @@ func (op *nmaVerticaVersionOp) prepareHostNodeMapWithVDB() (vHostNodeMap, error) if err != nil { return hostNodeMap, err } + allReachableHostsInTargetSCs := util.SliceCommon(allHostsInTargetSCs, op.reachableHosts) // get host-node map for all hosts in target subclusters - hostNodeMap = util.FilterMapByKey(op.vdb.HostNodeMap, allHostsInTargetSCs) + hostNodeMap = util.FilterMapByKey(op.vdb.HostNodeMap, allReachableHostsInTargetSCs) return hostNodeMap, nil } diff --git a/vclusterops/promote_sandbox_to_main.go b/vclusterops/promote_sandbox_to_main.go index 70b106f..5cc046f 100644 --- a/vclusterops/promote_sandbox_to_main.go +++ b/vclusterops/promote_sandbox_to_main.go @@ -54,7 +54,7 @@ func (opt *VPromoteSandboxToMainOptions) validateParseOptions(logger vlog.Printe return fmt.Errorf("must provide a password or a key-certificate pair") } - return opt.validateBaseOptions(commandPromoteSandboxToMain, logger) + return opt.validateBaseOptions(PromoteSandboxToMainCmd, logger) } // analyzeOptions 
will modify some options based on what is chosen diff --git a/vclusterops/re_ip.go b/vclusterops/re_ip.go index a4bb8ef..64465fd 100644 --- a/vclusterops/re_ip.go +++ b/vclusterops/re_ip.go @@ -50,7 +50,7 @@ func VReIPFactory() VReIPOptions { } func (options *VReIPOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandReIP, logger) + err := options.validateBaseOptions(ReIPCmd, logger) if err != nil { return err } diff --git a/vclusterops/remove_node.go b/vclusterops/remove_node.go index fbc77a2..6dc130d 100644 --- a/vclusterops/remove_node.go +++ b/vclusterops/remove_node.go @@ -31,6 +31,13 @@ type VRemoveNodeOptions struct { Initiator string // A primary up host that will be used to execute remove_node operations. ForceDelete bool // whether force delete directories IsSubcluster bool // is removing all nodes for a subcluster + // Names of the nodes that need to have active subscription. The user of vclusterOps needs + // to make sure the provided values are correct. This option will be used when some nodes + // cannot join the main cluster so we will only check the node subscription state for the nodes + // in this option. For example, after promote_sandbox, the nodes in old main cluster cannot + // join the new main cluster so we should only check the node subscription state on the nodes + // that are promoted from a sandbox. 
+ NodesToPullSubs []string } func VRemoveNodeOptionsFactory() VRemoveNodeOptions { @@ -49,7 +56,7 @@ func (options *VRemoveNodeOptions) setDefaultValues() { } func (options *VRemoveNodeOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandRemoveNode, logger) + err := options.validateBaseOptions(RemoveNodeCmd, logger) if err != nil { return err } @@ -186,6 +193,11 @@ func (vcc VClusterCommands) removeNodesInCatalog(options *VRemoveNodeOptions, vd runError) } + if len(clusterOpEngine.execContext.unreachableHosts) > 0 { + vcc.DisplayInfo("Hint: please manually clean up directories in the unreachable host(s) %v", + clusterOpEngine.execContext.unreachableHosts) + } + // we return a vdb that contains only the remaining hosts return vdb.copy(remainingHosts), nil } @@ -356,11 +368,15 @@ func (vcc VClusterCommands) produceRemoveNodeInstructions(vdb *VCoordinationData return instructions, err } - // for Eon DB, we check whethter all subscriptions are ACTIVE after rebalance shards + // for Eon DB, we check whether all subscriptions are ACTIVE after rebalance shards // Sandboxed nodes cannot be removed, so even if the database has sandboxes, // polling subscriptions for the main cluster is enough var nodesToPollSubs []string - getMainClusterNodes(vdb, options, &nodesToPollSubs) + if len(options.NodesToPullSubs) > 0 { + nodesToPollSubs = options.NodesToPullSubs + } else { + getMainClusterNodes(vdb, options, &nodesToPollSubs) + } httpsPollSubscriptionStateOp, e := makeHTTPSPollSubscriptionStateOp(initiatorHost, usePassword, username, password, &nodesToPollSubs) @@ -400,11 +416,12 @@ func (vcc VClusterCommands) produceRemoveNodeInstructions(vdb *VCoordinationData } instructions = append(instructions, &httpsReloadSpreadOp) + nmaHealthOp := makeNMAHealthOpSkipUnreachable(v.HostList) nmaDeleteDirectoriesOp, err := makeNMADeleteDirectoriesOp(&v, options.ForceDelete) if err != nil { return instructions, err } - instructions = 
append(instructions, &nmaDeleteDirectoriesOp) + instructions = append(instructions, &nmaHealthOp, &nmaDeleteDirectoriesOp) if vdb.IsEon { httpsSyncCatalogOp, err := makeHTTPSSyncCatalogOp(initiatorHost, true, username, password, RemoveNodeSyncCat) diff --git a/vclusterops/remove_subcluster.go b/vclusterops/remove_subcluster.go index b4cf83d..a0bc811 100644 --- a/vclusterops/remove_subcluster.go +++ b/vclusterops/remove_subcluster.go @@ -37,6 +37,13 @@ type VRemoveScOptions struct { // A primary up host in another subcluster that belongs to same cluster as the target subcluster. // This option will be used to do re-ip in the cluster. PrimaryUpHost string + // Names of the nodes that need to have active subscription. The user of vclusterOps needs + // to make sure the provided values are correct. This option will be used when some nodes + // cannot join the main cluster so we will only check the node subscription state for the nodes + // in this option. For example, after promote_sandbox, the nodes in old main cluster cannot + // join the new main cluster so we should only check the node subscription state on the nodes + // that are promoted from a sandbox. 
+ NodesToPullSubs []string } func VRemoveScOptionsFactory() VRemoveScOptions { @@ -54,7 +61,7 @@ func (options *VRemoveScOptions) setDefaultValues() { } func (options *VRemoveScOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandRemoveSubcluster, logger) + err := options.validateBaseOptions(RemoveSubclusterCmd, logger) if err != nil { return err } @@ -191,6 +198,7 @@ func (vcc VClusterCommands) VRemoveSubcluster(removeScOpt *VRemoveScOptions) (VC removeNodeOpt.HostsToRemove = hostsToRemove removeNodeOpt.ForceDelete = removeScOpt.ForceDelete removeNodeOpt.IsSubcluster = true + removeNodeOpt.NodesToPullSubs = removeScOpt.NodesToPullSubs vcc.Log.PrintInfo("Removing nodes %q from subcluster %s", hostsToRemove, removeScOpt.SCName) diff --git a/vclusterops/rename_subcluster.go b/vclusterops/rename_subcluster.go index c3522db..bbfca2b 100644 --- a/vclusterops/rename_subcluster.go +++ b/vclusterops/rename_subcluster.go @@ -55,7 +55,7 @@ func (options *VRenameSubclusterOptions) validateParseOptions(logger vlog.Printe return err } - err = options.validateAuthOptions(commandRenameSc, logger) + err = options.validateAuthOptions(RenameScCmd.CmdString(), logger) if err != nil { return err } @@ -77,7 +77,7 @@ func (options *VRenameSubclusterOptions) validateParseOptions(logger vlog.Printe if err != nil { return err } - return options.validateBaseOptions(commandRenameSc, logger) + return options.validateBaseOptions(RenameScCmd, logger) } // analyzeOptions will modify some options based on what is chosen diff --git a/vclusterops/replication.go b/vclusterops/replication.go index b70c20e..e7c21eb 100644 --- a/vclusterops/replication.go +++ b/vclusterops/replication.go @@ -44,7 +44,7 @@ func VReplicationDatabaseFactory() VReplicationDatabaseOptions { } func (options *VReplicationDatabaseOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandReplicationStart, logger) + err := 
options.validateBaseOptions(ReplicationStartCmd, logger) if err != nil { return err } @@ -103,7 +103,7 @@ func (options *VReplicationDatabaseOptions) validateParseOptions(logger vlog.Pri } // batch 3: validate auth params - err = options.validateAuthOptions(commandReplicationStart, logger) + err = options.validateAuthOptions(ReplicationStartCmd.CmdString(), logger) if err != nil { return err } diff --git a/vclusterops/restore_points.go b/vclusterops/restore_points.go index 780fdcf..a087d6d 100644 --- a/vclusterops/restore_points.go +++ b/vclusterops/restore_points.go @@ -102,7 +102,7 @@ func (options *ShowRestorePointFilterOptions) ValidateAndStandardizeTimestampsIf } func (options *VShowRestorePointsOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandShowRestorePoints, logger) + err := options.validateBaseOptions(ShowRestorePointsCmd, logger) if err != nil { return err } diff --git a/vclusterops/sandbox.go b/vclusterops/sandbox.go index d331c38..c42ea28 100644 --- a/vclusterops/sandbox.go +++ b/vclusterops/sandbox.go @@ -55,7 +55,7 @@ func (options *VSandboxOptions) setDefaultValues() { } func (options *VSandboxOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandSandboxSC, logger) + err := options.validateBaseOptions(SandboxSCCmd, logger) if err != nil { return err } @@ -148,7 +148,7 @@ func (vcc *VClusterCommands) produceSandboxSubclusterInstructions(options *VSand // Get all up nodes httpsGetUpNodesOp, err := makeHTTPSGetUpScNodesOp(options.DBName, options.Hosts, - usePassword, username, options.Password, SandboxCmd, options.SCName) + usePassword, username, options.Password, SandboxSCCmd, options.SCName) if err != nil { return instructions, err } diff --git a/vclusterops/scrutinize.go b/vclusterops/scrutinize.go index fa492b2..20f636c 100644 --- a/vclusterops/scrutinize.go +++ b/vclusterops/scrutinize.go @@ -26,9 +26,6 @@ import ( 
"github.com/vertica/vcluster/vclusterops/vlog" ) -// const to sync cmd, options parsing, and this -const VScrutinizeTypeName = "scrutinize" - // files and folders used by scrutinize const ScrutinizeOutputBasePath = "/tmp/scrutinize" const scrutinizeRemoteOutputPath = ScrutinizeOutputBasePath + "/remote" @@ -144,7 +141,7 @@ func (options *VScrutinizeOptions) getHoursAgo(timeString, timeVarName string, l func (options *VScrutinizeOptions) validateRequiredOptions(logger vlog.Printer) error { // checks for correctness, but not for presence of all flags - err := options.validateBaseOptions(VScrutinizeTypeName, logger) + err := options.validateBaseOptions(ScrutinizeCmd, logger) if err != nil { return err } diff --git a/vclusterops/set_config_parameter.go b/vclusterops/set_config_parameter.go index 0c21fbf..89b32ce 100644 --- a/vclusterops/set_config_parameter.go +++ b/vclusterops/set_config_parameter.go @@ -44,12 +44,12 @@ func VSetConfigurationParameterOptionsFactory() VSetConfigurationParameterOption } func (opt *VSetConfigurationParameterOptions) validateParseOptions(logger vlog.Printer) error { - err := opt.validateBaseOptions(commandSetConfigurationParameter, logger) + err := opt.validateBaseOptions(SetConfigurationParameterCmd, logger) if err != nil { return err } - err = opt.validateAuthOptions(commandSetConfigurationParameter, logger) + err = opt.validateAuthOptions(SetConfigurationParameterCmd.CmdString(), logger) if err != nil { return err } @@ -139,7 +139,7 @@ func (vcc VClusterCommands) produceSetConfigurationParameterInstructions( // up hosts will be filtered by sandbox name in prepare stage of nmaSetConfigurationParameterOp httpsGetUpNodesOp, err := makeHTTPSGetUpNodesWithSandboxOp(options.DBName, options.Hosts, options.usePassword, options.UserName, options.Password, - SetConfigurationParametersCmd, options.Sandbox, assertMainClusterUpNodes) + SetConfigurationParameterCmd, options.Sandbox, assertMainClusterUpNodes) if err != nil { return instructions, err 
} diff --git a/vclusterops/start_db.go b/vclusterops/start_db.go index 3266c3c..c9a3f89 100644 --- a/vclusterops/start_db.go +++ b/vclusterops/start_db.go @@ -49,6 +49,9 @@ type VStartDatabaseOptions struct { // whether the first time to start the database after revive FirstStartAfterRevive bool + + // whether input info is read from vcluster config file, used for quorum check + ReadFromConfig bool } func VStartDatabaseOptionsFactory() VStartDatabaseOptions { @@ -67,7 +70,7 @@ func (options *VStartDatabaseOptions) setDefaultValues() { } func (options *VStartDatabaseOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandStartDB, logger) + err := options.validateBaseOptions(StartDBCmd, logger) if err != nil { return err } @@ -152,9 +155,10 @@ func (vcc VClusterCommands) VStartDatabase(options *VStartDatabaseOptions) (vdbP vcc.Log.PrintWarning("communal storage location is not specified" + warningMsg) } } + numTotalNodes := len(options.Hosts) // start_db pre-checks and get basic info - err = vcc.runStartDBPrecheck(options, &vdb) + err = vcc.runStartDBPrecheck(options, &vdb, numTotalNodes) if err != nil { return nil, err } @@ -185,7 +189,18 @@ func (vcc VClusterCommands) VStartDatabase(options *VStartDatabaseOptions) (vdbP return &updatedVDB, nil } -func (vcc VClusterCommands) runStartDBPrecheck(options *VStartDatabaseOptions, vdb *VCoordinationDatabase) error { +func (vcc VClusterCommands) runStartDBPrecheck(options *VStartDatabaseOptions, vdb *VCoordinationDatabase, numTotalNodes int) error { + // filter out unreachable hosts + unreachableHosts, err := vcc.getUnreachableHosts(&options.DatabaseOptions) + if err != nil { + return err + } + // if it's eon mode and there are unreachable hosts, we cannot perform quorum check due to missing primary node information + // error out here with hint + if options.IsEon && len(unreachableHosts) > 0 { + return fmt.Errorf("cannot start db with unreachable hosts, please check 
cluster and NMA connectivity on unreachable hosts") + } + options.Hosts = util.SliceDiff(options.Hosts, unreachableHosts) // pre-instruction to perform basic checks and get basic information preInstructions, err := vcc.produceStartDBPreCheck(options, vdb, options.TrimHostList) if err != nil { @@ -207,6 +222,14 @@ func (vcc VClusterCommands) runStartDBPrecheck(options *VStartDatabaseOptions, v options.Hosts = vcc.removeHostsNotInCatalog(&clusterOpEngine.execContext.nmaVDatabase, options.Hosts) } + // Quorum Check + if options.ReadFromConfig && !options.IsEon { + err = vcc.quorumCheck(numTotalNodes, len(options.Hosts)) + if err != nil { + return fmt.Errorf("fail to start database pre-checks: %w", err) + } + } + return nil } @@ -244,7 +267,6 @@ func (vcc VClusterCommands) produceStartDBPreCheck(options *VStartDatabaseOption trimHostList bool) ([]clusterOp, error) { var instructions []clusterOp - nmaHealthOp := makeNMAHealthOp(options.Hosts) // need username for https operations err := options.setUsePasswordAndValidateUsernameIfNeeded(vcc.Log) if err != nil { @@ -256,10 +278,7 @@ func (vcc VClusterCommands) produceStartDBPreCheck(options *VStartDatabaseOption if err != nil { return instructions, err } - instructions = append(instructions, - &nmaHealthOp, - &checkDBRunningOp, - ) + instructions = append(instructions, &checkDBRunningOp) // when we cannot get db info from cluster_config.json, we will fetch it from NMA /nodes endpoint. 
if len(vdb.HostNodeMap) == 0 { @@ -323,11 +342,12 @@ func (vcc VClusterCommands) produceStartDBInstructions(options *VStartDatabaseOp nil /*db configurations retrieved from a running db*/) nmaStartNewNodesOp := makeNMAStartNodeOp(options.Hosts, options.StartUpConf) - httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeoutAndCommand(options.Hosts, - options.usePassword, options.UserName, options.Password, options.StatePollingTimeout, StartDBCmd) + httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(options.Hosts, + options.usePassword, options.UserName, options.Password, options.StatePollingTimeout) if err != nil { return instructions, err } + httpsPollNodeStateOp.cmdType = StartDBCmd instructions = append(instructions, &nmaStartNewNodesOp, @@ -350,3 +370,12 @@ func (vcc VClusterCommands) setOrRotateEncryptionKey(keyType string) clusterOp { op := makeNMASpreadSecurityOp(vcc.Log, keyType) return &op } + +func (vcc VClusterCommands) quorumCheck(numPrimaryNodes, numReachableHosts int) error { + minimumNodesForQuorum := numPrimaryNodes/2 + 1 + if numReachableHosts < minimumNodesForQuorum { + return fmt.Errorf("quorum not satisfied, number of reachable nodes %d < minimum %d of %d primary nodes", + numReachableHosts, minimumNodesForQuorum, numPrimaryNodes) + } + return nil +} diff --git a/vclusterops/start_node.go b/vclusterops/start_node.go index 1cbe0a7..efccd45 100644 --- a/vclusterops/start_node.go +++ b/vclusterops/start_node.go @@ -68,12 +68,13 @@ func VStartNodesOptionsFactory() VStartNodesOptions { func (options *VStartNodesOptions) setDefaultValues() { options.DatabaseOptions.setDefaultValues() // set default value to StatePollingTimeout - options.StatePollingTimeout = util.DefaultStatePollingTimeout + options.StatePollingTimeout = util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", util.DefaultStatePollingTimeout) + options.Nodes = make(map[string]string) } func (options *VStartNodesOptions) validateRequiredOptions(logger vlog.Printer) error { - err := 
options.validateBaseOptions(commandStartNode, logger) + err := options.validateBaseOptions(StartNodeCmd, logger) if err != nil { return err } @@ -149,56 +150,88 @@ func (vcc VClusterCommands) startNodePreCheck(vdb *VCoordinationDatabase, option return nil } -// VStartNodes starts the given nodes for a cluster that has not yet lost -// cluster quorum. Returns any error encountered. If necessary, it updates the -// node's IP in the Vertica catalog. If cluster quorum is already lost, use -// VStartDatabase. It will skip any nodes given that no longer exist in the -// catalog. -func (vcc VClusterCommands) VStartNodes(options *VStartNodesOptions) error { - /* - * - Produce Instructions - * - Create a VClusterOpEngine - * - Give the instructions to the VClusterOpEngine to run - */ - - // validate and analyze options - err := options.validateAnalyzeOptions(vcc.Log) +func (vcc VClusterCommands) removeUnreachableHosts(options *VStartNodesOptions) error { + unreachableHosts, err := vcc.getUnreachableHosts(&options.DatabaseOptions) if err != nil { return err } + options.Hosts = util.SliceDiff(options.Hosts, unreachableHosts) + for _, unreachableHost := range unreachableHosts { + for name, val := range options.Nodes { + if val == unreachableHost { + delete(options.Nodes, name) + } + } + } + return nil +} +func (vcc VClusterCommands) preStartNodeCheck(options *VStartNodesOptions, vdb *VCoordinationDatabase, + hostNodeNameMap map[string]string, startNodeInfo *VStartNodesInfo) error { // retrieve database information to execute the command so we do not always rely on some user input // if VStartNodes is called from VStartSubcluster, we can reuse the vdb from VStartSubcluster - vdb := makeVCoordinationDatabase() if options.vdb == nil { - err = vcc.getVDBFromRunningDBIncludeSandbox(&vdb, &options.DatabaseOptions, AnySandbox) + err := vcc.getVDBFromRunningDBIncludeSandbox(vdb, &options.DatabaseOptions, AnySandbox) if err != nil { return err } - } else { - vdb = *options.vdb } - 
hostNodeNameMap := make(map[string]string) - startNodeInfo := new(VStartNodesInfo) for _, vnode := range vdb.HostNodeMap { hostNodeNameMap[vnode.Name] = vnode.Address } // precheck to make sure the nodes to start are either all sandboxed nodes in one sandbox or all main cluster nodes - err = vcc.startNodePreCheck(&vdb, options, hostNodeNameMap, startNodeInfo) + err := vcc.startNodePreCheck(vdb, options, hostNodeNameMap, startNodeInfo) if err != nil { return err } // sandboxes may have different catalog from the main cluster, update the vdb build from the sandbox of the nodes to start - err = vcc.getVDBFromRunningDBIncludeSandbox(&vdb, &options.DatabaseOptions, startNodeInfo.Sandbox) + err = vcc.getVDBFromRunningDBIncludeSandbox(vdb, &options.DatabaseOptions, startNodeInfo.Sandbox) if err != nil { if startNodeInfo.Sandbox != util.MainClusterSandbox { return errors.Join(err, fmt.Errorf("hint: make sure there is at least one UP node in the sandbox %s", startNodeInfo.Sandbox)) } return errors.Join(err, fmt.Errorf("hint: make sure there is at least one UP node in the database")) } + return nil +} + +// VStartNodes starts the given nodes for a cluster that has not yet lost +// cluster quorum. Returns any error encountered. If necessary, it updates the +// node's IP in the Vertica catalog. If cluster quorum is already lost, use +// VStartDatabase. It will skip any nodes given that no longer exist in the +// catalog. 
+func (vcc VClusterCommands) VStartNodes(options *VStartNodesOptions) error { + /* + * - Produce Instructions + * - Create a VClusterOpEngine + * - Give the instructions to the VClusterOpEngine to run + */ + + // validate and analyze options + err := options.validateAnalyzeOptions(vcc.Log) + if err != nil { + return err + } + + err = vcc.removeUnreachableHosts(options) + if err != nil || len(options.Nodes) == 0 { + return err + } + + vdb := makeVCoordinationDatabase() + if options.vdb != nil { + vdb = *options.vdb + } + hostNodeNameMap := make(map[string]string) + startNodeInfo := new(VStartNodesInfo) + + err = vcc.preStartNodeCheck(options, &vdb, hostNodeNameMap, startNodeInfo) + if err != nil { + return err + } // find out hosts // - that need to re-ip, and @@ -215,9 +248,7 @@ func (vcc VClusterCommands) VStartNodes(options *VStartNodesOptions) error { // if none of them is down and no other nodes to re-ip, // we will early stop as there is no need to start them if !startNodeInfo.hasDownNodeNoNeedToReIP && len(startNodeInfo.ReIPList) == 0 { - const msg = "The provided nodes are either not in catalog or already up. There is nothing to start." - fmt.Println(msg) - vcc.Log.Info(msg) + vcc.Log.DisplayInfo("The provided nodes are either not in catalog or already up. There is nothing to start.") return nil } @@ -228,9 +259,7 @@ func (vcc VClusterCommands) VStartNodes(options *VStartNodesOptions) error { // If no nodes found to start. We can simply exit here. This can happen if // given a list of nodes that aren't in the catalog any longer. if len(startNodeInfo.HostsToStart) == 0 { - const msg = "None of the nodes provided are in the catalog. There is nothing to start." - fmt.Println(msg) - vcc.Log.Info(msg) + vcc.Log.DisplayInfo("None of the nodes provided are in the catalog. 
There is nothing to start.") return nil } @@ -311,7 +340,7 @@ func (vcc VClusterCommands) produceStartNodesInstructions(startNodeInfo *VStartN } httpsGetUpNodesOp, err := makeHTTPSGetUpNodesOp(options.DBName, options.Hosts, - options.usePassword, options.UserName, options.Password, StartNodeCommand) + options.usePassword, options.UserName, options.Password, StartNodeCmd) if err != nil { return instructions, err } @@ -350,7 +379,7 @@ func (vcc VClusterCommands) produceStartNodesInstructions(startNodeInfo *VStartN } // require to have the same vertica version - nmaVerticaVersionOp := makeNMAVerticaVersionOpBeforeStartNode(vdb, startNodeInfo.HostsToStart) + nmaVerticaVersionOp := makeNMAVerticaVersionOpBeforeStartNode(vdb, options.Hosts, startNodeInfo.HostsToStart) instructions = append(instructions, &nmaVerticaVersionOp) // The second parameter (sourceConfHost) in produceTransferConfigOps is set to a nil value in the upload and download step @@ -370,12 +399,12 @@ func (vcc VClusterCommands) produceStartNodesInstructions(startNodeInfo *VStartN } nmaStartNewNodesOp := makeNMAStartNodeOpWithVDB(startNodeInfo.HostsToStart, options.StartUpConf, vdb) - httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeoutAndCommand(startNodeInfo.HostsToStart, - options.usePassword, options.UserName, options.Password, options.StatePollingTimeout, StartNodeCmd) + httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(startNodeInfo.HostsToStart, + options.usePassword, options.UserName, options.Password, options.StatePollingTimeout) if err != nil { return instructions, err } - + httpsPollNodeStateOp.cmdType = StartNodeCmd instructions = append(instructions, &httpsRestartUpCommandOp, &nmaStartNewNodesOp, diff --git a/vclusterops/start_subcluster.go b/vclusterops/start_subcluster.go index 344f05b..9c58566 100644 --- a/vclusterops/start_subcluster.go +++ b/vclusterops/start_subcluster.go @@ -45,7 +45,7 @@ func (options *VStartScOptions) setDefaultValues() { } func (options 
*VStartScOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandStartSubcluster, logger) + err := options.validateBaseOptions(StartSubclusterCmd, logger) if err != nil { return err } @@ -140,13 +140,12 @@ func (vcc VClusterCommands) VStartSubcluster(options *VStartScOptions) error { options.SCName) } - var startNodesOptions VStartNodesOptions - startNodesOptions.Nodes = nodesToStart - startNodesOptions.DatabaseOptions = options.DatabaseOptions - startNodesOptions.StatePollingTimeout = options.StatePollingTimeout - startNodesOptions.vdb = &vdb + options.VStartNodesOptions.Nodes = nodesToStart + options.VStartNodesOptions.DatabaseOptions = options.DatabaseOptions + options.VStartNodesOptions.StatePollingTimeout = options.StatePollingTimeout + options.VStartNodesOptions.vdb = &vdb vlog.DisplayColorInfo("Starting nodes %v in subcluster %s", maps.Keys(nodesToStart), options.SCName) - return vcc.VStartNodes(&startNodesOptions) + return vcc.VStartNodes(&options.VStartNodesOptions) } diff --git a/vclusterops/state_poller.go b/vclusterops/state_poller.go index e5d5aae..15b7404 100644 --- a/vclusterops/state_poller.go +++ b/vclusterops/state_poller.go @@ -25,6 +25,7 @@ const ( OneMinute = 60 * OneSecond StopDBTimeout = 5 * OneMinute StartupPollingTimeout = 5 * OneMinute + StopPollingTimeout = 5 * OneMinute ScrutinizePollingTimeout = -1 * OneMinute // no timeout PollingInterval = 3 * OneSecond ) @@ -43,7 +44,7 @@ func pollState(poller statePoller, execContext *opEngineExecContext) error { duration := time.Duration(timeout) * time.Second count := 0 needTimeout := true - if timeout < 0 { + if timeout <= 0 { needTimeout = false } diff --git a/vclusterops/stop_db.go b/vclusterops/stop_db.go index c283506..e5509ea 100644 --- a/vclusterops/stop_db.go +++ b/vclusterops/stop_db.go @@ -48,7 +48,7 @@ func (options *VStopDatabaseOptions) setDefaultValues() { } func (options *VStopDatabaseOptions) validateRequiredOptions(log 
vlog.Printer) error { - err := options.validateBaseOptions(commandStopDB, log) + err := options.validateBaseOptions(StopDBCmd, log) if err != nil { return err } diff --git a/vclusterops/stop_node.go b/vclusterops/stop_node.go index 231cf5b..888339a 100644 --- a/vclusterops/stop_node.go +++ b/vclusterops/stop_node.go @@ -28,6 +28,8 @@ type VStopNodeOptions struct { DatabaseOptions // Hosts to stop StopHosts []string + // timeout for polling the nodes we stop in httpsPollNodeStopOp + StopPollingTimeout int } func VStopNodeOptionsFactory() VStopNodeOptions { @@ -40,10 +42,12 @@ func (options *VStopNodeOptions) setDefaultValues() { options.DatabaseOptions.setDefaultValues() + // set timeout from env variable + options.StopPollingTimeout = util.GetEnvInt("NODE_STATE_POLLING_TIMEOUT", util.DefaultStatePollingTimeout) } func (options *VStopNodeOptions) validateRequiredOptions(logger vlog.Printer) error { - err := options.validateBaseOptions(commandStopNode, logger) + err := options.validateBaseOptions(StopNodeCmd, logger) if err != nil { return err } diff --git a/vclusterops/stop_subcluster.go b/vclusterops/stop_subcluster.go index 2659e27..afb6e1d 100644 --- a/vclusterops/stop_subcluster.go +++ b/vclusterops/stop_subcluster.go @@ -46,7 +46,7 @@ func (options *VStopSubclusterOptions) setDefaultValues() { } func (options *VStopSubclusterOptions) validateRequiredOptions(log vlog.Printer) error { - err := options.validateBaseOptions(commandStopSubcluster, log) + err := options.validateBaseOptions(StopSubclusterCmd, log) if err != nil { return err } diff --git a/vclusterops/unsandbox.go b/vclusterops/unsandbox.go index 23cc446..0310a3d 100644 --- a/vclusterops/unsandbox.go +++ b/vclusterops/unsandbox.go @@ -53,7 +53,7 @@ func (options *VUnsandboxOptions) setDefaultValues() { } func (options *VUnsandboxOptions) validateRequiredOptions(logger vlog.Printer) error { - err := 
options.validateBaseOptions(commandUnsandboxSC, logger) + err := options.validateBaseOptions(UnsandboxSCCmd, logger) if err != nil { return err } @@ -172,6 +172,7 @@ func (vcc *VClusterCommands) unsandboxPreCheck(vdb *VCoordinationDatabase, optio if len(mainClusterHost) == 0 { return fmt.Errorf(`require at least one UP host outside of the sandbox subcluster '%s'in the input host list`, options.SCName) } + options.SCHosts = sandboxedHosts return nil } @@ -206,10 +207,13 @@ func (vcc *VClusterCommands) produceUnsandboxSCInstructions(options *VUnsandboxO } username := options.UserName + // Check NMA health on sandbox hosts + nmaHealthOp := makeNMAHealthOp(options.SCHosts) + instructions = append(instructions, &nmaHealthOp) // Get all up nodes httpsGetUpNodesOp, err := makeHTTPSGetUpScNodesOp(options.DBName, options.Hosts, - usePassword, username, options.Password, UnsandboxCmd, options.SCName) + usePassword, username, options.Password, UnsandboxSCCmd, options.SCName) if err != nil { return instructions, err } diff --git a/vclusterops/util/util.go b/vclusterops/util/util.go index 559f452..70f4cf9 100644 --- a/vclusterops/util/util.go +++ b/vclusterops/util/util.go @@ -27,6 +27,7 @@ import ( "path/filepath" "reflect" "regexp" + "strconv" "strings" "time" @@ -379,6 +380,17 @@ func GetEnv(key, fallback string) string { return fallback } +// GetEnvInt returns the int value of the env var key, or fallback if the variable is unset or not a valid integer +func GetEnvInt(key string, fallback int) int { + if value, ok := os.LookupEnv(key); ok { + if intValue, err := strconv.Atoi(value); err == nil { + return intValue + } + // failed to retrieve env value, should use fallback value + } + return fallback +} + func CheckMissingFields(object any) error { var missingFields []string v := reflect.ValueOf(object) @@ -663,3 +675,8 @@ func IsTimeEqualOrAfter(start, end time.Time) bool { } const EmptyConfigParamErrMsg = "configuration parameter must not be empty" + +func IsK8sEnvironment() bool { + port, portSet := os.LookupEnv(kubernetesPort) + return 
portSet && port != "" +} diff --git a/vclusterops/util/util_test.go b/vclusterops/util/util_test.go index be46120..d4e6bad 100644 --- a/vclusterops/util/util_test.go +++ b/vclusterops/util/util_test.go @@ -19,6 +19,7 @@ import ( "bytes" "errors" "fmt" + "os" "testing" "github.com/stretchr/testify/assert" @@ -383,3 +384,22 @@ func TestIsEmptyOrValidTimeStr(t *testing.T) { _, err = IsEmptyOrValidTimeStr(layout, testTimeString) assert.ErrorContains(t, err, "cannot parse") } + +func TestGetEnvInt(t *testing.T) { + key := "TEST_ENV_INT" + fallback := 123 + // positive case: environment variable exists and is a valid integer + os.Setenv(key, "456") + actual := GetEnvInt(key, fallback) + assert.Equal(t, 456, actual) + + // negative case: environment variable does not exist + os.Unsetenv(key) + actual = GetEnvInt(key, fallback) + assert.Equal(t, fallback, actual) + + // negative case: environment variable exists but is not a valid integer + os.Setenv(key, "not_an_integer") + actual = GetEnvInt(key, fallback) + assert.Equal(t, fallback, actual) +} diff --git a/vclusterops/vcluster_database_options.go b/vclusterops/vcluster_database_options.go index 6599704..af49de5 100644 --- a/vclusterops/vcluster_database_options.go +++ b/vclusterops/vcluster_database_options.go @@ -84,35 +84,6 @@ const ( catalogPath = "" ) -const ( - commandCreateDB = "create_db" - commandDropDB = "drop_db" - commandStopDB = "stop_db" - commandStartDB = "start_db" - commandAddNode = "add_node" - commandRemoveNode = "remove_node" - commandStopNode = "stop_node" - commandStartNode = "start_node" - commandAddSubcluster = "add_subcluster" - commandRemoveSubcluster = "remove_subcluster" - commandStopSubcluster = "stop_subcluster" - commandStartSubcluster = "start_subcluster" - commandSandboxSC = "sandbox_subcluster" - commandUnsandboxSC = "unsandbox_subcluster" - commandShowRestorePoints = "show_restore_points" - commandInstallPackages = "install_packages" - commandConfigRecover = "manage_config_recover" - 
commandManageConnectionDraining = "manage_connection_draining" - commandSetConfigurationParameter = "set_configuration_parameter" - commandGetConfigurationParameter = "get_configuration_parameter" - commandReplicationStart = "replication_start" - commandPromoteSandboxToMain = "promote_sandbox_to_main" - commandFetchNodesDetails = "fetch_nodes_details" - commandAlterSubclusterType = "alter_subcluster_type" - commandRenameSc = "rename_subcluster" - commandReIP = "re_ip" -) - func DatabaseOptionsFactory() DatabaseOptions { opt := DatabaseOptions{} // set default values to the params @@ -125,8 +96,9 @@ func (opt *DatabaseOptions) setDefaultValues() { opt.ConfigurationParameters = make(map[string]string) } -func (opt *DatabaseOptions) validateBaseOptions(commandName string, log vlog.Printer) error { +func (opt *DatabaseOptions) validateBaseOptions(cmdType CmdType, log vlog.Printer) error { // get vcluster commands + commandName := cmdType.CmdString() log.WithName(commandName) // database name if opt.DBName == "" { @@ -151,7 +123,7 @@ func (opt *DatabaseOptions) validateBaseOptions(commandName string, log vlog.Pri // config directory // VER-91801: remove this condition once re_ip supports the config file - if !slices.Contains([]string{commandReIP}, commandName) { + if !slices.Contains([]string{ReIPCmd.CmdString()}, commandName) { err = opt.validateConfigDir(commandName) if err != nil { return err @@ -195,9 +167,8 @@ func (opt *DatabaseOptions) validateHostsAndPwd(commandName string, log vlog.Pri // when we create db, we need to set password to "" if user did not provide one if opt.Password == nil { - if commandName == commandCreateDB { + if commandName == CreateDBCmd.CmdString() { opt.Password = new(string) - *opt.Password = "" } log.PrintInfo("no password specified, using none") } @@ -207,7 +178,7 @@ func (opt *DatabaseOptions) validateHostsAndPwd(commandName string, log vlog.Pri // validate catalog, data, and depot paths func (opt *DatabaseOptions) 
validatePaths(commandName string) error { // validate for the following commands only - commands := []string{commandCreateDB, commandDropDB, commandConfigRecover} + commands := []string{CreateDBCmd.CmdString(), DropDBCmd.CmdString(), ConfigRecoverCmd.CmdString()} if !slices.Contains(commands, commandName) { return nil } @@ -220,7 +191,7 @@ func (opt *DatabaseOptions) validatePaths(commandName string) error { // data prefix // `manage_config recover` does not need the data-path - if commandName != commandConfigRecover { + if commandName != ConfigRecoverCmd.CmdString() { err = util.ValidateRequiredAbsPath(opt.DataPrefix, "data path") if err != nil { return err @@ -246,8 +217,10 @@ func (opt *DatabaseOptions) validateCatalogPath() error { func (opt *DatabaseOptions) validateConfigDir(commandName string) error { // validate for the following commands only // TODO: add other commands into the command list - commands := []string{commandCreateDB, commandDropDB, commandStopDB, commandStartDB, commandAddSubcluster, commandRemoveSubcluster, - commandSandboxSC, commandUnsandboxSC, commandShowRestorePoints, commandAddNode, commandRemoveNode, commandInstallPackages} + commands := []string{CreateDBCmd.CmdString(), DropDBCmd.CmdString(), StopDBCmd.CmdString(), StartDBCmd.CmdString(), + AddSubclusterCmd.CmdString(), RemoveSubclusterCmd.CmdString(), + SandboxSCCmd.CmdString(), UnsandboxSCCmd.CmdString(), ShowRestorePointsCmd.CmdString(), AddNodeCmd.CmdString(), + RemoveNodeCmd.CmdString(), InstallPackagesCmd.CmdString()} if slices.Contains(commands, commandName) { return nil } diff --git a/vclusterops/vlog/printer.go b/vclusterops/vlog/printer.go index c69efe2..8fbbd92 100644 --- a/vclusterops/vlog/printer.go +++ b/vclusterops/vlog/printer.go @@ -17,6 +17,7 @@ package vlog import ( "fmt" + "io" "os" "strings" "unicode" @@ -44,6 +45,8 @@ type Printer struct { LogToFileOnly bool // ForCli can indicate if vclusterops is called from vcluster cli or other clients ForCli bool + + Writer 
io.Writer } // WithName will construct a new printer with the logger set with an additional @@ -53,6 +56,7 @@ func (p *Printer) WithName(logName string) Printer { Log: p.Log.WithName(logName), LogToFileOnly: p.LogToFileOnly, ForCli: p.ForCli, + Writer: p.Writer, } }