diff --git a/README.md b/README.md index 34285f5..86f79a7 100644 --- a/README.md +++ b/README.md @@ -94,4 +94,4 @@ We can use similar way to set up and call other vcluster-ops commands. ## Licensing -vcluster is open source code and is under the Apache 2.0 license. Please see `LICENSE` for details. \ No newline at end of file +vcluster is open source and is under the Apache 2.0 license. Please see `LICENSE` for details. diff --git a/commands/cmd_create_db.go b/commands/cmd_create_db.go index a7164a5..a869996 100644 --- a/commands/cmd_create_db.go +++ b/commands/cmd_create_db.go @@ -275,7 +275,6 @@ func (c *CmdCreateDB) Run(vcc vclusterops.ClusterCommands) error { vcc.V(1).Info("Called method Run()") vdb, createError := vcc.VCreateDatabase(c.createDBOptions) if createError != nil { - vcc.LogError(createError, "Failed to create the database.") return createError } diff --git a/vclusterops/cluster_op.go b/vclusterops/cluster_op.go index 8693902..c615090 100644 --- a/vclusterops/cluster_op.go +++ b/vclusterops/cluster_op.go @@ -72,10 +72,11 @@ const ( ) const ( - SuccessCode = 200 - MultipleChoiceCode = 300 - UnauthorizedCode = 401 - InternalErrorCode = 500 + SuccessCode = 200 + MultipleChoiceCode = 300 + UnauthorizedCode = 401 + PreconditionFailedCode = 412 + InternalErrorCode = 500 ) // hostHTTPResult is used to save result of an Adapter's sendRequest(...) function @@ -97,13 +98,17 @@ const respSuccStatusCode = 0 // The HTTP response with a 401 status code can have several scenarios: // 1. Wrong password // 2. Wrong certificate -// 3. The local node has not yet joined the cluster; the HTTP server will accept connections once the node joins the cluster. -// HTTPCheckDBRunningOp in create_db need to check all scenarios to see any HTTP running -// For HTTPSPollNodeStateOp in start_db, it requires only handling the first and second scenarios +// HTTPCheckDBRunningOp in create_db and HTTPSPollNodeStateOp in start_db need to handle these scenarios func (hostResult *hostHTTPResult) isUnauthorizedRequest() bool { return hostResult.statusCode == UnauthorizedCode } +// The HTTP response with a 412 may happen if +// the local node has not yet joined the cluster; the HTTP server will accept connections once the node joins the cluster. +func (hostResult *hostHTTPResult) hasPreconditionFailed() bool { + return hostResult.statusCode == PreconditionFailedCode +} + // isSuccess returns true if status code is 200 func (hostResult *hostHTTPResult) isSuccess() bool { return hostResult.statusCode == SuccessCode @@ -129,7 +134,8 @@ func (hostResult *hostHTTPResult) isInternalError() bool { } func (hostResult *hostHTTPResult) isHTTPRunning() bool { - if hostResult.isPassing() || hostResult.isUnauthorizedRequest() || hostResult.isInternalError() { + if hostResult.isPassing() || hostResult.isUnauthorizedRequest() || + hostResult.isInternalError() || hostResult.hasPreconditionFailed() { return true } return false diff --git a/vclusterops/create_db.go b/vclusterops/create_db.go index 03d4992..57ae8ee 100644 --- a/vclusterops/create_db.go +++ b/vclusterops/create_db.go @@ -295,6 +295,7 @@ func (vcc VClusterCommands) VCreateDatabase(options *VCreateDatabaseOptions) (VC vdb := makeVCoordinationDatabase() err := vdb.setFromCreateDBOptions(options, vcc.Log) if err != nil { + vcc.Log.Error(err, "fail to create database") return vdb, err } // produce instructions diff --git a/vclusterops/fetch_node_state.go b/vclusterops/fetch_node_state.go index 45d9646..e2cbfd7 100644 --- a/vclusterops/fetch_node_state.go +++ b/vclusterops/fetch_node_state.go @@ -72,7 +72,7 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] // this vdb is used to fetch node version var vdb VCoordinationDatabase - err = vcc.getVDBFromRunningDBIncludeSandbox(&vdb, &options.DatabaseOptions, util.MainClusterSandbox) + err = vcc.getVDBFromMainRunningDBContainsSandbox(&vdb, &options.DatabaseOptions) if err != nil { vcc.Log.PrintInfo("Error from vdb build: %s", err.Error()) @@ -91,7 +91,13 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] return vcc.fetchNodeStateFromDownDB(options) } - // produce list_all_nodes instructions + nodeStates := buildNodeStateList(&vdb, false /*forDownDatabase*/) + // return the result if no need to get version info + if !options.GetVersion { + return nodeStates, nil + } + + // produce instructions to fill node information instructions, err := vcc.produceListAllNodesInstructions(options, &vdb) if err != nil { return nil, fmt.Errorf("fail to produce instructions, %w", err) @@ -102,7 +108,6 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] // give the instructions to the VClusterOpEngine to run runError := clusterOpEngine.run(vcc.Log) - nodeStates := clusterOpEngine.execContext.nodesInfo if runError == nil { // fill node version for i, nodeInfo := range nodeStates { @@ -116,34 +121,9 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] nodeInfo.Address) } } - - return nodeStates, nil - } - - // error out in case of wrong certificate or password - if len(clusterOpEngine.execContext.hostsWithWrongAuth) > 0 { - return nodeStates, - fmt.Errorf("wrong certificate or password on hosts %v", clusterOpEngine.execContext.hostsWithWrongAuth) - } - - // if failed to get node info from a running database, - // we will try to get it by reading catalog editor - upNodeCount := 0 - for _, n := range nodeStates { - if n.State == util.NodeUpState { - upNodeCount++ - } - } - - if upNodeCount == 0 { - if options.SkipDownDatabase { - return []NodeInfo{}, rfc7807.New(rfc7807.FetchDownDatabase) - } - - return vcc.fetchNodeStateFromDownDB(options) } - return nodeStates, runError + return nodeStates, nil } func (vcc VClusterCommands) fetchNodeStateFromDownDB(options *VFetchNodeStateOptions) ([]NodeInfo, error) { @@ -163,18 +143,7 @@ func (vcc VClusterCommands) fetchNodeStateFromDownDB(options *VFetchNodeStateOpt return nodeStates, err } - for _, h := range vdb.HostList { - var nodeInfo NodeInfo - n := vdb.HostNodeMap[h] - nodeInfo.Address = n.Address - nodeInfo.Name = n.Name - nodeInfo.CatalogPath = n.CatalogPath - nodeInfo.Subcluster = n.Subcluster - nodeInfo.IsPrimary = n.IsPrimary - nodeInfo.Version = n.Version - nodeInfo.State = util.NodeDownState - nodeStates = append(nodeStates, nodeInfo) - } + nodeStates = buildNodeStateList(&vdb, true /*forDownDatabase*/) return nodeStates, nil } @@ -186,73 +155,64 @@ func (vcc VClusterCommands) produceListAllNodesInstructions( vdb *VCoordinationDatabase) ([]clusterOp, error) { var instructions []clusterOp - // get hosts - hosts := options.Hosts - - // validate user name - usePassword := false - if options.Password != nil { - usePassword = true - err := options.validateUserName(vcc.Log) - if err != nil { - return instructions, err - } - } - nmaHealthOp := makeNMAHealthOpSkipUnreachable(options.Hosts) nmaReadVerticaVersionOp := makeNMAReadVerticaVersionOp(vdb) - // Trim host list - hosts = options.updateHostlist(vcc, vdb, hosts) - - httpsCheckNodeStateOp, err := makeHTTPSCheckNodeStateOp(hosts, - usePassword, options.UserName, options.Password) - if err != nil { - return instructions, err - } - if options.GetVersion { instructions = append(instructions, &nmaHealthOp, &nmaReadVerticaVersionOp) } - instructions = append(instructions, - &httpsCheckNodeStateOp, - ) - return instructions, nil } -// Update and limit the hostlist based on status and sandbox info -// Note: if we have any UP main cluster host in the input list, the trimmed hostlist would always contain -// -// only main cluster UP hosts. -func (options *VFetchNodeStateOptions) updateHostlist(vcc VClusterCommands, vdb *VCoordinationDatabase, inputHosts []string) []string { - var mainClusterHosts []string - var upSandboxHosts []string - - for _, h := range inputHosts { - vnode, ok := vdb.HostNodeMap[h] - if !ok { - // host address not found in vdb, skip it - continue +func buildNodeStateList(vdb *VCoordinationDatabase, forDownDatabase bool) []NodeInfo { + var nodeStates []NodeInfo + + // a map from a subcluster name to whether it is primary + // Context: if a node is primary, the subcluster it belongs to is a primary subcluster. + // If any of the nodes are down in such a primary subcluster, HTTPSUpdateNodeStateOp cannot correctly + // update its IsPrimary value, because this op sends request to each host. + // We use the following scMap to check whether any node is primary in each subcluster, + // then update other nodes' IsPrimary value in this subcluster. + scMap := make(map[string]bool) + + for _, h := range vdb.HostList { + var nodeInfo NodeInfo + n := vdb.HostNodeMap[h] + nodeInfo.Address = n.Address + nodeInfo.CatalogPath = n.CatalogPath + nodeInfo.IsPrimary = n.IsPrimary + nodeInfo.Name = n.Name + nodeInfo.Sandbox = n.Sandbox + if forDownDatabase { + nodeInfo.State = util.NodeDownState + } else { + nodeInfo.State = n.State } - if vnode.Sandbox == "" && (vnode.State == util.NodeUpState || vnode.State == util.NodeUnknownState) { - mainClusterHosts = append(mainClusterHosts, vnode.Address) - } else if vnode.State == util.NodeUpState { - upSandboxHosts = append(upSandboxHosts, vnode.Address) + nodeInfo.Subcluster = n.Subcluster + nodeInfo.Version = n.Version + + nodeStates = append(nodeStates, nodeInfo) + + if !forDownDatabase { + if isPrimary, exists := scMap[n.Subcluster]; exists { + scMap[n.Subcluster] = isPrimary || n.IsPrimary + } else { + scMap[n.Subcluster] = n.IsPrimary + } } } - if len(mainClusterHosts) > 0 { - vcc.Log.PrintWarning("Main cluster UP node found in host list. The status would be fetched from a main cluster host!") - return mainClusterHosts - } - if len(upSandboxHosts) > 0 { - vcc.Log.PrintWarning("Only sandboxed UP nodes found in host list. The status would be fetched from a sandbox host!") - return upSandboxHosts + + // update IsPrimary of the nodes for running database + if !forDownDatabase { + for i := 0; i < len(nodeStates); i++ { + nodeInfo := nodeStates[i] + scName := nodeInfo.Subcluster + nodeStates[i].IsPrimary = scMap[scName] + } } - // We do not have an up host, so better try with complete input hostlist - return inputHosts + return nodeStates } diff --git a/vclusterops/https_check_node_state_op.go b/vclusterops/https_check_node_state_op.go deleted file mode 100644 index 46adb6a..0000000 --- a/vclusterops/https_check_node_state_op.go +++ /dev/null @@ -1,138 +0,0 @@ -/* - (c) Copyright [2023-2024] Open Text. - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package vclusterops - -import ( - "errors" - "fmt" - - "github.com/vertica/vcluster/vclusterops/util" -) - -type httpsCheckNodeStateOp struct { - opBase - opHTTPSBase -} - -func makeHTTPSCheckNodeStateOp(hosts []string, - useHTTPPassword bool, - userName string, - httpsPassword *string, -) (httpsCheckNodeStateOp, error) { - op := httpsCheckNodeStateOp{} - op.name = "HTTPCheckNodeStateOp" - op.description = "Check node state from running database" - // The hosts are the ones we are going to talk to. - // They can be a subset of the actual host information that we return, - // as if any of the hosts is responsive, spread can give us the info of all nodes - op.hosts = hosts - op.useHTTPPassword = useHTTPPassword - - err := util.ValidateUsernameAndPassword(op.name, useHTTPPassword, userName) - if err != nil { - return op, err - } - - op.userName = userName - op.httpsPassword = httpsPassword - return op, nil -} - -func (op *httpsCheckNodeStateOp) setupClusterHTTPRequest(hosts []string) error { - for _, host := range hosts { - httpRequest := hostHTTPRequest{} - httpRequest.Method = GetMethod - httpRequest.buildHTTPSEndpoint("nodes") - if op.useHTTPPassword { - httpRequest.Password = op.httpsPassword - httpRequest.Username = op.userName - } - op.clusterHTTPRequest.RequestCollection[host] = httpRequest - } - - return nil -} - -func (op *httpsCheckNodeStateOp) prepare(execContext *opEngineExecContext) error { - execContext.dispatcher.setup(op.hosts) - - return op.setupClusterHTTPRequest(op.hosts) -} - -func (op *httpsCheckNodeStateOp) execute(execContext *opEngineExecContext) error { - if err := op.runExecute(execContext); err != nil { - return err - } - - return op.processResult(execContext) -} - -func (op *httpsCheckNodeStateOp) processResult(execContext *opEngineExecContext) error { - var allErrs error - respondingNodeCount := 0 - - for host, result := range op.clusterHTTPRequest.ResultCollection { - op.logResponse(host, result) - - if result.isUnauthorizedRequest() { - op.logger.PrintError("[%s] unauthorized request: %s", op.name, result.content) - execContext.hostsWithWrongAuth = append(execContext.hostsWithWrongAuth, host) - // return here because we assume that - // we will get the same error across other nodes - allErrs = errors.Join(allErrs, result.err) - return allErrs - } - - if !result.isPassing() { - // for any error, we continue to the next node - if result.isInternalError() { - op.logger.PrintError("[%s] internal error of the /nodes endpoint: %s", op.name, result.content) - // At internal error originated from the server, so its a - // response, just not a successful one. - respondingNodeCount++ - } - allErrs = errors.Join(allErrs, result.err) - continue - } - - // parse the /nodes endpoint response - respondingNodeCount++ - nodesStates := nodesStateInfo{} - err := op.parseAndCheckResponse(host, result.content, &nodesStates) - if err != nil { - err = fmt.Errorf("[%s] fail to parse result on host %s: %w", - op.name, host, err) - allErrs = errors.Join(allErrs, err) - continue - } - - nodesInfo := nodesInfo{} - for _, node := range nodesStates.NodeList { - n := node.asNodeInfoWithoutVer() - nodesInfo.NodeList = append(nodesInfo.NodeList, n) - } - // successful case, write the result into exec context - execContext.nodesInfo = nodesInfo.NodeList - op.logger.PrintInfo("reporting results as obtained from the host [%s] ", host) - return nil - } - - return allErrs -} - -func (op *httpsCheckNodeStateOp) finalize(_ *opEngineExecContext) error { - return nil -} diff --git a/vclusterops/https_get_nodes_info_op.go b/vclusterops/https_get_nodes_info_op.go index 66e462b..3eb9065 100644 --- a/vclusterops/https_get_nodes_info_op.go +++ b/vclusterops/https_get_nodes_info_op.go @@ -113,6 +113,13 @@ func (op *httpsGetNodesInfoOp) processResult(_ *opEngineExecContext) error { for host, result := range op.clusterHTTPRequest.ResultCollection { op.logResponse(host, result) + // A host may have precondition failed, such as + // "Local node has not joined cluster yet, HTTP server will accept connections when the node has joined the cluster" + // In this case, we skip use the information from that host + if result.hasPreconditionFailed() { + continue + } + if result.isUnauthorizedRequest() { detail := fmt.Sprintf("[%s] wrong password/certificate for https service on host %s", op.name, host) diff --git a/vclusterops/https_stop_db_op.go b/vclusterops/https_stop_db_op.go index 37e102b..3821e7a 100644 --- a/vclusterops/https_stop_db_op.go +++ b/vclusterops/https_stop_db_op.go @@ -21,6 +21,7 @@ import ( "regexp" "strconv" + mapset "github.com/deckarep/golang-set/v2" "github.com/vertica/vcluster/vclusterops/util" ) @@ -87,6 +88,7 @@ func (op *httpsStopDBOp) prepare(execContext *opEngineExecContext) error { sandboxOnly := false var mainHost string var hosts []string + sandboxes := mapset.NewSet[string]() for h, sb := range execContext.upHostsToSandboxes { if sb == op.sandbox && sb != "" { // stop db only on sandbox @@ -96,7 +98,8 @@ func (op *httpsStopDBOp) prepare(execContext *opEngineExecContext) error { } if sb == "" { mainHost = h - } else { + } else if !sandboxes.Contains(sb) { + sandboxes.Add(sb) hosts = append(hosts, h) } } @@ -124,6 +127,7 @@ func (op *httpsStopDBOp) execute(execContext *opEngineExecContext) error { func (op *httpsStopDBOp) processResult(_ *opEngineExecContext) error { var allErrs error re := regexp.MustCompile(`Set subcluster \(.*\) to draining state.*`) + regHang := regexp.MustCompile(`context\s+deadline\s+exceeded\s+\(Client\.Timeout\s+exceeded\s+while\s+awaiting\s+headers\)`) for host, result := range op.clusterHTTPRequest.ResultCollection { op.logResponse(host, result) @@ -135,6 +139,11 @@ func (op *httpsStopDBOp) processResult(_ *opEngineExecContext) error { } if !result.isPassing() { allErrs = errors.Join(allErrs, result.err) + if regHang.MatchString(result.err.Error()) { + err := fmt.Errorf("hint: use NMA endpoint /v1/vertica-process/signal?signal_type=kill to terminate a hanging Vertica " + + "process on the failed host") + allErrs = errors.Join(allErrs, err) + } continue } diff --git a/vclusterops/https_update_node_state_op.go b/vclusterops/https_update_node_state_op.go index 401f9ba..a8a8703 100644 --- a/vclusterops/https_update_node_state_op.go +++ b/vclusterops/https_update_node_state_op.go @@ -87,6 +87,19 @@ func (op *httpsUpdateNodeStateOp) processResult(execContext *opEngineExecContext for host, result := range op.clusterHTTPRequest.ResultCollection { op.logResponse(host, result) + // A host may have precondition failed, such as + // "Local node has not joined cluster yet, HTTP server will accept connections when the node has joined the cluster" + // In this case, we mark the node status as UNKNOWN + if result.hasPreconditionFailed() { + vnode, ok := op.vdb.HostNodeMap[host] + if !ok { + return fmt.Errorf("cannot find host %s in vdb", host) + } + vnode.State = util.NodeUnknownState + + continue + } + if result.isUnauthorizedRequest() { op.logger.PrintError("[%s] unauthorized request: %s", op.name, result.content) execContext.hostsWithWrongAuth = append(execContext.hostsWithWrongAuth, host) @@ -124,6 +137,7 @@ func (op *httpsUpdateNodeStateOp) processResult(execContext *opEngineExecContext return fmt.Errorf("cannot find host %s in vdb", host) } vnode.State = nodeInfo.State + vnode.IsPrimary = nodeInfo.IsPrimary } else { // if the result format is wrong on any of the hosts, we should throw an error return fmt.Errorf(util.NodeInfoCountMismatch, op.name, len(nodesInformation.NodeList), host)