Skip to content

Commit

Permalink
Sync from server repo (4c95da46ab)
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt Spilchen committed Sep 25, 2023
1 parent f0bd199 commit dadafc1
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 9 deletions.
27 changes: 27 additions & 0 deletions commands/cmd_add_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package commands
import (
"flag"
"fmt"
"strings"

"github.com/vertica/vcluster/vclusterops"
"github.com/vertica/vcluster/vclusterops/util"
Expand All @@ -32,6 +33,8 @@ type CmdAddNode struct {
addNodeOptions *vclusterops.VAddNodeOptions
// Comma-separated list of hosts to add
newHostListStr *string
// Comma-separated list of node names, which exist in the cluster
nodeNameListStr *string

CmdBase
}
Expand Down Expand Up @@ -67,6 +70,11 @@ func makeCmdAddNode() *CmdAddNode {
addNodeOptions.DepotPrefix = newCmd.parser.String("depot-path", "", util.GetEonFlagMsg("Path to depot directory"))
addNodeOptions.DepotSize = newCmd.parser.String("depot-size", "", util.GetEonFlagMsg("Size of depot"))

// Optional flags
newCmd.nodeNameListStr = newCmd.parser.String("node-names", "",
util.GetOptionalFlagMsg("Comma-separated list of node names that exist in the cluster. "+
"Use with caution: not mentioned nodes will be trimmed from catalog."))

newCmd.addNodeOptions = &addNodeOptions
return newCmd
}
Expand Down Expand Up @@ -106,6 +114,11 @@ func (c *CmdAddNode) validateParse() error {
return err
}

err = c.parseNodeNameList()
if err != nil {
return err
}

return c.ValidateParseBaseOptions(&c.addNodeOptions.DatabaseOptions)
}

Expand All @@ -121,6 +134,20 @@ func (c *CmdAddNode) parseNewHostList() error {
return nil
}

// parseNodeNameList converts the value of --node-names into
// addNodeOptions.ExpectedNodeNames.
//
// It does nothing when the flag was not set on the command line. When the
// flag is set, the value must hold at least one node name. Surrounding
// whitespace of each comma-separated entry is ignored, and an empty entry
// (e.g. "n1,,n2" or a trailing comma) is rejected: nodes that are not
// listed get trimmed from the catalog, so a malformed list must never be
// accepted silently.
func (c *CmdAddNode) parseNodeNameList() error {
	if !util.IsOptionSet(c.parser, "node-names") {
		return nil
	}

	// if --node-names is set, there must be at least one node name
	if strings.TrimSpace(*c.nodeNameListStr) == "" {
		return fmt.Errorf("when --node-names is specified, "+
			"must provide all existing node names in %s", *c.addNodeOptions.DBName)
	}

	rawNames := strings.Split(*c.nodeNameListStr, ",")
	nodeNames := make([]string, 0, len(rawNames))
	for _, rawName := range rawNames {
		name := strings.TrimSpace(rawName)
		if name == "" {
			return fmt.Errorf("--node-names %q contains an empty node name",
				*c.nodeNameListStr)
		}
		nodeNames = append(nodeNames, name)
	}
	c.addNodeOptions.ExpectedNodeNames = nodeNames

	return nil
}

// Analyze performs no work for the add-node command and always returns nil.
func (c *CmdAddNode) Analyze() error {
return nil
}
Expand Down
94 changes: 94 additions & 0 deletions vclusterops/add_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ type VAddNodeOptions struct {
DepotSize *string // like 10G
// Skip rebalance shards if true
SkipRebalanceShards *bool

// Names of the existing nodes in the cluster.
// This option can be used to remove partially added nodes from catalog.
ExpectedNodeNames []string
}

func VAddNodeOptionsFactory() VAddNodeOptions {
Expand Down Expand Up @@ -166,6 +170,13 @@ func (vcc *VClusterCommands) VAddNode(options *VAddNodeOptions) (VCoordinationDa
return vdb, err
}

// trim stale node information from catalog
// if ExpectedNodeNames is provided
err = trimNodesInCatalog(&vdb, options)
if err != nil {
return vdb, err
}

err = vdb.addHosts(options.NewHosts)
if err != nil {
return vdb, err
Expand Down Expand Up @@ -219,6 +230,89 @@ func (o *VAddNodeOptions) completeVDBSetting(vdb *VCoordinationDatabase) error {
return nil
}

// trimNodesInCatalog drops from the catalog every node of vdb that is not
// listed in options.ExpectedNodeNames. It can be used to remove partially
// added nodes before new hosts are added.
//
// It is a no-op when options.ExpectedNodeNames is empty. An error is returned
// when a node that would be trimmed is UP, when an expected node name is not
// present in vdb.HostNodeMap, or when building or running the cluster ops
// fails. On success vdb.HostList is replaced by the hosts of the expected
// nodes and vdb.HostNodeMap is filtered down to those hosts.
func trimNodesInCatalog(vdb *VCoordinationDatabase,
	options *VAddNodeOptions) error {
	if len(options.ExpectedNodeNames) == 0 {
		vlog.LogInfoln("ExpectedNodeNames is not set, skip trimming nodes")
		return nil
	}

	// find out nodes to be trimmed
	// trimmed nodes are the ones in catalog but not expected
	expectedNodeNames := make(map[string]any, len(options.ExpectedNodeNames))
	for _, nodeName := range options.ExpectedNodeNames {
		expectedNodeNames[nodeName] = struct{}{}
	}

	var aliveHosts []string
	var nodesToTrim []string
	// first expected host found in UP state, if any; used as initiator
	var upHost string
	foundUpHost := false
	nodeNamesInCatalog := make(map[string]any, len(vdb.HostNodeMap))
	for h, vnode := range vdb.HostNodeMap {
		nodeNamesInCatalog[vnode.Name] = struct{}{}
		if _, ok := expectedNodeNames[vnode.Name]; ok { // catalog node is expected
			aliveHosts = append(aliveHosts, h)
			if !foundUpHost && vnode.State == util.NodeUpState {
				upHost = h
				foundUpHost = true
			}
			continue
		}

		// catalog node is not expected, trim it
		// cannot trim UP nodes
		if vnode.State == util.NodeUpState {
			return fmt.Errorf("cannot trim the UP node %s (address %s)",
				vnode.Name, h)
		}
		nodesToTrim = append(nodesToTrim, vnode.Name)
	}

	// sanity check: all provided node names should be found in catalog
	// (this also guarantees that aliveHosts is not empty below)
	invalidNodeNames := util.MapKeyDiff(expectedNodeNames, nodeNamesInCatalog)
	if len(invalidNodeNames) > 0 {
		return fmt.Errorf("node names %v are not found in database %s",
			invalidNodeNames, vdb.Name)
	}

	vlog.LogPrintInfo("Trim nodes %+v from catalog", nodesToTrim)

	// pick an UP host as initiator when there is one. Map iteration order is
	// random, so blindly taking the first expected host could select a node
	// that is not UP. Fall back to the first expected host otherwise.
	// A fresh slice is used so the initiator never aliases aliveHosts.
	initiator := []string{aliveHosts[0]}
	if foundUpHost {
		initiator = []string{upHost}
	}

	var instructions []ClusterOp

	// mark k-safety
	if len(aliveHosts) < ksafetyThreshold {
		httpsMarkDesignKSafeOp, err := makeHTTPSMarkDesignKSafeOp(initiator,
			options.usePassword, *options.UserName, options.Password,
			ksafeValueZero)
		if err != nil {
			return err
		}
		instructions = append(instructions, &httpsMarkDesignKSafeOp)
	}

	// remove down nodes from catalog
	for _, nodeName := range nodesToTrim {
		httpsDropNodeOp, err := makeHTTPSDropNodeOp(nodeName, initiator,
			options.usePassword, *options.UserName, options.Password, vdb.IsEon)
		if err != nil {
			return err
		}
		instructions = append(instructions, &httpsDropNodeOp)
	}

	certs := HTTPSCerts{key: options.Key, cert: options.Cert, caCert: options.CaCert}
	clusterOpEngine := MakeClusterOpEngine(instructions, &certs)
	err := clusterOpEngine.Run()
	if err != nil {
		vlog.LogPrintError("fail to trim nodes from catalog, %v", err)
		return err
	}

	// update vdb info
	vdb.HostNodeMap = util.FilterMapByKey(vdb.HostNodeMap, aliveHosts)
	vdb.HostList = aliveHosts

	return nil
}

// produceAddNodeInstructions will build a list of instructions to execute for
// the add node operation.
//
Expand Down
34 changes: 29 additions & 5 deletions vclusterops/https_poll_node_state_op.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,22 @@ import (
// 30 seconds is long enough for normal http request.
// If this timeout is reached, it might imply that the target IP is unreachable
const httpRequestTimeoutSeconds = 30
// CmdType identifies the command on whose behalf an operation is executed.
type CmdType int

// Known command types.
const (
	// StartDBCmd is reported as "start_db".
	StartDBCmd CmdType = iota
	// RestartNodeCmd is reported as "restart_node".
	RestartNodeCmd
)

// String returns the textual name of the command type, or
// "unknown_operation" for a value that is not one of the known constants.
func (cmd CmdType) String() string {
	names := [...]string{
		StartDBCmd:     "start_db",
		RestartNodeCmd: "restart_node",
	}
	if cmd < 0 || int(cmd) >= len(names) {
		return "unknown_operation"
	}
	return names[cmd]
}

type HTTPSPollNodeStateOp struct {
OpBase
Expand All @@ -38,6 +54,7 @@ type HTTPSPollNodeStateOp struct {
upHosts map[string]any
notUpHosts []string
timeout int
cmdType CmdType
}

func makeHTTPSPollNodeStateOpHelper(hosts []string,
Expand All @@ -62,14 +79,15 @@ func makeHTTPSPollNodeStateOpHelper(hosts []string,
return httpsPollNodeStateOp, nil
}

// makeHTTPSPollNodeStateOpWithTimeoutAndCommand builds an HTTPSPollNodeStateOp
// through makeHTTPSPollNodeStateOpHelper and then records the given polling
// timeout and the type of the command that requested the polling.
// When the helper fails, its op and error are returned unchanged.
func makeHTTPSPollNodeStateOpWithTimeout(hosts []string,
func makeHTTPSPollNodeStateOpWithTimeoutAndCommand(hosts []string,
useHTTPPassword bool, userName string, httpsPassword *string,
timeout int) (HTTPSPollNodeStateOp, error) {
timeout int, cmdType CmdType) (HTTPSPollNodeStateOp, error) {
op, err := makeHTTPSPollNodeStateOpHelper(hosts, useHTTPPassword, userName, httpsPassword)
if err != nil {
return op, err
}
op.timeout = timeout
// cmdType lets the op tailor its error reporting to the calling command
op.cmdType = cmdType
return op, nil
}

Expand Down Expand Up @@ -189,10 +207,16 @@ func (op *HTTPSPollNodeStateOp) shouldStopPolling() (bool, error) {

// VER-88185 vcluster start_db - password related issues
// We don't need to wait until timeout to determine if all nodes are up or not.
// If we find the wrong password for the HTTPS service on any hosts, we should fail immediately."
// If we find the wrong password for the HTTPS service on any hosts, we should fail immediately.
// We also need to let user know to wait until all nodes are up
if result.IsPasswordAndCertificateError() {
vlog.LogPrintError("[%s] The credentials are incorrect. The following steps like 'Catalog Sync' will not be executed.",
op.name)
switch op.cmdType {
case StartDBCmd, RestartNodeCmd:
vlog.LogPrintError("[%s] The credentials are incorrect. 'Catalog Sync' will not be executed.",
op.name)
return true, fmt.Errorf("[%s] wrong password/certificate for https service on host %s, but the nodes' startup have been in progress."+
"Please use vsql to check the nodes' status and manually run sync_catalog vsql command 'select sync_catalog()'", op.name, host)
}
return true, fmt.Errorf("[%s] wrong password/certificate for https service on host %s",
op.name, host)
}
Expand Down
4 changes: 2 additions & 2 deletions vclusterops/restart_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,8 @@ func (vcc *VClusterCommands) produceRestartNodesInstructions(restartNodeInfo *VR
return instructions, err
}
nmaRestartNewNodesOp := makeNMAStartNodeOpWithVDB(restartNodeInfo.HostsToRestart, vdb)
httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeout(restartNodeInfo.HostsToRestart,
options.usePassword, *options.UserName, options.Password, options.StatePollingTimeout)
httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeoutAndCommand(restartNodeInfo.HostsToRestart,
options.usePassword, *options.UserName, options.Password, options.StatePollingTimeout, RestartNodeCmd)
if err != nil {
return instructions, err
}
Expand Down
4 changes: 2 additions & 2 deletions vclusterops/start_db.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ func (vcc *VClusterCommands) produceStartDBInstructions(options *VStartDatabaseO
nil /*db configurations retrieved from a running db*/)

nmaStartNewNodesOp := makeNMAStartNodeOp(options.Hosts)
httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeout(options.Hosts,
options.usePassword, *options.UserName, options.Password, options.StatePollingTimeout)
httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOpWithTimeoutAndCommand(options.Hosts,
options.usePassword, *options.UserName, options.Password, options.StatePollingTimeout, StartDBCmd)
if err != nil {
return instructions, err
}
Expand Down

0 comments on commit dadafc1

Please sign in to comment.