Skip to content

Commit

Permalink
Merge pull request #15609 from daos-stack/mjmac/dead_ranks
Browse files Browse the repository at this point in the history
mjmac/dead ranks
  • Loading branch information
jolivier23 authored Dec 13, 2024
2 parents f943797 + c57c674 commit 8088155
Show file tree
Hide file tree
Showing 43 changed files with 1,388 additions and 814 deletions.
22 changes: 22 additions & 0 deletions docs/admin/pool_operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,28 @@ The example below shows a rebuild in progress and NVMe space allocated.
Rebuild busy, 75 objs, 9722 recs
```

After experiencing significant failures, the pool may retain some "dead"
engines: engines that have been marked as DEAD by the SWIM protocol but were
deliberately not excluded from the pool, in order to prevent potential data
inconsistency. An administrator can bring these engines back online by
restarting them. The example below illustrates the system’s status with dead
and disabled engines.

```bash
$ dmg pool query tank -t
```

NB: The --health-only/-t option restricts the command to pool health-related queries only.
This is important because dead ranks may cause commands to hang and time out, so
identifying and restarting them is a useful procedure.

```bash
Pool 6f450a68-8c7d-4da9-8900-02691650f6a2, ntarget=8, disabled=2, leader=3, version=4, state=Degraded
Pool health info:
- Disabled ranks: 1
- Dead ranks: 2
- Rebuild busy, 0 objs, 0 recs
```

Additional status and telemetry data is planned to be exported through
management tools and will be documented here once available.

Expand Down
6 changes: 3 additions & 3 deletions src/common/tests_dmg_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -1118,9 +1118,9 @@ dmg_pool_extend(const char *dmg_config_file, const uuid_t uuid,
rank_list.rl_ranks = ranks;
rank_list.rl_nr = rank_nr;

rank_str = d_rank_list_to_str(&rank_list);
if (rank_str == NULL)
D_GOTO(out, rc = -DER_NOMEM);
rc = d_rank_list_to_str(&rank_list, &rank_str);
if (rc != 0)
D_GOTO(out, rc);

uuid_unparse_lower(uuid, uuid_str);
args = cmd_push_arg(args, &argcount, "%s ", uuid_str);
Expand Down
19 changes: 7 additions & 12 deletions src/control/cmd/daos/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,24 +99,19 @@ func (cmd *healthCheckCmd) Execute([]string) error {
}
}()

queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines)
queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDeadEngines)
if pool.DisabledTargets > 0 {
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
}
tpi, err := queryPool(poolHdl, queryMask)
if err != nil {
cmd.Errorf("failed to query pool %s: %v", pool.Label, err)
continue
}
pool.EnabledRanks = tpi.EnabledRanks

if pool.DisabledTargets > 0 {
queryMask.ClearAll()
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
tpi, err = queryPool(poolHdl, queryMask)
if err != nil {
cmd.Errorf("failed to query pool %s: %v", pool.Label, err)
continue
}
pool.DisabledRanks = tpi.DisabledRanks
}
pool.DisabledRanks = tpi.DisabledRanks
pool.DeadRanks = tpi.DeadRanks

poolConts, err := listContainers(poolHdl)
if err != nil {
Expand Down
79 changes: 68 additions & 11 deletions src/control/cmd/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,8 @@ type poolCmd struct {

type poolQueryCmd struct {
poolBaseCmd
ShowEnabledRanks bool `short:"e" long:"show-enabled" description:"Show engine unique identifiers (ranks) which are enabled"`
ShowDisabledRanks bool `short:"b" long:"show-disabled" description:"Show engine unique identifiers (ranks) which are disabled"`
HealthOnly bool `short:"t" long:"health-only" description:"Only perform pool health related queries"`
ShowEnabledRanks bool `short:"e" long:"show-enabled" description:"Show engine unique identifiers (ranks) which are enabled"`
HealthOnly bool `short:"t" long:"health-only" description:"Only perform pool health related queries"`
}

func convertPoolSpaceInfo(in *C.struct_daos_pool_space, mt C.uint) *daos.StorageUsageStats {
Expand Down Expand Up @@ -296,11 +295,12 @@ func convertPoolInfo(pinfo *C.daos_pool_info_t) (*daos.PoolInfo, error) {
return poolInfo, nil
}

func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var rlPtr **C.d_rank_list_t = nil
var rl *C.d_rank_list_t = nil

if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) ||
queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
rlPtr = &rl
}

Expand Down Expand Up @@ -330,6 +330,68 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo
if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
poolInfo.DisabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
poolInfo.DeadRanks = rs
}
}

return poolInfo, nil
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
poolInfo := &daos.PoolInfo{}
originalMask := queryMask // Save the original queryMask

// Function to handle the query and return a single RankList
queryAndUpdate := func(option string) error {
// Clear previous options and set new option
queryMask.ClearAll()
queryMask.SetOptions(option)

poolInfo1, err := queryPoolRankLists(poolHdl, queryMask)
if err != nil {
return err
}

switch option {
case daos.PoolQueryOptionEnabledEngines:
poolInfo.EnabledRanks = poolInfo1.EnabledRanks
case daos.PoolQueryOptionDisabledEngines:
poolInfo.DisabledRanks = poolInfo1.DisabledRanks
case daos.PoolQueryOptionDeadEngines:
poolInfo.DeadRanks = poolInfo1.DeadRanks
}
return nil
}

// Preprocess queryMask, select one option for the first query
var firstOption string
if originalMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
firstOption = daos.PoolQueryOptionEnabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
firstOption = daos.PoolQueryOptionDisabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) {
firstOption = daos.PoolQueryOptionDeadEngines
}

// Perform the first query to get basic information
if err := queryAndUpdate(firstOption); err != nil {
return nil, err
}

// Check the original query mask and update fields as needed
queryOptions := []string{
daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDisabledEngines,
daos.PoolQueryOptionDeadEngines,
}

// Process each option sequentially
for _, opt := range queryOptions {
if originalMask.HasOption(opt) && opt != firstOption {
if err := queryAndUpdate(opt); err != nil {
return nil, err
}
}
}

return poolInfo, nil
Expand All @@ -340,15 +402,10 @@ func (cmd *poolQueryCmd) Execute(_ []string) error {
if cmd.HealthOnly {
queryMask = daos.HealthOnlyPoolQueryMask
}
if cmd.ShowEnabledRanks && cmd.ShowDisabledRanks {
return errors.New("show-enabled and show-disabled can't be used at the same time.")
}
if cmd.ShowEnabledRanks {
queryMask.SetOptions(daos.PoolQueryOptionEnabledEngines)
}
if cmd.ShowDisabledRanks {
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
}
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)

cleanup, err := cmd.resolveAndConnect(C.DAOS_PC_RO, nil)
if err != nil {
Expand Down
7 changes: 7 additions & 0 deletions src/control/cmd/daos/pretty/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ func printPoolHealth(out io.Writer, pi *daos.PoolInfo, verbose bool) {
}

var healthStrings []string
if pi.DeadRanks != nil && pi.DeadRanks.Count() > 0 {
deadStr := "Dead"
if verbose {
deadStr += fmt.Sprintf(" %s", pi.DeadRanks)
}
healthStrings = append(healthStrings, deadStr)
}
if pi.DisabledTargets > 0 {
degStr := "Degraded"
if verbose {
Expand Down
6 changes: 5 additions & 1 deletion src/control/cmd/daos/pretty/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,13 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error {
if pi.EnabledRanks != nil && pi.EnabledRanks.Count() > 0 {
fmt.Fprintf(w, "- Enabled ranks: %s\n", pi.EnabledRanks)
}
if pi.DisabledRanks != nil && pi.DisabledRanks.Count() > 0 {
if pi.DisabledRanks.Count() > 0 {
fmt.Fprintf(w, "- Disabled ranks: %s\n", pi.DisabledRanks)
}
if pi.QueryMask.HasOption(daos.PoolQueryOptionDeadEngines) &&
pi.DeadRanks != nil && pi.DeadRanks.Count() > 0 {
fmt.Fprintf(w, "- Dead ranks: %s\n", pi.DeadRanks)
}
if pi.Rebuild != nil {
if pi.Rebuild.Status == 0 {
fmt.Fprintf(w, "- Rebuild %s, %d objs, %d recs\n",
Expand Down
39 changes: 39 additions & 0 deletions src/control/cmd/daos/pretty/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,45 @@ Pool space info:
- Storage tier 1 (NVMe):
Total size: 2 B
Free: 1 B, min:0 B, max:0 B, mean:0 B
`, poolUUID.String()),
},
"normal response; dead ranks": {
pi: &daos.PoolInfo{
QueryMask: daos.HealthOnlyPoolQueryMask,
State: daos.PoolServiceStateDegraded,
UUID: poolUUID,
TotalTargets: 2,
DisabledTargets: 1,
ActiveTargets: 1,
ServiceLeader: 42,
Version: 100,
PoolLayoutVer: 1,
UpgradeLayoutVer: 2,
DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"),
DeadRanks: ranklist.MustCreateRankSet("[2]"),
Rebuild: &daos.PoolRebuildStatus{
State: daos.PoolRebuildStateBusy,
Objects: 42,
Records: 21,
},
TierStats: []*daos.StorageUsageStats{
{
Total: 2,
Free: 1,
},
{
Total: 2,
Free: 1,
},
},
},
expPrintStr: fmt.Sprintf(`
Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=Degraded
Pool layout out of date (1 < 2) -- see `+backtickStr+` for details.
Pool health info:
- Disabled ranks: 0-1,3
- Dead ranks: 2
- Rebuild busy, 42 objs, 21 recs
`, poolUUID.String()),
},
"normal response; disabled ranks": {
Expand Down
13 changes: 3 additions & 10 deletions src/control/cmd/dmg/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -606,9 +606,8 @@ func (cmd *PoolReintegrateCmd) Execute(args []string) error {
// PoolQueryCmd is the struct representing the command to query a DAOS pool.
type PoolQueryCmd struct {
poolCmd
ShowEnabledRanks bool `short:"e" long:"show-enabled" description:"Show engine unique identifiers (ranks) which are enabled"`
ShowDisabledRanks bool `short:"b" long:"show-disabled" description:"Show engine unique identifiers (ranks) which are disabled"`
HealthOnly bool `short:"t" long:"health-only" description:"Only perform pool health related queries"`
ShowEnabledRanks bool `short:"e" long:"show-enabled" description:"Show engine unique identifiers (ranks) which are enabled"`
HealthOnly bool `short:"t" long:"health-only" description:"Only perform pool health related queries"`
}

// Execute is run when PoolQueryCmd subcommand is activated
Expand All @@ -621,16 +620,10 @@ func (cmd *PoolQueryCmd) Execute(args []string) error {
if cmd.HealthOnly {
req.QueryMask = daos.HealthOnlyPoolQueryMask
}
// TODO (DAOS-10250) The two options should not be incompatible (i.e. engine limitation)
if cmd.ShowEnabledRanks && cmd.ShowDisabledRanks {
return errIncompatFlags("show-enabled-ranks", "show-disabled-ranks")
}
if cmd.ShowEnabledRanks {
req.QueryMask.SetOptions(daos.PoolQueryOptionEnabledEngines)
}
if cmd.ShowDisabledRanks {
req.QueryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
}
req.QueryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)

resp, err := control.PoolQuery(cmd.MustLogCtx(), cmd.ctlInvoker, req)
if cmd.JSONOutputEnabled() {
Expand Down
28 changes: 0 additions & 28 deletions src/control/cmd/dmg/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1025,28 +1025,6 @@ func TestPoolCommands(t *testing.T) {
}, " "),
nil,
},
{
"Query pool with UUID and disabled ranks",
"pool query --show-disabled 12345678-1234-1234-1234-1234567890ab",
strings.Join([]string{
printRequest(t, &control.PoolQueryReq{
ID: "12345678-1234-1234-1234-1234567890ab",
QueryMask: setQueryMask(func(qm *daos.PoolQueryMask) { qm.SetOptions(daos.PoolQueryOptionDisabledEngines) }),
}),
}, " "),
nil,
},
{
"Query pool with UUID and disabled ranks",
"pool query -b 12345678-1234-1234-1234-1234567890ab",
strings.Join([]string{
printRequest(t, &control.PoolQueryReq{
ID: "12345678-1234-1234-1234-1234567890ab",
QueryMask: setQueryMask(func(qm *daos.PoolQueryMask) { qm.SetOptions(daos.PoolQueryOptionDisabledEngines) }),
}),
}, " "),
nil,
},
{
"Query pool for health only",
"pool query --health-only 12345678-1234-1234-1234-1234567890ab",
Expand Down Expand Up @@ -1091,12 +1069,6 @@ func TestPoolCommands(t *testing.T) {
"",
fmt.Errorf("Unknown command"),
},
{
"Query pool with incompatible arguments",
"pool query --show-disabled --show-enabled 12345678-1234-1234-1234-1234567890ab",
"",
errors.New("may not be mixed"),
},
})
}

Expand Down
Loading

0 comments on commit 8088155

Please sign in to comment.