Skip to content

Commit

Permalink
DAOS-16477 pool: Rename Suspect state to Dead (#15584)
Browse files Browse the repository at this point in the history
Change the name to more closely reflect the underlying
SWIM status, and reduce user confusion. An engine that
has been marked DEAD by SWIM cannot participate in pool
services, and has most likely already SIGKILL-ed itself.

Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
mjmac authored Dec 13, 2024
1 parent 1392ea8 commit 29ad64d
Show file tree
Hide file tree
Showing 24 changed files with 363 additions and 500 deletions.
8 changes: 4 additions & 4 deletions docs/admin/pool_operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -698,25 +698,25 @@ The example below shows a rebuild in progress and NVMe space allocated.
Rebuild busy, 75 objs, 9722 recs
```

After experiencing significant failures, the pool may retain some suspect
After experiencing significant failures, the pool may retain some "dead"
engines that have been marked as DEAD by the SWIM protocol but were not excluded
from the pool to prevent potential data inconsistency. An administrator can bring
these engines back online by restarting them. The example below illustrates the
system’s status with suspect and disabled engines.
system’s status with dead and disabled engines.

```bash
$ dmg pool query tank -t
```

NB: The --health-only/-t option is required to restrict the query to pool health-related information only.
This is important because suspect ranks may cause commands to hang and timeout so identifying
This is important because dead ranks may cause commands to hang and time out, so identifying
and restarting them is a useful procedure.

```bash
Pool 6f450a68-8c7d-4da9-8900-02691650f6a2, ntarget=8, disabled=2, leader=3, version=4, state=Degraded
Pool health info:
- Disabled ranks: 1
- Suspect ranks: 2
- Dead ranks: 2
- Rebuild busy, 0 objs, 0 recs
```

Expand Down
4 changes: 2 additions & 2 deletions src/control/cmd/daos/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (cmd *healthCheckCmd) Execute([]string) error {
}()

queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionSuspectEngines)
daos.PoolQueryOptionDeadEngines)
if pool.DisabledTargets > 0 {
queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines)
}
Expand All @@ -111,7 +111,7 @@ func (cmd *healthCheckCmd) Execute([]string) error {
}
pool.EnabledRanks = tpi.EnabledRanks
pool.DisabledRanks = tpi.DisabledRanks
pool.SuspectRanks = tpi.SuspectRanks
pool.DeadRanks = tpi.DeadRanks

poolConts, err := listContainers(poolHdl)
if err != nil {
Expand Down
16 changes: 8 additions & 8 deletions src/control/cmd/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (
var rl *C.d_rank_list_t = nil

if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) ||
queryMask.HasOption(daos.PoolQueryOptionSuspectEngines) {
queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
rlPtr = &rl
}

Expand Down Expand Up @@ -330,8 +330,8 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (
if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
poolInfo.DisabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionSuspectEngines) {
poolInfo.SuspectRanks = rs
if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
poolInfo.DeadRanks = rs
}
}

Expand All @@ -357,8 +357,8 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo
poolInfo.EnabledRanks = poolInfo1.EnabledRanks
case daos.PoolQueryOptionDisabledEngines:
poolInfo.DisabledRanks = poolInfo1.DisabledRanks
case daos.PoolQueryOptionSuspectEngines:
poolInfo.SuspectRanks = poolInfo1.SuspectRanks
case daos.PoolQueryOptionDeadEngines:
poolInfo.DeadRanks = poolInfo1.DeadRanks
}
return nil
}
Expand All @@ -369,8 +369,8 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo
firstOption = daos.PoolQueryOptionEnabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
firstOption = daos.PoolQueryOptionDisabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionSuspectEngines) {
firstOption = daos.PoolQueryOptionSuspectEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) {
firstOption = daos.PoolQueryOptionDeadEngines
}

// Perform the first query to get basic information
Expand All @@ -382,7 +382,7 @@ func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.Poo
queryOptions := []string{
daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDisabledEngines,
daos.PoolQueryOptionSuspectEngines,
daos.PoolQueryOptionDeadEngines,
}

// Process each option sequentially
Expand Down
8 changes: 4 additions & 4 deletions src/control/cmd/daos/pretty/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ func printPoolHealth(out io.Writer, pi *daos.PoolInfo, verbose bool) {
}

var healthStrings []string
if pi.SuspectRanks != nil && pi.SuspectRanks.Count() > 0 {
degStr := "Suspect"
if pi.DeadRanks != nil && pi.DeadRanks.Count() > 0 {
deadStr := "Dead"
if verbose {
degStr += fmt.Sprintf(" %s", pi.SuspectRanks)
deadStr += fmt.Sprintf(" %s", pi.DeadRanks)
}
healthStrings = append(healthStrings, degStr)
healthStrings = append(healthStrings, deadStr)
}
if pi.DisabledTargets > 0 {
degStr := "Degraded"
Expand Down
6 changes: 3 additions & 3 deletions src/control/cmd/daos/pretty/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ func PrintPoolInfo(pi *daos.PoolInfo, out io.Writer) error {
if pi.DisabledRanks.Count() > 0 {
fmt.Fprintf(w, "- Disabled ranks: %s\n", pi.DisabledRanks)
}
if pi.QueryMask.HasOption(daos.PoolQueryOptionSuspectEngines) &&
pi.SuspectRanks != nil && pi.SuspectRanks.Count() > 0 {
fmt.Fprintf(w, "- Suspect ranks: %s\n", pi.SuspectRanks)
if pi.QueryMask.HasOption(daos.PoolQueryOptionDeadEngines) &&
pi.DeadRanks != nil && pi.DeadRanks.Count() > 0 {
fmt.Fprintf(w, "- Dead ranks: %s\n", pi.DeadRanks)
}
if pi.Rebuild != nil {
if pi.Rebuild.Status == 0 {
Expand Down
6 changes: 3 additions & 3 deletions src/control/cmd/daos/pretty/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ Pool space info:
Free: 1 B, min:0 B, max:0 B, mean:0 B
`, poolUUID.String()),
},
"normal response; suspect ranks": {
"normal response; dead ranks": {
pi: &daos.PoolInfo{
QueryMask: daos.HealthOnlyPoolQueryMask,
State: daos.PoolServiceStateDegraded,
Expand All @@ -141,7 +141,7 @@ Pool space info:
PoolLayoutVer: 1,
UpgradeLayoutVer: 2,
DisabledRanks: ranklist.MustCreateRankSet("[0,1,3]"),
SuspectRanks: ranklist.MustCreateRankSet("[2]"),
DeadRanks: ranklist.MustCreateRankSet("[2]"),
Rebuild: &daos.PoolRebuildStatus{
State: daos.PoolRebuildStateBusy,
Objects: 42,
Expand All @@ -163,7 +163,7 @@ Pool %s, ntarget=2, disabled=1, leader=42, version=100, state=Degraded
Pool layout out of date (1 < 2) -- see `+backtickStr+` for details.
Pool health info:
- Disabled ranks: 0-1,3
- Suspect ranks: 2
- Dead ranks: 2
- Rebuild busy, 42 objs, 21 recs
`, poolUUID.String()),
},
Expand Down
Loading

0 comments on commit 29ad64d

Please sign in to comment.