Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16791 control: Add include_fabric_ifaces to agent config (#15470) #15513

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions src/control/cmd/daos_agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,39 @@ type Config struct {
DisableAutoEvict bool `yaml:"disable_auto_evict,omitempty"`
EvictOnStart bool `yaml:"enable_evict_on_start,omitempty"`
ExcludeFabricIfaces common.StringSet `yaml:"exclude_fabric_ifaces,omitempty"`
IncludeFabricIfaces common.StringSet `yaml:"include_fabric_ifaces,omitempty"`
FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"`
ProviderIdx uint // TODO SRS-31: Enable with multiprovider functionality
TelemetryPort int `yaml:"telemetry_port,omitempty"`
TelemetryEnabled bool `yaml:"telemetry_enabled,omitempty"`
TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"`
}

// Validate checks the agent configuration for internal consistency and
// returns an error describing the first problem found, or nil when every
// check passes.
//
// The checks run in this order:
//   - the receiver must not be nil;
//   - SystemName must be accepted by daos.SystemNameIsValid;
//   - telemetry_retain requires a non-zero telemetry_port;
//   - telemetry_enabled requires a non-zero telemetry_port;
//   - exclude_fabric_ifaces and include_fabric_ifaces may not both be set.
func (c *Config) Validate() error {
	if c == nil {
		return errors.New("config is nil")
	}

	noTelemetryPort := c.TelemetryPort == 0
	hasExcludes := len(c.ExcludeFabricIfaces) > 0
	hasIncludes := len(c.IncludeFabricIfaces) > 0

	// Cases are evaluated top to bottom, so the first failing check wins.
	switch {
	case !daos.SystemNameIsValid(c.SystemName):
		return fmt.Errorf("invalid system name: %s", c.SystemName)
	case c.TelemetryRetain > 0 && noTelemetryPort:
		return errors.New("telemetry_retain requires telemetry_port")
	case c.TelemetryEnabled && noTelemetryPort:
		return errors.New("telemetry_enabled requires telemetry_port")
	case hasExcludes && hasIncludes:
		return errors.New("cannot specify both exclude_fabric_ifaces and include_fabric_ifaces")
	}

	return nil
}

// TelemetryExportEnabled returns true if client telemetry export is enabled.
func (c *Config) TelemetryExportEnabled() bool {
return c.TelemetryPort > 0
Expand Down Expand Up @@ -95,16 +121,8 @@ func LoadConfig(cfgPath string) (*Config, error) {
return nil, errors.Wrapf(err, "parsing config: %s", cfgPath)
}

if !daos.SystemNameIsValid(cfg.SystemName) {
return nil, fmt.Errorf("invalid system name: %s", cfg.SystemName)
}

if cfg.TelemetryRetain > 0 && cfg.TelemetryPort == 0 {
return nil, errors.New("telemetry_retain requires telemetry_port")
}

if cfg.TelemetryEnabled && cfg.TelemetryPort == 0 {
return nil, errors.New("telemetry_enabled requires telemetry_port")
if err := cfg.Validate(); err != nil {
return nil, errors.Wrap(err, "agent config validation failed")
}

return cfg, nil
Expand Down
16 changes: 16 additions & 0 deletions src/control/cmd/daos_agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,18 @@ transport_config:
allow_insecure: true
`)

badFilterCfg := test.CreateTestFile(t, dir, `
name: shire
access_points: ["one:10001", "two:10001"]
port: 4242
runtime_dir: /tmp/runtime
log_file: /home/frodo/logfile
transport_config:
allow_insecure: true
include_fabric_ifaces: ["ib0"]
exclude_fabric_ifaces: ["ib3"]
`)

for name, tc := range map[string]struct {
path string
expResult *Config
Expand Down Expand Up @@ -128,6 +140,10 @@ transport_config:
path: badLogMaskCfg,
expErr: errors.New("not a valid log level"),
},
"bad filter config": {
path: badFilterCfg,
expErr: errors.New("cannot specify both exclude_fabric_ifaces and include_fabric_ifaces"),
},
"all options": {
path: optCfg,
expResult: &Config{
Expand Down
53 changes: 42 additions & 11 deletions src/control/cmd/daos_agent/fabric.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,47 @@ type addrFI interface {
Addrs() ([]net.Addr, error)
}

// filterMode selects how a deviceFilter interprets its set of device names.
type filterMode int

// The zero value of filterMode is filterModeExclude.
const (
	// filterModeExclude means devices named in the set are excluded.
	filterModeExclude filterMode = iota
	// filterModeInclude means only devices named in the set are included.
	filterModeInclude
)

// deviceFilter pairs a set of device names with the mode that determines
// whether that set is treated as an exclude list or an include list.
type deviceFilter struct {
	// deviceSet holds the device names the filter is evaluated against.
	deviceSet common.StringSet
	// mode selects exclude or include interpretation of deviceSet.
	mode filterMode
}

// ShouldIgnore reports whether the device named devName should be skipped.
//
// A nil filter, or a filter whose device set is nil or empty, ignores nothing.
// In exclude mode a device is ignored when it is in the set. In any other mode
// (include) a device is ignored when it is NOT in the set.
func (df *deviceFilter) ShouldIgnore(devName string) bool {
	// An empty set expresses no filtering intent. Testing the length rather
	// than comparing to nil matters in include mode: an allocated-but-empty
	// set would otherwise cause every device to be ignored.
	if df == nil || len(df.deviceSet) == 0 {
		return false
	}

	listed := df.deviceSet.Has(devName)
	if df.mode == filterModeExclude {
		return listed
	}
	return !listed
}

// newDeviceFilter returns a deviceFilter that evaluates deviceSet using the
// supplied mode. The set is stored as given, not copied.
func newDeviceFilter(deviceSet common.StringSet, mode filterMode) *deviceFilter {
	df := new(deviceFilter)
	df.deviceSet = deviceSet
	df.mode = mode
	return df
}

// NUMAFabric represents a set of fabric interfaces organized by NUMA node.
type NUMAFabric struct {
log logging.Logger
mutex sync.RWMutex

numaMap map[int][]*FabricInterface

currentNumaDevIdx map[int]int // current device idx to use on each NUMA node
currentNUMANode int // current NUMA node to search
ignoreIfaces common.StringSet
currentNumaDevIdx map[int]int // current device idx to use on each NUMA node
currentNUMANode int // current NUMA node to search
ifaceFilter *deviceFilter // set of interface names for filtering

getAddrInterface func(name string) (addrFI, error)
}
Expand All @@ -98,12 +129,12 @@ func (n *NUMAFabric) Add(numaNode int, fi *FabricInterface) error {
return nil
}

// WithIgnoredDevices adds a set of fabric interface names that should be ignored when
// selecting a device.
func (n *NUMAFabric) WithIgnoredDevices(ifaces common.StringSet) *NUMAFabric {
n.ignoreIfaces = ifaces
if len(ifaces) > 0 {
n.log.Tracef("ignoring fabric devices: %s", n.ignoreIfaces)
// WithDeviceFilter adds a set of fabric interface names that should be used for
// filtering when selecting a device.
func (n *NUMAFabric) WithDeviceFilter(filter *deviceFilter) *NUMAFabric {
if filter != nil {
n.ifaceFilter = filter
n.log.Tracef("fabric device filter: %+v", n.ifaceFilter)
}
return n
}
Expand Down Expand Up @@ -192,8 +223,8 @@ func (n *NUMAFabric) getDeviceFromNUMA(numaNode int, netDevClass hardware.NetDev
for checked := 0; checked < n.getNumDevices(numaNode); checked++ {
fabricIF := n.getNextDevice(numaNode)

if n.ignoreIfaces.Has(fabricIF.Name) {
n.log.Tracef("device %s: ignored (ignore list %s)", fabricIF, n.ignoreIfaces)
if n.ifaceFilter.ShouldIgnore(fabricIF.Name) {
n.log.Tracef("device %s: ignored (filter: %+v)", fabricIF, n.ifaceFilter)
continue
}

Expand Down
59 changes: 52 additions & 7 deletions src/control/cmd/daos_agent/fabric_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
for name, tc := range map[string]struct {
nf *NUMAFabric
params *FabricIfaceParams
ignore []string
include []string
exclude []string
expErr error
expResults []*FabricInterface
}{
Expand Down Expand Up @@ -661,7 +662,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
},
},
},
"ignore interface": {
"include interface": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
Expand All @@ -687,7 +688,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
Provider: "ofi+sockets",
DevClass: hardware.Ether,
},
ignore: []string{"t1"},
include: []string{"t2"},
expResults: []*FabricInterface{
{
Name: "t2",
Expand All @@ -699,7 +700,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
},
},
},
"ignore all interfaces": {
"exclude interface": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
Expand All @@ -709,6 +710,8 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
DeviceClass: hardware.Ether,
Providers: testFabricProviderSet("ofi+sockets"),
})[0],
},
1: {
fabricInterfacesFromHardware(&hardware.FabricInterface{
NetInterfaces: common.NewStringSet("t2"),
Name: "t2",
Expand All @@ -723,8 +726,44 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
Provider: "ofi+sockets",
DevClass: hardware.Ether,
},
ignore: []string{"t1", "t2"},
expErr: errors.New("no suitable fabric interface"),
exclude: []string{"t1"},
expResults: []*FabricInterface{
{
Name: "t2",
NetDevClass: hardware.Ether,
},
{
Name: "t2",
NetDevClass: hardware.Ether,
},
},
},
"exclude all interfaces": {
nf: &NUMAFabric{
numaMap: map[int][]*FabricInterface{
0: {
fabricInterfacesFromHardware(&hardware.FabricInterface{
NetInterfaces: common.NewStringSet("t1"),
Name: "t1",
DeviceClass: hardware.Ether,
Providers: testFabricProviderSet("ofi+sockets"),
})[0],
fabricInterfacesFromHardware(&hardware.FabricInterface{
NetInterfaces: common.NewStringSet("t2"),
Name: "t2",
DeviceClass: hardware.Ether,
Providers: testFabricProviderSet("ofi+sockets"),
})[0],
},
},
},
params: &FabricIfaceParams{
NUMANode: 0,
Provider: "ofi+sockets",
DevClass: hardware.Ether,
},
exclude: []string{"t1", "t2"},
expErr: errors.New("no suitable fabric interface"),
},
} {
t.Run(name, func(t *testing.T) {
Expand All @@ -736,7 +775,13 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) {
tc.nf.getAddrInterface = getMockNetInterfaceSuccess
}

tc.nf = tc.nf.WithIgnoredDevices(common.NewStringSet(tc.ignore...))
mode := filterModeExclude
devSet := common.NewStringSet(tc.exclude...)
if len(tc.include) > 0 {
mode = filterModeInclude
devSet = common.NewStringSet(tc.include...)
}
tc.nf = tc.nf.WithDeviceFilter(newDeviceFilter(devSet, mode))
}

numDevices := 0
Expand Down
9 changes: 8 additions & 1 deletion src/control/cmd/daos_agent/infocache.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,20 @@ func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryI
return ic
}

// fabricDeviceFilter builds the fabric interface filter described by cfg.
//
// A non-empty exclude_fabric_ifaces list yields an exclude-mode filter;
// otherwise a non-empty include_fabric_ifaces list yields an include-mode
// filter. If cfg is nil or neither list has entries, nil is returned, meaning
// no filtering is applied.
func fabricDeviceFilter(cfg *Config) *deviceFilter {
	if cfg == nil {
		return nil
	}

	switch {
	case len(cfg.ExcludeFabricIfaces) > 0:
		return newDeviceFilter(cfg.ExcludeFabricIfaces, filterModeExclude)
	case len(cfg.IncludeFabricIfaces) > 0:
		return newDeviceFilter(cfg.IncludeFabricIfaces, filterModeInclude)
	default:
		// Avoid constructing an include-mode filter around an empty set,
		// which could be read as "include nothing".
		return nil
	}
}

func getFabricScanFn(log logging.Logger, cfg *Config, scanner *hardware.FabricScanner) fabricScanFn {
return func(ctx context.Context, provs ...string) (*NUMAFabric, error) {
fis, err := scanner.Scan(ctx, provs...)
if err != nil {
return nil, err
}
return NUMAFabricFromScan(ctx, log, fis).WithIgnoredDevices(cfg.ExcludeFabricIfaces), nil
return NUMAFabricFromScan(ctx, log, fis).WithDeviceFilter(fabricDeviceFilter(cfg)), nil
}
}

Expand Down
7 changes: 6 additions & 1 deletion utils/config/daos_agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,15 @@
#cache_expiration: 30

## Ignore a subset of fabric interfaces when selecting an interface for client
## applications.
## applications. (Mutually exclusive with include_fabric_ifaces).
#
#exclude_fabric_ifaces: ["lo", "eth1"]

## Conversely, only consider a specific set of fabric interfaces when selecting
## an interface for client applications. (Mutually exclusive with
## exclude_fabric_ifaces).
#
#include_fabric_ifaces: ["eth0"]

# Manually define the fabric interfaces and domains to be used by the agent,
# organized by NUMA node.
# If not defined, the agent will automatically detect all fabric interfaces and
Expand Down
Loading