diff --git a/src/control/cmd/daos_agent/config.go b/src/control/cmd/daos_agent/config.go index 1c5ea0d3f5e..3263850df51 100644 --- a/src/control/cmd/daos_agent/config.go +++ b/src/control/cmd/daos_agent/config.go @@ -55,6 +55,7 @@ type Config struct { DisableAutoEvict bool `yaml:"disable_auto_evict,omitempty"` EvictOnStart bool `yaml:"enable_evict_on_start,omitempty"` ExcludeFabricIfaces common.StringSet `yaml:"exclude_fabric_ifaces,omitempty"` + IncludeFabricIfaces common.StringSet `yaml:"include_fabric_ifaces,omitempty"` FabricInterfaces []*NUMAFabricConfig `yaml:"fabric_ifaces,omitempty"` ProviderIdx uint // TODO SRS-31: Enable with multiprovider functionality TelemetryPort int `yaml:"telemetry_port,omitempty"` @@ -62,6 +63,31 @@ type Config struct { TelemetryRetain time.Duration `yaml:"telemetry_retain,omitempty"` } +// Validate performs basic validation of the configuration. +func (c *Config) Validate() error { + if c == nil { + return errors.New("config is nil") + } + + if !daos.SystemNameIsValid(c.SystemName) { + return fmt.Errorf("invalid system name: %s", c.SystemName) + } + + if c.TelemetryRetain > 0 && c.TelemetryPort == 0 { + return errors.New("telemetry_retain requires telemetry_port") + } + + if c.TelemetryEnabled && c.TelemetryPort == 0 { + return errors.New("telemetry_enabled requires telemetry_port") + } + + if len(c.ExcludeFabricIfaces) > 0 && len(c.IncludeFabricIfaces) > 0 { + return errors.New("cannot specify both exclude_fabric_ifaces and include_fabric_ifaces") + } + + return nil +} + // TelemetryExportEnabled returns true if client telemetry export is enabled. func (c *Config) TelemetryExportEnabled() bool { return c.TelemetryPort > 0 @@ -95,16 +121,8 @@ func LoadConfig(cfgPath string) (*Config, error) { return nil, errors.Wrapf(err, "parsing config: %s", cfgPath) } - if !daos.SystemNameIsValid(cfg.SystemName) { - return nil, fmt.Errorf("invalid system name: %s", cfg.SystemName) - } - - if cfg.TelemetryRetain > 0 && cfg.TelemetryPort == 0 { - return nil, errors.New("telemetry_retain requires telemetry_port") - } - - if cfg.TelemetryEnabled && cfg.TelemetryPort == 0 { - return nil, errors.New("telemetry_enabled requires telemetry_port") + if err := cfg.Validate(); err != nil { + return nil, errors.Wrap(err, "agent config validation failed") } return cfg, nil diff --git a/src/control/cmd/daos_agent/config_test.go b/src/control/cmd/daos_agent/config_test.go index d9aee88b7fd..59a51c5709d 100644 --- a/src/control/cmd/daos_agent/config_test.go +++ b/src/control/cmd/daos_agent/config_test.go @@ -88,6 +88,18 @@ transport_config: allow_insecure: true `) + badFilterCfg := test.CreateTestFile(t, dir, ` +name: shire +access_points: ["one:10001", "two:10001"] +port: 4242 +runtime_dir: /tmp/runtime +log_file: /home/frodo/logfile +transport_config: + allow_insecure: true +include_fabric_ifaces: ["ib0"] +exclude_fabric_ifaces: ["ib3"] +`) + for name, tc := range map[string]struct { path string expResult *Config @@ -128,6 +140,10 @@ transport_config: path: badLogMaskCfg, expErr: errors.New("not a valid log level"), }, + "bad filter config": { + path: badFilterCfg, + expErr: errors.New("cannot specify both exclude_fabric_ifaces and include_fabric_ifaces"), + }, "all options": { path: optCfg, expResult: &Config{ diff --git a/src/control/cmd/daos_agent/fabric.go b/src/control/cmd/daos_agent/fabric.go index bca5f6d57dc..513a6849d02 100644 --- a/src/control/cmd/daos_agent/fabric.go +++ b/src/control/cmd/daos_agent/fabric.go @@ -71,6 +71,37 @@ type addrFI interface { Addrs() ([]net.Addr, error) } +type filterMode int + +const ( + // filterModeExclude indicates that devices in the set should be excluded + filterModeExclude filterMode = 0 + // filterModeInclude indicates that only devices in the set should be included + filterModeInclude filterMode = 1 +) + +type deviceFilter struct { + deviceSet common.StringSet + mode filterMode +} + +func (df *deviceFilter) ShouldIgnore(devName string) bool { + if df == nil || df.deviceSet == nil { + return false + } + if df.mode == filterModeExclude { + return df.deviceSet.Has(devName) + } + return !df.deviceSet.Has(devName) +} + +func newDeviceFilter(deviceSet common.StringSet, mode filterMode) *deviceFilter { + return &deviceFilter{ + deviceSet: deviceSet, + mode: mode, + } +} + // NUMAFabric represents a set of fabric interfaces organized by NUMA node. type NUMAFabric struct { log logging.Logger @@ -78,9 +109,9 @@ type NUMAFabric struct { numaMap map[int][]*FabricInterface - currentNumaDevIdx map[int]int // current device idx to use on each NUMA node - currentNUMANode int // current NUMA node to search - ignoreIfaces common.StringSet + currentNumaDevIdx map[int]int // current device idx to use on each NUMA node + currentNUMANode int // current NUMA node to search + ifaceFilter *deviceFilter // set of interface names for filtering getAddrInterface func(name string) (addrFI, error) } @@ -98,12 +129,12 @@ func (n *NUMAFabric) Add(numaNode int, fi *FabricInterface) error { return nil } -// WithIgnoredDevices adds a set of fabric interface names that should be ignored when -// selecting a device. -func (n *NUMAFabric) WithIgnoredDevices(ifaces common.StringSet) *NUMAFabric { - n.ignoreIfaces = ifaces - if len(ifaces) > 0 { - n.log.Tracef("ignoring fabric devices: %s", n.ignoreIfaces) +// WithDeviceFilter adds a set of fabric interface names that should be used for +// filtering when selecting a device. +func (n *NUMAFabric) WithDeviceFilter(filter *deviceFilter) *NUMAFabric { + if filter != nil { + n.ifaceFilter = filter + n.log.Tracef("fabric device filter: %+v", n.ifaceFilter) } return n } @@ -192,8 +223,8 @@ func (n *NUMAFabric) getDeviceFromNUMA(numaNode int, netDevClass hardware.NetDev for checked := 0; checked < n.getNumDevices(numaNode); checked++ { fabricIF := n.getNextDevice(numaNode) - if n.ignoreIfaces.Has(fabricIF.Name) { - n.log.Tracef("device %s: ignored (ignore list %s)", fabricIF, n.ignoreIfaces) + if n.ifaceFilter.ShouldIgnore(fabricIF.Name) { + n.log.Tracef("device %s: ignored (filter: %+v)", fabricIF, n.ifaceFilter) continue } diff --git a/src/control/cmd/daos_agent/fabric_test.go b/src/control/cmd/daos_agent/fabric_test.go index 18517fac6c9..7eca1f78ce4 100644 --- a/src/control/cmd/daos_agent/fabric_test.go +++ b/src/control/cmd/daos_agent/fabric_test.go @@ -184,7 +184,8 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { for name, tc := range map[string]struct { nf *NUMAFabric params *FabricIfaceParams - ignore []string + include []string + exclude []string expErr error expResults []*FabricInterface }{ @@ -661,7 +662,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { }, }, }, - "ignore interface": { + "include interface": { nf: &NUMAFabric{ numaMap: map[int][]*FabricInterface{ 0: { @@ -687,7 +688,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { Provider: "ofi+sockets", DevClass: hardware.Ether, }, - ignore: []string{"t1"}, + include: []string{"t2"}, expResults: []*FabricInterface{ { Name: "t2", @@ -699,7 +700,7 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { }, }, }, - "ignore all interfaces": { + "exclude interface": { nf: &NUMAFabric{ numaMap: map[int][]*FabricInterface{ 0: { @@ -709,6 +710,8 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { DeviceClass: hardware.Ether, Providers: testFabricProviderSet("ofi+sockets"), })[0], + }, + 1: { fabricInterfacesFromHardware(&hardware.FabricInterface{ NetInterfaces: common.NewStringSet("t2"), Name: "t2", @@ -723,8 +726,44 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { Provider: "ofi+sockets", DevClass: hardware.Ether, }, - ignore: []string{"t1", "t2"}, - expErr: errors.New("no suitable fabric interface"), + exclude: []string{"t1"}, + expResults: []*FabricInterface{ + { + Name: "t2", + NetDevClass: hardware.Ether, + }, + { + Name: "t2", + NetDevClass: hardware.Ether, + }, + }, + }, + "exclude all interfaces": { + nf: &NUMAFabric{ + numaMap: map[int][]*FabricInterface{ + 0: { + fabricInterfacesFromHardware(&hardware.FabricInterface{ + NetInterfaces: common.NewStringSet("t1"), + Name: "t1", + DeviceClass: hardware.Ether, + Providers: testFabricProviderSet("ofi+sockets"), + })[0], + fabricInterfacesFromHardware(&hardware.FabricInterface{ + NetInterfaces: common.NewStringSet("t2"), + Name: "t2", + DeviceClass: hardware.Ether, + Providers: testFabricProviderSet("ofi+sockets"), + })[0], + }, + }, + }, + params: &FabricIfaceParams{ + NUMANode: 0, + Provider: "ofi+sockets", + DevClass: hardware.Ether, + }, + exclude: []string{"t1", "t2"}, + expErr: errors.New("no suitable fabric interface"), }, } { t.Run(name, func(t *testing.T) { @@ -736,7 +775,13 @@ func TestAgent_NUMAFabric_GetDevice(t *testing.T) { tc.nf.getAddrInterface = getMockNetInterfaceSuccess } - tc.nf = tc.nf.WithIgnoredDevices(common.NewStringSet(tc.ignore...)) + mode := filterModeExclude + devSet := common.NewStringSet(tc.exclude...) + if len(tc.include) > 0 { + mode = filterModeInclude + devSet = common.NewStringSet(tc.include...) + } + tc.nf = tc.nf.WithDeviceFilter(newDeviceFilter(devSet, mode)) } numDevices := 0 diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go index f93fcb2052d..6caaac1901c 100644 --- a/src/control/cmd/daos_agent/infocache.go +++ b/src/control/cmd/daos_agent/infocache.go @@ -69,13 +69,20 @@ func NewInfoCache(ctx context.Context, log logging.Logger, client control.UnaryI return ic } +func fabricDeviceFilter(cfg *Config) *deviceFilter { + if len(cfg.ExcludeFabricIfaces) > 0 { + return newDeviceFilter(cfg.ExcludeFabricIfaces, filterModeExclude) + } + return newDeviceFilter(cfg.IncludeFabricIfaces, filterModeInclude) +} + func getFabricScanFn(log logging.Logger, cfg *Config, scanner *hardware.FabricScanner) fabricScanFn { return func(ctx context.Context, provs ...string) (*NUMAFabric, error) { fis, err := scanner.Scan(ctx, provs...) if err != nil { return nil, err } - return NUMAFabricFromScan(ctx, log, fis).WithIgnoredDevices(cfg.ExcludeFabricIfaces), nil + return NUMAFabricFromScan(ctx, log, fis).WithDeviceFilter(fabricDeviceFilter(cfg)), nil } } diff --git a/utils/config/daos_agent.yml b/utils/config/daos_agent.yml index 31dc432abaa..bced9a0447b 100644 --- a/utils/config/daos_agent.yml +++ b/utils/config/daos_agent.yml @@ -136,10 +136,15 @@ #cache_expiration: 30 ## Ignore a subset of fabric interfaces when selecting an interface for client -## applications. +## applications. (Mutually exclusive with include). # #exclude_fabric_ifaces: ["lo", "eth1"] +## Conversely, only consider a specific set of fabric interfaces when selecting +## an interface for client applications. (Mutually exclusive with exclude). +# +#include_fabric_ifaces: ["eth0"] + # Manually define the fabric interfaces and domains to be used by the agent, # organized by NUMA node. # If not defined, the agent will automatically detect all fabric interfaces and