From 4689b681d651eaa0078354f82d3bbcdb25b11883 Mon Sep 17 00:00:00 2001 From: Johannes Edmeier Date: Wed, 5 Feb 2025 13:51:58 +0100 Subject: [PATCH] fix: use allowed cpus from target container if a process is limited to a certain cpuset or some of the configured processors are not online, we need to pass the number of workers to stress-ng as otherwise stress-ng will always use the configured number of processors. --- extcontainer/action_stress.go | 4 +- extcontainer/action_stress_limit_helpers.go | 39 +++++++++++++------ .../action_stress_limit_helpers_test.go | 10 ++--- go.mod | 2 +- go.sum | 4 +- 5 files changed, 37 insertions(+), 22 deletions(-) diff --git a/extcontainer/action_stress.go b/extcontainer/action_stress.go index 61a9f9a..22ccf80 100644 --- a/extcontainer/action_stress.go +++ b/extcontainer/action_stress.go @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// SPDX-FileCopyrightText: 2024 Steadybit GmbH +// SPDX-FileCopyrightText: 2025 Steadybit GmbH package extcontainer @@ -92,7 +92,7 @@ func (a *stressAction) Prepare(ctx context.Context, state *StressActionState, re return nil, err } - readAndAdaptToContainerLimits(ctx, processInfo.CGroupPath, &opts) + readAndAdaptToContainerLimits(ctx, processInfo, &opts) state.StressOpts = opts state.ExecutionId = request.ExecutionId diff --git a/extcontainer/action_stress_limit_helpers.go b/extcontainer/action_stress_limit_helpers.go index d737009..dd8a26e 100644 --- a/extcontainer/action_stress_limit_helpers.go +++ b/extcontainer/action_stress_limit_helpers.go @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// SPDX-FileCopyrightText: 2024 Steadybit GmbH +// SPDX-FileCopyrightText: 2025 Steadybit GmbH package extcontainer @@ -8,12 +8,13 @@ import ( "fmt" "github.com/kataras/iris/v12/x/mathx" "github.com/rs/zerolog/log" + "github.com/steadybit/action-kit/go/action_kit_commons/runc" "github.com/steadybit/action-kit/go/action_kit_commons/stress" + "github.com/steadybit/action-kit/go/action_kit_commons/utils" 
"github.com/steadybit/extension-kit/extutil" "math" "os" "path/filepath" - "runtime" "strconv" "strings" ) @@ -21,26 +22,42 @@ import ( var cgroupV1MemUnlimited = (math.MaxInt64 / os.Getpagesize()) * os.Getpagesize() var osFs = osFileSystem{} -func readAndAdaptToContainerLimits(_ context.Context, cGroupPath string, opts *stress.Opts) { +func readAndAdaptToContainerLimits(_ context.Context, p runc.LinuxProcessInfo, opts *stress.Opts) { cpuLimitInMilliCpu := -1 memLimitInBytes := -1 if isCGroupV1() { - cpuLimitInMilliCpu = readCGroupV1CpuLimit(cGroupPath, osFs) - memLimitInBytes = readCGroupV1MemLimit(cGroupPath, osFs) + cpuLimitInMilliCpu = readCGroupV1CpuLimit(p.CGroupPath, osFs) + memLimitInBytes = readCGroupV1MemLimit(p.CGroupPath, osFs) } else { - cpuLimitInMilliCpu = readCGroupV2CpuLimit(cGroupPath, osFs) - memLimitInBytes = readCGroupV2MemLimit(cGroupPath, osFs) + cpuLimitInMilliCpu = readCGroupV2CpuLimit(p.CGroupPath, osFs) + memLimitInBytes = readCGroupV2MemLimit(p.CGroupPath, osFs) } - if opts.CpuWorkers != nil && cpuLimitInMilliCpu >= 0 { - adaptToCpuContainerLimits(cpuLimitInMilliCpu, runtime.NumCPU(), opts) + if opts.CpuWorkers != nil { + if cpuLimitInMilliCpu >= 0 { + adaptToCpuContainerLimits(cpuLimitInMilliCpu, opts) + } else if *opts.CpuWorkers == 0 { + //there might be no limit set but the process to be restricted to certain CPUs or some CPUs programmatically turned off. 
+ //In this case we need to read the list of CPUs allowed for the process and pass its size to the stress command, as stress-ng + //always uses the configured CPUs and not the online CPUs + adaptToAllowedCpus(p.Pid, opts) + } } + if opts.VmWorkers != nil && memLimitInBytes >= 0 { adaptToMemContainerLimits(memLimitInBytes, opts) } } +func adaptToAllowedCpus(pid int, opts *stress.Opts) { + if cpuCount, err := utils.ReadCpusAllowedCount(fmt.Sprintf("/proc/%d/status", pid)); err == nil { + opts.CpuWorkers = extutil.Ptr(cpuCount) + } else { + log.Debug().Err(err).Msg("failed to read cpus_allowed count.") + } +} + func adaptToMemContainerLimits(memLimitInBytes int, opts *stress.Opts) { memConsumptionInPercent := 0 if _, err := fmt.Sscanf(opts.VmBytes, "%d%%", &memConsumptionInPercent); err != nil { @@ -55,9 +72,9 @@ func adaptToMemContainerLimits(memLimitInBytes int, opts *stress.Opts) { log.Info().Msgf("container memory limit is %dK. Starting %d workers with memory consumption of %s each", memLimitInBytes/1024, *opts.VmWorkers, opts.VmBytes) } -func adaptToCpuContainerLimits(cpuLimitInMilliCpu int, cpuCount int, opts *stress.Opts) { +func adaptToCpuContainerLimits(cpuLimitInMilliCpu int, opts *stress.Opts) { cpuLoadInMillis := cpuLimitInMilliCpu * opts.CpuLoad / 100 - log.Debug().Int("cpuCount", cpuCount).Int("cpuLoad", opts.CpuLoad).Int("cpuLoadInMillis", cpuLoadInMillis).Msg("adapting to container cpu limit") + log.Debug().Int("cpuLoad", opts.CpuLoad).Int("cpuLoadInMillis", cpuLoadInMillis).Msg("adapting to container cpu limit") if *opts.CpuWorkers == 0 { // user didn't specify the number of workers. 
we start as many workers as we need to reach the desired cpu consumption diff --git a/extcontainer/action_stress_limit_helpers_test.go b/extcontainer/action_stress_limit_helpers_test.go index 0d989fa..7372e1c 100644 --- a/extcontainer/action_stress_limit_helpers_test.go +++ b/extcontainer/action_stress_limit_helpers_test.go @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2025 Steadybit GmbH + package extcontainer import ( @@ -10,7 +13,6 @@ import ( func Test_adaptToCpuContainerLimits(t *testing.T) { type args struct { cpuLimitInMilliCpu int - cpuCount int givenCpuWorkers int givenCpuLoad int } @@ -27,7 +29,6 @@ func Test_adaptToCpuContainerLimits(t *testing.T) { name: "worker-count not specified, desired cpu load can be handled by one worker", args: args{ cpuLimitInMilliCpu: 200, - cpuCount: 4, givenCpuLoad: 100, givenCpuWorkers: 0, }, @@ -40,7 +41,6 @@ func Test_adaptToCpuContainerLimits(t *testing.T) { name: "worker-count not specified, desired cpu load needs multiple workers", args: args{ cpuLimitInMilliCpu: 1500, - cpuCount: 4, givenCpuLoad: 100, givenCpuWorkers: 0, }, @@ -53,7 +53,6 @@ func Test_adaptToCpuContainerLimits(t *testing.T) { name: "worker-count not specified, desired 60% cpu fits to single worker", args: args{ cpuLimitInMilliCpu: 1500, - cpuCount: 4, givenCpuLoad: 60, givenCpuWorkers: 0, }, @@ -66,7 +65,6 @@ func Test_adaptToCpuContainerLimits(t *testing.T) { name: "worker-count specified, desired 60% cpu is spread across workers", args: args{ cpuLimitInMilliCpu: 1500, - cpuCount: 4, givenCpuLoad: 60, givenCpuWorkers: 3, }, @@ -82,7 +80,7 @@ func Test_adaptToCpuContainerLimits(t *testing.T) { CpuWorkers: &tt.args.givenCpuWorkers, CpuLoad: tt.args.givenCpuLoad, } - adaptToCpuContainerLimits(tt.args.cpuLimitInMilliCpu, tt.args.cpuCount, &opts) + adaptToCpuContainerLimits(tt.args.cpuLimitInMilliCpu, &opts) assert.Equal(t, tt.expected.adaptedCpuWorkers, *opts.CpuWorkers) assert.Equal(t, tt.expected.adaptedCpuLoad, 
opts.CpuLoad) }) diff --git a/go.mod b/go.mod index d5743b8..4118525 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/opencontainers/runtime-spec v1.2.0 github.com/rs/zerolog v1.33.0 github.com/steadybit/action-kit/go/action_kit_api/v2 v2.9.6 - github.com/steadybit/action-kit/go/action_kit_commons v1.2.17 + github.com/steadybit/action-kit/go/action_kit_commons v1.2.18 github.com/steadybit/action-kit/go/action_kit_sdk v1.1.14 github.com/steadybit/action-kit/go/action_kit_test v1.3.2 github.com/steadybit/discovery-kit/go/discovery_kit_api v1.6.0 diff --git a/go.sum b/go.sum index 8a84b37..d9c54a6 100644 --- a/go.sum +++ b/go.sum @@ -257,8 +257,8 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= github.com/steadybit/action-kit/go/action_kit_api/v2 v2.9.6 h1:Qci7Numf66mjCIRo7KDwHUimIxUZzq+GBfyv/1f/QCU= github.com/steadybit/action-kit/go/action_kit_api/v2 v2.9.6/go.mod h1:ycF2RLgRsB8I/jD52aE+dKZKVru1GIEtmkcRcIR3vXk= -github.com/steadybit/action-kit/go/action_kit_commons v1.2.17 h1:NOKvnUA/iZo7mlcgSK+c8EHjxAEYCLSdyk8KVQSXmv8= -github.com/steadybit/action-kit/go/action_kit_commons v1.2.17/go.mod h1:GFLcaf/WluBIqnq+iSCEGkEqxnfB9h1I16IBR1T7kNQ= +github.com/steadybit/action-kit/go/action_kit_commons v1.2.18 h1:zg3c7b/Dl0M/odtNMbbhwVn/EbnvPZO14IPwbFy9WbY= +github.com/steadybit/action-kit/go/action_kit_commons v1.2.18/go.mod h1:GFLcaf/WluBIqnq+iSCEGkEqxnfB9h1I16IBR1T7kNQ= github.com/steadybit/action-kit/go/action_kit_sdk v1.1.14 h1:x94RX+vh9Iyc0tS6BhiSpvknj+xE36AV0Nc3D5Yuub0= github.com/steadybit/action-kit/go/action_kit_sdk v1.1.14/go.mod h1:Tp/klK5b7k+BCvb3JTSDZSNcnpFBaHauhndzOarnMW4= github.com/steadybit/action-kit/go/action_kit_test v1.3.2 h1:DFDznoWEbTGv+fiGYiRaq7tq5es9VTScjrWusRAbS08=