From 9bc1a5366edd1ec199ff5923702ebc611ea2a617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 26 Dec 2024 13:48:47 +0100 Subject: [PATCH 01/14] Add support for GPU feature --- api/datadoghq/v2alpha1/const.go | 7 + api/datadoghq/v2alpha1/datadogagent_types.go | 16 ++ .../v2alpha1/zz_generated.deepcopy.go | 30 ++++ .../v2alpha1/zz_generated.openapi.go | 8 +- .../bases/v1/datadoghq.com_datadogagents.yaml | 30 ++++ .../datadoghq.com_datadogagents_v2alpha1.json | 30 ++++ docs/configuration.v2alpha1.md | 2 + examples/datadogagent/datadog-agent-all.yaml | 2 + .../controller/datadogagent/controller.go | 1 + .../defaults/datadogagent_default.go | 8 + .../defaults/datadogagent_default_test.go | 46 ++++++ .../datadogagent/feature/gpu/envvar.go | 9 + .../datadogagent/feature/gpu/feature.go | 154 ++++++++++++++++++ .../datadogagent/feature/gpu/feature_test.go | 148 +++++++++++++++++ .../controller/datadogagent/feature/ids.go | 2 + .../datadogagent/feature/test/factory_test.go | 19 ++- .../datadogagent_controller_test.go | 5 + internal/controller/testutils/agent.go | 13 ++ pkg/testutils/builder.go | 15 ++ 19 files changed, 543 insertions(+), 2 deletions(-) create mode 100644 internal/controller/datadogagent/feature/gpu/envvar.go create mode 100644 internal/controller/datadogagent/feature/gpu/feature.go create mode 100644 internal/controller/datadogagent/feature/gpu/feature_test.go diff --git a/api/datadoghq/v2alpha1/const.go b/api/datadoghq/v2alpha1/const.go index ef52fb674..c6662fa2a 100644 --- a/api/datadoghq/v2alpha1/const.go +++ b/api/datadoghq/v2alpha1/const.go @@ -78,6 +78,9 @@ const ( KubeServicesAndEndpointsListeners = "kube_services kube_endpoints" EndpointsChecksConfigProvider = "endpointschecks" ClusterAndEndpointsConfigProviders = "clusterchecks endpointschecks" + + // DefaultGPUMonitoringRuntimeClass default runtime class for GPU pods + DefaultGPUMonitoringRuntimeClass = "nvidia" ) // Labels @@ -201,6 +204,10 @@ const ( 
FIPSProxyCustomConfigFileName = "datadog-fips-proxy.cfg" FIPSProxyCustomConfigMapName = "%s-fips-config" FIPSProxyCustomConfigMountPath = "/etc/datadog-fips-proxy/datadog-fips-proxy.cfg" + + NVIDIADevicesMountPath = "/var/run/nvidia-container-devices/all" + NVIDIADevicesVolumeName = "nvidia-devices" + DevNullPath = "/dev/null" // host path mounted at NVIDIADevicesMountPath in the container; the mount is only a "signal" telling the nvidia runtime to expose the nvidia devices ) // Field paths diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 857767b44..1756ba844 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -82,6 +82,8 @@ type DatadogFeatures struct { SBOM *SBOMFeatureConfig `json:"sbom,omitempty"` // ServiceDiscovery ServiceDiscovery *ServiceDiscoveryFeatureConfig `json:"serviceDiscovery,omitempty"` + // GPU monitoring + GPUMonitoring *GPUMonitoringFeatureConfig `json:"gpu,omitempty"` // Cluster-level features @@ -498,6 +500,20 @@ type ServiceDiscoveryFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` } +// GPUMonitoringFeatureConfig contains the GPU monitoring configuration. +type GPUMonitoringFeatureConfig struct { + // Enabled enables GPU monitoring. + // Default: false + // +optional + Enabled *bool `json:"enabled,omitempty"` + + // PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + // If left empty, the runtime class will not be set. + // Default: nvidia + // +optional + PodRuntimeClassName *string `json:"requiredRuntimeClassName"` +} + // DogstatsdFeatureConfig contains the Dogstatsd configuration parameters.
// +k8s:openapi-gen=true type DogstatsdFeatureConfig struct { diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index a36ac9778..30bdf8e78 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1232,6 +1232,11 @@ func (in *DatadogFeatures) DeepCopyInto(out *DatadogFeatures) { *out = new(ServiceDiscoveryFeatureConfig) (*in).DeepCopyInto(*out) } + if in.GPUMonitoring != nil { + in, out := &in.GPUMonitoring, &out.GPUMonitoring + *out = new(GPUMonitoringFeatureConfig) + (*in).DeepCopyInto(*out) + } if in.EventCollection != nil { in, out := &in.EventCollection, &out.EventCollection *out = new(EventCollectionFeatureConfig) @@ -1545,6 +1550,31 @@ func (in *FIPSConfig) DeepCopy() *FIPSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GPUMonitoringFeatureConfig) DeepCopyInto(out *GPUMonitoringFeatureConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.PodRuntimeClassName != nil { + in, out := &in.PodRuntimeClassName, &out.PodRuntimeClassName + *out = new(string) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUMonitoringFeatureConfig. +func (in *GPUMonitoringFeatureConfig) DeepCopy() *GPUMonitoringFeatureConfig { + if in == nil { + return nil + } + out := new(GPUMonitoringFeatureConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *GlobalConfig) DeepCopyInto(out *GlobalConfig) { *out = *in diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index 9dfdc495f..ae577cbf1 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -675,6 +675,12 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig"), }, }, + "gpu": { + SchemaProps: spec.SchemaProps{ + Description: "GPU monitoring", + Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUMonitoringFeatureConfig"), + }, + }, "eventCollection": { SchemaProps: spec.SchemaProps{ Description: "EventCollection configuration.", @@ -733,7 +739,7 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R }, }, Dependencies: []string{ - "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUMonitoringFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, } } diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index 679f7ce12..f862243d3 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -1019,6 +1019,21 @@ spec: Default: false type: boolean type: object + gpu: + description: GPU monitoring + properties: + enabled: + description: |- + Enabled enables GPU monitoring. + Default: false + type: boolean + requiredRuntimeClassName: + description: |- + PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + If left empty, the runtime class will not be set. + Default: nvidia + type: string + type: object helmCheck: description: HelmCheck configuration. properties: @@ -7883,6 +7898,21 @@ spec: Default: false type: boolean type: object + gpu: + description: GPU monitoring + properties: + enabled: + description: |- + Enabled enables GPU monitoring. + Default: false + type: boolean + requiredRuntimeClassName: + description: |- + PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. + If left empty, the runtime class will not be set. + Default: nvidia + type: string + type: object helmCheck: description: HelmCheck configuration. 
properties: diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 02d401ef4..62a36b6d3 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -1065,6 +1065,21 @@ }, "type": "object" }, + "gpu": { + "additionalProperties": false, + "description": "GPU monitoring", + "properties": { + "enabled": { + "description": "Enabled enables GPU monitoring.\nDefault: false", + "type": "boolean" + }, + "requiredRuntimeClassName": { + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "type": "string" + } + }, + "type": "object" + }, "helmCheck": { "additionalProperties": false, "description": "HelmCheck configuration.", @@ -7871,6 +7886,21 @@ }, "type": "object" }, + "gpu": { + "additionalProperties": false, + "description": "GPU monitoring", + "properties": { + "enabled": { + "description": "Enabled enables GPU monitoring.\nDefault: false", + "type": "boolean" + }, + "requiredRuntimeClassName": { + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "type": "string" + } + }, + "type": "object" + }, "helmCheck": { "additionalProperties": false, "description": "HelmCheck configuration.", diff --git a/docs/configuration.v2alpha1.md b/docs/configuration.v2alpha1.md index 1e37d145c..40a37b5b0 100644 --- a/docs/configuration.v2alpha1.md +++ b/docs/configuration.v2alpha1.md @@ -111,6 +111,8 @@ spec: | features.externalMetricsServer.registerAPIService | RegisterAPIService registers the External Metrics endpoint as an APIService Default: true | | features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics 
CRD (allowing one to scale on arbitrary Datadog metric queries). Default: true | | features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false | +| features.gpu.enabled | Enables GPU monitoring. Default: false | +| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class will not be set. Default: nvidia | | features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false | | features.helmCheck.enabled | Enables the Helm check. Default: false | | features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). 
Default: {} | diff --git a/examples/datadogagent/datadog-agent-all.yaml b/examples/datadogagent/datadog-agent-all.yaml index ea0cff3c0..dd786ed54 100644 --- a/examples/datadogagent/datadog-agent-all.yaml +++ b/examples/datadogagent/datadog-agent-all.yaml @@ -47,6 +47,8 @@ spec: enabled: true serviceDiscovery: enabled: true + gpu: + enabled: true eventCollection: collectKubernetesEvents: true orchestratorExplorer: diff --git a/internal/controller/datadogagent/controller.go b/internal/controller/datadogagent/controller.go index 7c29bdd31..6eb7f2e65 100644 --- a/internal/controller/datadogagent/controller.go +++ b/internal/controller/datadogagent/controller.go @@ -34,6 +34,7 @@ import ( _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/eventcollection" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/externalmetrics" + _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/helmcheck" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/kubernetesstatecore" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer" diff --git a/internal/controller/datadogagent/defaults/datadogagent_default.go b/internal/controller/datadogagent/defaults/datadogagent_default.go index 1ebe56970..7b43579ba 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default.go @@ -37,6 +37,8 @@ const ( defaultEBPFCheckEnabled bool = false + defaultGPUMonitoringEnabled bool = false + defaultServiceDiscoveryEnabled bool = false defaultAPMEnabled bool = true @@ -265,6 +267,12 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) { } 
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled) + // GPU monitoring feature + if ddaSpec.Features.GPUMonitoring == nil { + ddaSpec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{} + } + apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPUMonitoring.Enabled, defaultGPUMonitoringEnabled) + // APM Feature // APM is enabled by default if ddaSpec.Features.APM == nil { diff --git a/internal/controller/datadogagent/defaults/datadogagent_default_test.go b/internal/controller/datadogagent/defaults/datadogagent_default_test.go index 98d61fd6b..52c9cb446 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default_test.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default_test.go @@ -198,6 +198,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -333,6 +336,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueFalse), }, @@ -423,6 +429,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: 
apiutils.NewBoolPointer(valueFalse), }, @@ -549,6 +558,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -696,6 +708,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -838,6 +853,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(valueTrue), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -980,6 +998,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1131,6 +1152,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + 
GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1273,6 +1297,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1418,6 +1445,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1602,6 +1632,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, CSPM: &v2alpha1.CSPMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultCSPMEnabled), }, @@ -1717,6 +1750,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ 
-1860,6 +1896,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -1979,6 +2018,7 @@ func Test_defaultFeatures(t *testing.T) { OOMKill: &v2alpha1.OOMKillFeatureConfig{}, TCPQueueLength: &v2alpha1.TCPQueueLengthFeatureConfig{}, EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{}, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{}, ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{}, APM: &v2alpha1.APMFeatureConfig{}, ASM: &v2alpha1.ASMFeatureConfig{}, @@ -2024,6 +2064,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ @@ -2169,6 +2212,9 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), + }, APM: &v2alpha1.APMFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultAPMEnabled), HostPortConfig: &v2alpha1.HostPortConfig{ diff --git a/internal/controller/datadogagent/feature/gpu/envvar.go b/internal/controller/datadogagent/feature/gpu/envvar.go new file mode 100644 index 000000000..5c8a0b96f --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/envvar.go @@ -0,0 +1,9 @@ +// 
Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package gpu + +const DDEnableGPUMonitoringEnvVar = "DD_GPU_MONITORING_ENABLED" +const NVIDIAVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES" diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go new file mode 100644 index 000000000..1d4f8f9ff --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -0,0 +1,154 @@ +package gpu + +import ( + corev1 "k8s.io/api/core/v1" + + apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" + apiutils "github.com/DataDog/datadog-operator/api/utils" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume" +) + +func init() { + if err := feature.Register(feature.GPUMonitoringType, buildFeature); err != nil { + panic(err) + } +} + +func buildFeature(*feature.Options) feature.Feature { + return &gpuMonitoringFeature{} +} + +type gpuMonitoringFeature struct { + podRuntimeClassName string +} + +// ID returns the ID of the Feature +func (f *gpuMonitoringFeature) ID() feature.IDType { + return feature.GPUMonitoringType +} + +// Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. 
+func (f *gpuMonitoringFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.RequiredComponents) { + if dda.Spec.Features == nil || dda.Spec.Features.GPUMonitoring == nil || !apiutils.BoolValue(dda.Spec.Features.GPUMonitoring.Enabled) { + return reqComp + } + + reqComp.Agent = feature.RequiredComponent{ + IsRequired: apiutils.NewBoolPointer(true), + Containers: []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, + } + + if dda.Spec.Features.GPUMonitoring.PodRuntimeClassName == nil { + f.podRuntimeClassName = v2alpha1.DefaultGPUMonitoringRuntimeClass + } else { + f.podRuntimeClassName = *dda.Spec.Features.GPUMonitoring.PodRuntimeClassName + } + + return reqComp +} + +// ManageDependencies allows a feature to manage its dependencies. +// Feature's dependencies should be added in the store. +func (f *gpuMonitoringFeature) ManageDependencies(feature.ResourceManagers, feature.RequiredComponents) error { + return nil +} + +// ManageClusterAgent allows a feature to configure the ClusterAgent's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuMonitoringFeature) ManageClusterAgent(feature.PodTemplateManagers) error { + return nil +} + +func configureSystemProbe(managers feature.PodTemplateManagers) { + // annotations + managers.Annotation().AddAnnotation(v2alpha1.SystemProbeAppArmorAnnotationKey, v2alpha1.SystemProbeAppArmorAnnotationValue) + + // security context capabilities + managers.SecurityContext().AddCapabilitiesToContainer(agent.DefaultCapabilitiesForSystemProbe(), apicommon.SystemProbeContainerName) + + // socket volume mount (needs write perms for the system probe container but not the others) + procdirVol, procdirMount := volume.GetVolumes(v2alpha1.ProcdirVolumeName, v2alpha1.ProcdirHostPath, v2alpha1.ProcdirMountPath, true) + managers.VolumeMount().AddVolumeMountToContainer(&procdirMount, apicommon.SystemProbeContainerName) + managers.Volume().AddVolume(&procdirVol) + + socketVol, socketVolMount := volume.GetVolumesEmptyDir(v2alpha1.SystemProbeSocketVolumeName, v2alpha1.SystemProbeSocketVolumePath, false) + managers.Volume().AddVolume(&socketVol) + managers.VolumeMount().AddVolumeMountToContainer(&socketVolMount, apicommon.SystemProbeContainerName) + + _, socketVolMountReadOnly := volume.GetVolumesEmptyDir(v2alpha1.SystemProbeSocketVolumeName, v2alpha1.SystemProbeSocketVolumePath, true) + managers.VolumeMount().AddVolumeMountToContainer(&socketVolMountReadOnly, apicommon.CoreAgentContainerName) + + socketEnvVar := &corev1.EnvVar{ + Name: v2alpha1.DDSystemProbeSocket, + Value: v2alpha1.DefaultSystemProbeSocketPath, + } + + managers.EnvVar().AddEnvVarToContainer(apicommon.CoreAgentContainerName, socketEnvVar) + managers.EnvVar().AddEnvVarToContainer(apicommon.SystemProbeContainerName, socketEnvVar) +} + +// ManageNodeAgent allows a feature to configure the Node Agent's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManagers, _ string) error { + configureSystemProbe(managers) + + // env var to enable the GPU module + enableEnvVar := &corev1.EnvVar{ + Name: DDEnableGPUMonitoringEnvVar, + Value: "true", + } + + // Both in the core agent and the system probe + managers.EnvVar().AddEnvVarToContainers([]apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, enableEnvVar) + + // The agent check does not need to be manually enabled, the init config container will + // check if GPU monitoring is enabled and will enable the check automatically (see + // Dockerfiles/agent/cont-init.d/60-sysprobe-check.sh in the datadog-agent repo). + managers.EnvVar().AddEnvVarToInitContainer(apicommon.InitConfigContainerName, enableEnvVar) + + // Now we need to add the NVIDIA_VISIBLE_DEVICES env var to both agents again so + // that the nvidia runtime can expose the GPU devices in the container + nvidiaVisibleDevicesEnvVar := &corev1.EnvVar{ + Name: NVIDIAVisibleDevicesEnvVar, + Value: "all", + } + + managers.EnvVar().AddEnvVarToContainers([]apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, nvidiaVisibleDevicesEnvVar) + + // Some nvidia-container-runtime setups ignore the NVIDIA_VISIBLE_DEVICES + // env variable. This is usually configured with the options + // accept-nvidia-visible-devices-envvar-when-unprivileged = true + // accept-nvidia-visible-devices-as-volume-mounts = true + // in the NVIDIA container runtime config. In this case, we need to mount the + // /var/run/nvidia-container-devices/all directory into the container, so that + // the nvidia-container-runtime can see that we want to use all GPUs.
+ devicesVol, devicesMount := volume.GetVolumes(v2alpha1.NVIDIADevicesVolumeName, v2alpha1.DevNullPath, v2alpha1.NVIDIADevicesMountPath, true) + managers.Volume().AddVolume(&devicesVol) + managers.VolumeMount().AddVolumeMountToContainers(&devicesMount, []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}) + + // Configure the runtime class for the container + if f.podRuntimeClassName != "" { + managers.PodTemplateSpec().Spec.RuntimeClassName = &f.podRuntimeClassName + } + + // Note: we don't need to mount the NVML library, as it's mounted automatically + // by the nvidia-container-runtime. + + return nil +} + +// ManageSingleContainerNodeAgent allows a feature to configure the Agent container for the Node Agent's corev1.PodTemplateSpec +// if SingleContainerStrategy is enabled and can be used with the configured feature set. +// It should do nothing if the feature doesn't need to configure it. +func (f *gpuMonitoringFeature) ManageSingleContainerNodeAgent(feature.PodTemplateManagers, string) error { + return nil +} + +// ManageClusterChecksRunner allows a feature to configure the ClusterChecksRunner's corev1.PodTemplateSpec +// It should do nothing if the feature doesn't need to configure it. 
+func (f *gpuMonitoringFeature) ManageClusterChecksRunner(feature.PodTemplateManagers) error { + return nil +} diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go new file mode 100644 index 000000000..82a291dbb --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -0,0 +1,148 @@ +package gpu + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + + apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common" + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" + apiutils "github.com/DataDog/datadog-operator/api/utils" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/fake" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test" +) + +func Test_GPUMonitoringFeature_Configure(t *testing.T) { + ddaGPUMonitoringDisabled := v2alpha1.DatadogAgent{ + Spec: v2alpha1.DatadogAgentSpec{ + Features: &v2alpha1.DatadogFeatures{ + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(false), + }, + }, + }, + } + ddaGPUMonitoringEnabled := ddaGPUMonitoringDisabled.DeepCopy() + ddaGPUMonitoringEnabled.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(true) + + GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + mgr := mgrInterface.(*fake.PodTemplateManagers) + + // check security context capabilities + sysProbeCapabilities := mgr.SecurityContextMgr.CapabilitiesByC[apicommon.SystemProbeContainerName] + assert.True( + t, + apiutils.IsEqualStruct(sysProbeCapabilities, agent.DefaultCapabilitiesForSystemProbe()), + "System Probe security context capabilities \ndiff = 
%s", + cmp.Diff(sysProbeCapabilities, agent.DefaultCapabilitiesForSystemProbe()), + ) + + // check volume mounts + wantCoreAgentVolMounts := []corev1.VolumeMount{ + { + Name: v2alpha1.SystemProbeSocketVolumeName, + MountPath: v2alpha1.SystemProbeSocketVolumePath, + ReadOnly: true, + }, + { + Name: v2alpha1.NVIDIADevicesVolumeName, + MountPath: v2alpha1.NVIDIADevicesMountPath, + ReadOnly: true, + }, + } + + wantSystemProbeVolMounts := []corev1.VolumeMount{ + { + Name: v2alpha1.ProcdirVolumeName, + MountPath: v2alpha1.ProcdirMountPath, + ReadOnly: true, + }, + { + Name: v2alpha1.SystemProbeSocketVolumeName, + MountPath: v2alpha1.SystemProbeSocketVolumePath, + ReadOnly: false, + }, + { + Name: v2alpha1.NVIDIADevicesVolumeName, + MountPath: v2alpha1.NVIDIADevicesMountPath, + ReadOnly: true, + }, + } + + coreAgentVolumeMounts := mgr.VolumeMountMgr.VolumeMountsByC[apicommon.CoreAgentContainerName] + assert.True(t, apiutils.IsEqualStruct(coreAgentVolumeMounts, wantCoreAgentVolMounts), "Core agent volume mounts \ndiff = %s", cmp.Diff(coreAgentVolumeMounts, wantCoreAgentVolMounts)) + + systemProbeVolumeMounts := mgr.VolumeMountMgr.VolumeMountsByC[apicommon.SystemProbeContainerName] + assert.True(t, apiutils.IsEqualStruct(systemProbeVolumeMounts, wantSystemProbeVolMounts), "System Probe volume mounts \ndiff = %s", cmp.Diff(systemProbeVolumeMounts, wantSystemProbeVolMounts)) + + // check volumes + wantVolumes := []corev1.Volume{ + { + Name: v2alpha1.ProcdirVolumeName, + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: v2alpha1.ProcdirHostPath, + }, + }, + }, + { + Name: v2alpha1.SystemProbeSocketVolumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: v2alpha1.NVIDIADevicesVolumeName, + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: v2alpha1.DevNullPath, + }, + }, + }, + } + + volumes := mgr.VolumeMgr.Volumes + assert.True(t, 
apiutils.IsEqualStruct(volumes, wantVolumes), "Volumes \ndiff = %s", cmp.Diff(volumes, wantVolumes)) + + // check env vars + wantEnvVars := []*corev1.EnvVar{ + { + Name: v2alpha1.DDSystemProbeSocket, + Value: v2alpha1.DefaultSystemProbeSocketPath, + }, + { + Name: DDEnableGPUMonitoringEnvVar, + Value: "true", + }, + { + Name: NVIDIAVisibleDevicesEnvVar, + Value: "all", + }, + } + agentEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.CoreAgentContainerName] + assert.True(t, apiutils.IsEqualStruct(agentEnvVars, wantEnvVars), "Agent envvars \ndiff = %s", cmp.Diff(agentEnvVars, wantEnvVars)) + + systemProbeEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.SystemProbeContainerName] + assert.True(t, apiutils.IsEqualStruct(systemProbeEnvVars, wantEnvVars), "System Probe envvars \ndiff = %s", cmp.Diff(systemProbeEnvVars, wantEnvVars)) + } + + tests := test.FeatureTestSuite{ + { + Name: "gpu monitoring not enabled", + DDA: ddaGPUMonitoringDisabled.DeepCopy(), + WantConfigure: false, + }, + { + Name: "gpu monitoring enabled", + DDA: ddaGPUMonitoringEnabled, + WantConfigure: true, + Agent: test.NewDefaultComponentTest().WithWantFunc(GPUMonitoringAgentNodeWantFunc), + }, + } + + tests.Run(t, buildFeature) +} diff --git a/internal/controller/datadogagent/feature/ids.go b/internal/controller/datadogagent/feature/ids.go index b395d720d..ecd4365e3 100644 --- a/internal/controller/datadogagent/feature/ids.go +++ b/internal/controller/datadogagent/feature/ids.go @@ -71,4 +71,6 @@ const ( DummyIDType = "dummy" // ServiceDiscoveryType service discovery feature. ServiceDiscoveryType = "service_discovery" + // GPUMonitoringType monitoring feature. 
+ GPUMonitoringType = "gpu" ) diff --git a/internal/controller/datadogagent/feature/test/factory_test.go b/internal/controller/datadogagent/feature/test/factory_test.go index 8d0491d17..5b6ed0e58 100644 --- a/internal/controller/datadogagent/feature/test/factory_test.go +++ b/internal/controller/datadogagent/feature/test/factory_test.go @@ -12,6 +12,7 @@ import ( _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/apm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/cspm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/enabledefault" + _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/gpu" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/livecontainer" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/npm" _ "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/otelcollector" @@ -295,6 +296,22 @@ func TestBuilder(t *testing.T) { common.AgentDataPlaneContainerName: false, }, }, + { + name: "GPU monitoring enabled, 4 agents", + dda: testutils.NewDatadogAgentBuilder(). + WithGPUMonitoringEnabled(true). 
+ BuildWithDefaults(), + wantAgentContainer: map[common.AgentContainerName]bool{ + common.UnprivilegedSingleAgentContainerName: false, + common.CoreAgentContainerName: true, + common.ProcessAgentContainerName: true, + common.TraceAgentContainerName: true, + common.SystemProbeContainerName: true, + common.SecurityAgentContainerName: false, + common.OtelAgent: false, + common.AgentDataPlaneContainerName: false, + }, + }, } for _, tt := range tests { @@ -304,7 +321,7 @@ func TestBuilder(t *testing.T) { assert.True(t, *requiredComponents.Agent.IsRequired) for name, required := range tt.wantAgentContainer { - assert.Equal(t, required, wantAgentContainer(name, requiredComponents), "Check", name) + assert.Equal(t, required, wantAgentContainer(name, requiredComponents), "container %s", name) } }) } diff --git a/internal/controller/datadogagent_controller_test.go b/internal/controller/datadogagent_controller_test.go index b8442043a..97d7e6def 100644 --- a/internal/controller/datadogagent_controller_test.go +++ b/internal/controller/datadogagent_controller_test.go @@ -168,6 +168,11 @@ var _ = Describe("V2 Controller - DatadogAgent Deployment", func() { "with overrides", testFunction(testutils.NewDatadogAgentWithOverrides(namespace, "with-overrides")), ) + + Context( + "with GPU monitoring", + testFunction(testutils.NewDatadogAgentWithGPUMonitoring(namespace, "with-gpu-monitoring")), + ) }) func testFunction(agent v2alpha1.DatadogAgent) func() { diff --git a/internal/controller/testutils/agent.go b/internal/controller/testutils/agent.go index 89244d725..83a9422cc 100644 --- a/internal/controller/testutils/agent.go +++ b/internal/controller/testutils/agent.go @@ -351,6 +351,19 @@ func NewDatadogAgentWithUSM(namespace string, name string) v2alpha1.DatadogAgent ) } +// NewDatadogAgentWithGPUMonitoring returns an agent with GPU monitoring enabled +func NewDatadogAgentWithGPUMonitoring(namespace string, name string) v2alpha1.DatadogAgent { + return newDatadogAgentWithFeatures( + 
namespace, + name, + &v2alpha1.DatadogFeatures{ + GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + Enabled: apiutils.NewBoolPointer(true), + }, + }, + ) +} + // NewDatadogAgentWithGlobalConfigSettings returns an agent with some global // settings set func NewDatadogAgentWithGlobalConfigSettings(namespace string, name string) v2alpha1.DatadogAgent { diff --git a/pkg/testutils/builder.go b/pkg/testutils/builder.go index 28a2f4e61..dae0f5c40 100644 --- a/pkg/testutils/builder.go +++ b/pkg/testutils/builder.go @@ -949,3 +949,18 @@ func (builder *DatadogAgentBuilder) WithFIPS(fipsConfig v2alpha1.FIPSConfig) *Da builder.datadogAgent.Spec.Global.FIPS = &fipsConfig return builder } + + +// GPU + +func (builder *DatadogAgentBuilder) initGPUMonitoring() { + if builder.datadogAgent.Spec.Features.GPUMonitoring == nil { + builder.datadogAgent.Spec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{} + } +} + +func (builder *DatadogAgentBuilder) WithGPUMonitoringEnabled(enabled bool) *DatadogAgentBuilder { + builder.initGPUMonitoring() + builder.datadogAgent.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(enabled) + return builder +} From dd0dd9c9c87d1ff30be2d0519787144085bc023d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Tue, 7 Jan 2025 13:40:30 +0000 Subject: [PATCH 02/14] Add tests for runtime class changes --- .../datadogagent/feature/gpu/feature_test.go | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 82a291dbb..01190254e 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -16,6 +16,8 @@ import ( "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test" ) +const alternativeRuntimeClass = "nvidia-like" + func 
Test_GPUMonitoringFeature_Configure(t *testing.T) { ddaGPUMonitoringDisabled := v2alpha1.DatadogAgent{ Spec: v2alpha1.DatadogAgentSpec{ @@ -29,7 +31,13 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { ddaGPUMonitoringEnabled := ddaGPUMonitoringDisabled.DeepCopy() ddaGPUMonitoringEnabled.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(true) - GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + ddaGPUMonitoringEnabledAlternativeRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy() + ddaGPUMonitoringEnabledAlternativeRuntimeClass.Spec.Features.GPUMonitoring.PodRuntimeClassName = apiutils.NewStringPointer(alternativeRuntimeClass) + + ddaGPUMonitoringEnabledANoRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy() + ddaGPUMonitoringEnabledANoRuntimeClass.Spec.Features.GPUMonitoring.PodRuntimeClassName = apiutils.NewStringPointer("") + + GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers, expectedRuntimeClass string) { mgr := mgrInterface.(*fake.PodTemplateManagers) // check security context capabilities @@ -128,6 +136,13 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { systemProbeEnvVars := mgr.EnvVarMgr.EnvVarsByC[apicommon.SystemProbeContainerName] assert.True(t, apiutils.IsEqualStruct(systemProbeEnvVars, wantEnvVars), "System Probe envvars \ndiff = %s", cmp.Diff(systemProbeEnvVars, wantEnvVars)) + + // Check runtime class + if expectedRuntimeClass == "" { + assert.Nil(t, mgr.PodTemplateSpec().Spec.RuntimeClassName) + } else { + assert.Equal(t, expectedRuntimeClass, *mgr.PodTemplateSpec().Spec.RuntimeClassName) + } } tests := test.FeatureTestSuite{ @@ -140,7 +155,20 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { Name: "gpu monitoring enabled", DDA: ddaGPUMonitoringEnabled, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(GPUMonitoringAgentNodeWantFunc), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t 
testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) }), + }, + { + Name: "gpu monitoring enabled, alternative runtime class", + DDA: ddaGPUMonitoringEnabledAlternativeRuntimeClass, + WantConfigure: true, + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass) }), + }, + + { + Name: "gpu monitoring enabled, no runtime class", + DDA: ddaGPUMonitoringEnabledANoRuntimeClass, + WantConfigure: true, + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "") }), }, } From 7d40ddb8b101a38375467bf57397c9149d71b693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Wed, 8 Jan 2025 10:07:31 +0000 Subject: [PATCH 03/14] Documentation --- config/manager/kustomization.yaml | 4 ++-- .../datadogagent/feature/gpu/feature.go | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 58b7c4148..8c4744223 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -2,7 +2,7 @@ resources: - manager.yaml images: - name: controller - newName: gcr.io/datadoghq/operator - newTag: 1.11.1 + newName: 601427279990.dkr.ecr.us-east-1.amazonaws.com/guillermo.julian/sandbox + newTag: operator apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index 1d4f8f9ff..eccee6a9d 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -22,6 +22,9 @@ func buildFeature(*feature.Options) feature.Feature { } type 
gpuMonitoringFeature struct { + // podRuntimeClassName is the value to set in the runtimeClassName + // configuration of the agent pod. If this is empty, the runtimeClassName + // will not be changed. podRuntimeClassName string } @@ -42,8 +45,11 @@ func (f *gpuMonitoringFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp fe } if dda.Spec.Features.GPUMonitoring.PodRuntimeClassName == nil { + // Configuration option not set, so revert to the default f.podRuntimeClassName = v2alpha1.DefaultGPUMonitoringRuntimeClass } else { + // Configuration option set, use the value. Note that here the value might be an empty + // string, which tells us to not change the runtime class. f.podRuntimeClassName = *dda.Spec.Features.GPUMonitoring.PodRuntimeClassName } @@ -122,7 +128,7 @@ func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManag // env variable. This is usually configured with the options // accept-nvidia-visible-devices-envvar-when-unprivileged = true // accept-nvidia-visible-devices-as-volume-mounts = true - // in the NVIDIA conatiner runtime config. In this case, we need to mount the + // in the NVIDIA container runtime config. In this case, we need to mount the // /var/run/nvidia-container-devices/all directory into the container, so that // the nvidia-container-runtime can see that we want to use all GPUs. devicesVol, devicesMount := volume.GetVolumes(v2alpha1.NVIDIADevicesVolumeName, v2alpha1.DevNullPath, v2alpha1.NVIDIADevicesMountPath, true) @@ -134,8 +140,11 @@ func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManag managers.PodTemplateSpec().Spec.RuntimeClassName = &f.podRuntimeClassName } - // Note: we don't need to mount the NVML library, as it's mounted automatically - // by the nvidia-container-runtime. + // Note: we don't need to mount the NVML library, as it's mounted + // automatically by the nvidia-container-runtime. 
However, if needed, we + // could add a config option for that and mount that in the agent and + // system-probe folders, and then set the correct configuration option so + // that the binaries can find the library. return nil } From da4ab242822e62b7af778e5226fb476290c1cce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 9 Jan 2025 11:02:55 +0000 Subject: [PATCH 04/14] Update docs --- api/datadoghq/v2alpha1/datadogagent_types.go | 2 +- config/crd/bases/v1/datadoghq.com_datadogagents.yaml | 4 ++-- .../v1/datadoghq.com_datadogagents_v2alpha1.json | 4 ++-- docs/configuration.v2alpha1.md | 2 +- .../datadogagent/feature/gpu/feature_test.go | 12 +++++++++--- pkg/testutils/builder.go | 1 - 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 1756ba844..232b653ed 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -508,7 +508,7 @@ type GPUMonitoringFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` // PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - // If left empty, the runtime class will not be set. + // If left empty, the runtime class is not set. // Default: nvidia // +optional PodRuntimeClassName *string `json:"requiredRuntimeClassName"` diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index f862243d3..effa55c25 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -1030,7 +1030,7 @@ spec: requiredRuntimeClassName: description: |- PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - If left empty, the runtime class will not be set. + If left empty, the runtime class is not set. 
Default: nvidia type: string type: object @@ -7909,7 +7909,7 @@ spec: requiredRuntimeClassName: description: |- PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - If left empty, the runtime class will not be set. + If left empty, the runtime class is not set. Default: nvidia type: string type: object diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index 62a36b6d3..d5922be92 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -1074,7 +1074,7 @@ "type": "boolean" }, "requiredRuntimeClassName": { - "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class is not set.\nDefault: nvidia", "type": "string" } }, @@ -7895,7 +7895,7 @@ "type": "boolean" }, "requiredRuntimeClassName": { - "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia", + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class is not set.\nDefault: nvidia", "type": "string" } }, diff --git a/docs/configuration.v2alpha1.md b/docs/configuration.v2alpha1.md index 40a37b5b0..7a73f5c46 100644 --- a/docs/configuration.v2alpha1.md +++ b/docs/configuration.v2alpha1.md @@ -112,7 +112,7 @@ spec: | features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics CRD (allowing one to scale on arbitrary Datadog metric queries). 
Default: true | | features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false | | features.gpu.enabled | Enables GPU monitoring. Default: false | -| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class will not be set. Default: nvidia | +| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class is not set. Default: nvidia | | features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false | | features.helmCheck.enabled | Enables the Helm check. Default: false | | features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). 
Default: {} | diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 01190254e..65224cd7d 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -155,20 +155,26 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { Name: "gpu monitoring enabled", DDA: ddaGPUMonitoringEnabled, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) }), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) + }), }, { Name: "gpu monitoring enabled, alternative runtime class", DDA: ddaGPUMonitoringEnabledAlternativeRuntimeClass, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass) }), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, alternativeRuntimeClass) + }), }, { Name: "gpu monitoring enabled, no runtime class", DDA: ddaGPUMonitoringEnabledANoRuntimeClass, WantConfigure: true, - Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "") }), + Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, "") + }), }, } diff --git a/pkg/testutils/builder.go b/pkg/testutils/builder.go index 
dae0f5c40..1a0c21107 100644 --- a/pkg/testutils/builder.go +++ b/pkg/testutils/builder.go @@ -950,7 +950,6 @@ func (builder *DatadogAgentBuilder) WithFIPS(fipsConfig v2alpha1.FIPSConfig) *Da return builder } - // GPU func (builder *DatadogAgentBuilder) initGPUMonitoring() { From f5b5e32b539808c89885aff2f5e8a824250de68a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Juli=C3=A1n?= Date: Fri, 24 Jan 2025 11:03:36 +0100 Subject: [PATCH 05/14] Update api/datadoghq/v2alpha1/datadogagent_types.go Co-authored-by: Celene --- api/datadoghq/v2alpha1/datadogagent_types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index 232b653ed..baae6a970 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -508,7 +508,7 @@ type GPUMonitoringFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` // PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - // If left empty, the runtime class is not set. + // If the value is an empty string, the runtime class is not set. 
// Default: nvidia // +optional PodRuntimeClassName *string `json:"requiredRuntimeClassName"` From dbf001949300b50ca50a04206ebfaf3439a6b3c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 24 Jan 2025 11:04:21 +0100 Subject: [PATCH 06/14] Remove debug changes --- config/manager/kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 8c4744223..58b7c4148 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -2,7 +2,7 @@ resources: - manager.yaml images: - name: controller - newName: 601427279990.dkr.ecr.us-east-1.amazonaws.com/guillermo.julian/sandbox - newTag: operator + newName: gcr.io/datadoghq/operator + newTag: 1.11.1 apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization From 0c78f33447db843d531c3aa554d46379b9b3da47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 24 Jan 2025 11:07:24 +0100 Subject: [PATCH 07/14] Move const variables to gpu package --- api/datadoghq/v2alpha1/const.go | 4 ---- .../controller/datadogagent/feature/gpu/const.go | 10 ++++++++++ .../controller/datadogagent/feature/gpu/feature.go | 2 +- .../datadogagent/feature/gpu/feature_test.go | 12 ++++++------ 4 files changed, 17 insertions(+), 11 deletions(-) create mode 100644 internal/controller/datadogagent/feature/gpu/const.go diff --git a/api/datadoghq/v2alpha1/const.go b/api/datadoghq/v2alpha1/const.go index c6662fa2a..168f032bf 100644 --- a/api/datadoghq/v2alpha1/const.go +++ b/api/datadoghq/v2alpha1/const.go @@ -204,10 +204,6 @@ const ( FIPSProxyCustomConfigFileName = "datadog-fips-proxy.cfg" FIPSProxyCustomConfigMapName = "%s-fips-config" FIPSProxyCustomConfigMountPath = "/etc/datadog-fips-proxy/datadog-fips-proxy.cfg" - - NVIDIADevicesMountPath = "/var/run/nvidia-container-devices/all" - NVIDIADevicesVolumeName = "nvidia-devices" - DevNullPath = "/dev/null" // used to mount 
the NVIDIADevicesHostPath to /dev/null in the container, it's just used as a "signal" to the nvidia runtime to use the nvidia devices ) // Field paths diff --git a/internal/controller/datadogagent/feature/gpu/const.go b/internal/controller/datadogagent/feature/gpu/const.go new file mode 100644 index 000000000..4c5fa4444 --- /dev/null +++ b/internal/controller/datadogagent/feature/gpu/const.go @@ -0,0 +1,10 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package gpu + +const nvidiaDevicesMountPath = "/var/run/nvidia-container-devices/all" +const nvidiaDevicesVolumeName = "nvidia-devices" +const devNullPath = "/dev/null" // used to mount the NVIDIADevicesHostPath to /dev/null in the container, it's just used as a "signal" to the nvidia runtime to use the nvidia devices diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index eccee6a9d..a7fbdb524 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -131,7 +131,7 @@ func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManag // in the NVIDIA container runtime config. In this case, we need to mount the // /var/run/nvidia-container-devices/all directory into the container, so that // the nvidia-container-runtime can see that we want to use all GPUs. 
- devicesVol, devicesMount := volume.GetVolumes(v2alpha1.NVIDIADevicesVolumeName, v2alpha1.DevNullPath, v2alpha1.NVIDIADevicesMountPath, true) + devicesVol, devicesMount := volume.GetVolumes(nvidiaDevicesVolumeName, devNullPath, nvidiaDevicesMountPath, true) managers.Volume().AddVolume(&devicesVol) managers.VolumeMount().AddVolumeMountToContainers(&devicesMount, []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}) diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 65224cd7d..6c51d1109 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -57,8 +57,8 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { ReadOnly: true, }, { - Name: v2alpha1.NVIDIADevicesVolumeName, - MountPath: v2alpha1.NVIDIADevicesMountPath, + Name: nvidiaDevicesVolumeName, + MountPath: nvidiaDevicesMountPath, ReadOnly: true, }, } @@ -75,8 +75,8 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { ReadOnly: false, }, { - Name: v2alpha1.NVIDIADevicesVolumeName, - MountPath: v2alpha1.NVIDIADevicesMountPath, + Name: nvidiaDevicesVolumeName, + MountPath: nvidiaDevicesMountPath, ReadOnly: true, }, } @@ -104,10 +104,10 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { }, }, { - Name: v2alpha1.NVIDIADevicesVolumeName, + Name: nvidiaDevicesVolumeName, VolumeSource: corev1.VolumeSource{ HostPath: &corev1.HostPathVolumeSource{ - Path: v2alpha1.DevNullPath, + Path: devNullPath, }, }, }, From 3fcc699ce149dc6e417e829a31948e0405503149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 24 Jan 2025 11:08:25 +0100 Subject: [PATCH 08/14] GPUMonitoringType -> GPUIDType --- internal/controller/datadogagent/feature/gpu/feature.go | 4 ++-- internal/controller/datadogagent/feature/ids.go | 4 ++-- 2 files changed, 4 insertions(+), 4 
deletions(-) diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index a7fbdb524..58984d7a5 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -12,7 +12,7 @@ import ( ) func init() { - if err := feature.Register(feature.GPUMonitoringType, buildFeature); err != nil { + if err := feature.Register(feature.GPUIDType, buildFeature); err != nil { panic(err) } } @@ -30,7 +30,7 @@ type gpuMonitoringFeature struct { // ID returns the ID of the Feature func (f *gpuMonitoringFeature) ID() feature.IDType { - return feature.GPUMonitoringType + return feature.GPUIDType } // Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. diff --git a/internal/controller/datadogagent/feature/ids.go b/internal/controller/datadogagent/feature/ids.go index ecd4365e3..a73e5b82b 100644 --- a/internal/controller/datadogagent/feature/ids.go +++ b/internal/controller/datadogagent/feature/ids.go @@ -71,6 +71,6 @@ const ( DummyIDType = "dummy" // ServiceDiscoveryType service discovery feature. ServiceDiscoveryType = "service_discovery" - // GPUMonitoringType monitoring feature. - GPUMonitoringType = "gpu" + // GPUIDType monitoring feature. 
+ GPUIDType = "gpu" ) From 76a3ac303483cae0a0df935af7706079a1f0ddd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 24 Jan 2025 11:08:51 +0100 Subject: [PATCH 09/14] Rename gpuMonitoringFeature to gpuFeature --- .../datadogagent/feature/gpu/feature.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index 58984d7a5..a607bb498 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -18,10 +18,10 @@ func init() { } func buildFeature(*feature.Options) feature.Feature { - return &gpuMonitoringFeature{} + return &gpuFeature{} } -type gpuMonitoringFeature struct { +type gpuFeature struct { // podRuntimeClassName is the value to set in the runtimeClassName // configuration of the agent pod. If this is empty, the runtimeClassName // will not be changed. @@ -29,12 +29,12 @@ type gpuMonitoringFeature struct { } // ID returns the ID of the Feature -func (f *gpuMonitoringFeature) ID() feature.IDType { +func (f *gpuFeature) ID() feature.IDType { return feature.GPUIDType } // Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. -func (f *gpuMonitoringFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.RequiredComponents) { +func (f *gpuFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.RequiredComponents) { if dda.Spec.Features == nil || dda.Spec.Features.GPUMonitoring == nil || !apiutils.BoolValue(dda.Spec.Features.GPUMonitoring.Enabled) { return reqComp } @@ -58,13 +58,13 @@ func (f *gpuMonitoringFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp fe // ManageDependencies allows a feature to manage its dependencies. // Feature's dependencies should be added in the store. 
-func (f *gpuMonitoringFeature) ManageDependencies(feature.ResourceManagers, feature.RequiredComponents) error { +func (f *gpuFeature) ManageDependencies(feature.ResourceManagers, feature.RequiredComponents) error { return nil } // ManageClusterAgent allows a feature to configure the ClusterAgent's corev1.PodTemplateSpec // It should do nothing if the feature doesn't need to configure it. -func (f *gpuMonitoringFeature) ManageClusterAgent(feature.PodTemplateManagers) error { +func (f *gpuFeature) ManageClusterAgent(feature.PodTemplateManagers) error { return nil } @@ -98,7 +98,7 @@ func configureSystemProbe(managers feature.PodTemplateManagers) { // ManageNodeAgent allows a feature to configure the Node Agent's corev1.PodTemplateSpec // It should do nothing if the feature doesn't need to configure it. -func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManagers, _ string) error { +func (f *gpuFeature) ManageNodeAgent(managers feature.PodTemplateManagers, _ string) error { configureSystemProbe(managers) // env var to enable the GPU module @@ -152,12 +152,12 @@ func (f *gpuMonitoringFeature) ManageNodeAgent(managers feature.PodTemplateManag // ManageSingleContainerNodeAgent allows a feature to configure the Agent container for the Node Agent's corev1.PodTemplateSpec // if SingleContainerStrategy is enabled and can be used with the configured feature set. // It should do nothing if the feature doesn't need to configure it. -func (f *gpuMonitoringFeature) ManageSingleContainerNodeAgent(feature.PodTemplateManagers, string) error { +func (f *gpuFeature) ManageSingleContainerNodeAgent(feature.PodTemplateManagers, string) error { return nil } // ManageClusterChecksRunner allows a feature to configure the ClusterChecksRunner's corev1.PodTemplateSpec // It should do nothing if the feature doesn't need to configure it. 
-func (f *gpuMonitoringFeature) ManageClusterChecksRunner(feature.PodTemplateManagers) error { +func (f *gpuFeature) ManageClusterChecksRunner(feature.PodTemplateManagers) error { return nil } From 8d4c14e6453aecd60d816fe72dad15df6f751db4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 24 Jan 2025 11:09:10 +0100 Subject: [PATCH 10/14] Apply suggestion --- internal/controller/datadogagent/feature/gpu/feature.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index a607bb498..255c183e0 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -135,7 +135,7 @@ func (f *gpuFeature) ManageNodeAgent(managers feature.PodTemplateManagers, _ str managers.Volume().AddVolume(&devicesVol) managers.VolumeMount().AddVolumeMountToContainers(&devicesMount, []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}) - // Configure the runtime class for the container + // Configure the runtime class for the pod if f.podRuntimeClassName != "" { managers.PodTemplateSpec().Spec.RuntimeClassName = &f.podRuntimeClassName } From 9706adc4ebabcb72fae2cddc48b2c860773e8b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 30 Jan 2025 11:23:17 +0100 Subject: [PATCH 11/14] defaultGPURuntimeClass --- api/datadoghq/v2alpha1/const.go | 3 --- internal/controller/datadogagent/feature/gpu/const.go | 11 ++++++++--- .../controller/datadogagent/feature/gpu/feature.go | 2 +- .../datadogagent/feature/gpu/feature_test.go | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/api/datadoghq/v2alpha1/const.go b/api/datadoghq/v2alpha1/const.go index 168f032bf..ef52fb674 100644 --- a/api/datadoghq/v2alpha1/const.go +++ b/api/datadoghq/v2alpha1/const.go @@ -78,9 +78,6 @@ const ( 
KubeServicesAndEndpointsListeners = "kube_services kube_endpoints" EndpointsChecksConfigProvider = "endpointschecks" ClusterAndEndpointsConfigProviders = "clusterchecks endpointschecks" - - // DefaultGPUMonitoringRuntimeClass default runtime class for GPU pods - DefaultGPUMonitoringRuntimeClass = "nvidia" ) // Labels diff --git a/internal/controller/datadogagent/feature/gpu/const.go b/internal/controller/datadogagent/feature/gpu/const.go index 4c5fa4444..dbc41ee9f 100644 --- a/internal/controller/datadogagent/feature/gpu/const.go +++ b/internal/controller/datadogagent/feature/gpu/const.go @@ -5,6 +5,11 @@ package gpu -const nvidiaDevicesMountPath = "/var/run/nvidia-container-devices/all" -const nvidiaDevicesVolumeName = "nvidia-devices" -const devNullPath = "/dev/null" // used to mount the NVIDIADevicesHostPath to /dev/null in the container, it's just used as a "signal" to the nvidia runtime to use the nvidia devices +const ( + nvidiaDevicesMountPath = "/var/run/nvidia-container-devices/all" + nvidiaDevicesVolumeName = "nvidia-devices" + devNullPath = "/dev/null" // used to mount the NVIDIADevicesHostPath to /dev/null in the container, it's just used as a "signal" to the nvidia runtime to use the nvidia devices + + // defaultGPURuntimeClass default runtime class for GPU pods + defaultGPURuntimeClass = "nvidia" +) diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index 255c183e0..b0f712d69 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -46,7 +46,7 @@ func (f *gpuFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.Requ if dda.Spec.Features.GPUMonitoring.PodRuntimeClassName == nil { // Configuration option not set, so revert to the default - f.podRuntimeClassName = v2alpha1.DefaultGPUMonitoringRuntimeClass + f.podRuntimeClassName = defaultGPURuntimeClass } else { // Configuration option set, use 
the value. Note that here the value might be an empty // string, which tells us to not change the runtime class. diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 6c51d1109..7928a6a20 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -156,7 +156,7 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { DDA: ddaGPUMonitoringEnabled, WantConfigure: true, Agent: test.NewDefaultComponentTest().WithWantFunc(func(t testing.TB, mgrInterface feature.PodTemplateManagers) { - GPUMonitoringAgentNodeWantFunc(t, mgrInterface, v2alpha1.DefaultGPUMonitoringRuntimeClass) + GPUMonitoringAgentNodeWantFunc(t, mgrInterface, defaultGPURuntimeClass) }), }, { From cc0009068e83482c0374db3bcf37f37c3e1895d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 30 Jan 2025 11:23:48 +0100 Subject: [PATCH 12/14] Rename GPUMonitoringConfig --- api/datadoghq/v2alpha1/datadogagent_types.go | 6 ++-- .../v2alpha1/zz_generated.deepcopy.go | 12 +++---- .../defaults/datadogagent_default.go | 6 ++-- .../defaults/datadogagent_default_test.go | 32 +++++++++---------- .../datadogagent/feature/gpu/feature.go | 6 ++-- .../datadogagent/feature/gpu/feature_test.go | 8 ++--- internal/controller/testutils/agent.go | 2 +- pkg/testutils/builder.go | 6 ++-- 8 files changed, 39 insertions(+), 39 deletions(-) diff --git a/api/datadoghq/v2alpha1/datadogagent_types.go b/api/datadoghq/v2alpha1/datadogagent_types.go index baae6a970..461f98fdb 100644 --- a/api/datadoghq/v2alpha1/datadogagent_types.go +++ b/api/datadoghq/v2alpha1/datadogagent_types.go @@ -83,7 +83,7 @@ type DatadogFeatures struct { // ServiceDiscovery ServiceDiscovery *ServiceDiscoveryFeatureConfig `json:"serviceDiscovery,omitempty"` // GPU monitoring - GPUMonitoring *GPUMonitoringFeatureConfig `json:"gpu,omitempty"` + GPU *GPUFeatureConfig 
`json:"gpu,omitempty"` // Cluster-level features @@ -500,8 +500,8 @@ type ServiceDiscoveryFeatureConfig struct { Enabled *bool `json:"enabled,omitempty"` } -// GPUMonitoringFeatureConfig contains the GPU monitoring configuration. -type GPUMonitoringFeatureConfig struct { +// GPUFeatureConfig contains the GPU monitoring configuration. +type GPUFeatureConfig struct { // Enabled enables GPU monitoring. // Default: false // +optional diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index 30bdf8e78..11fb7edbc 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1232,9 +1232,9 @@ func (in *DatadogFeatures) DeepCopyInto(out *DatadogFeatures) { *out = new(ServiceDiscoveryFeatureConfig) (*in).DeepCopyInto(*out) } - if in.GPUMonitoring != nil { - in, out := &in.GPUMonitoring, &out.GPUMonitoring - *out = new(GPUMonitoringFeatureConfig) + if in.GPU != nil { + in, out := &in.GPU, &out.GPU + *out = new(GPUFeatureConfig) (*in).DeepCopyInto(*out) } if in.EventCollection != nil { @@ -1551,7 +1551,7 @@ func (in *FIPSConfig) DeepCopy() *FIPSConfig { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GPUMonitoringFeatureConfig) DeepCopyInto(out *GPUMonitoringFeatureConfig) { +func (in *GPUFeatureConfig) DeepCopyInto(out *GPUFeatureConfig) { *out = *in if in.Enabled != nil { in, out := &in.Enabled, &out.Enabled @@ -1566,11 +1566,11 @@ func (in *GPUMonitoringFeatureConfig) DeepCopyInto(out *GPUMonitoringFeatureConf } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUMonitoringFeatureConfig. 
-func (in *GPUMonitoringFeatureConfig) DeepCopy() *GPUMonitoringFeatureConfig { +func (in *GPUFeatureConfig) DeepCopy() *GPUFeatureConfig { if in == nil { return nil } - out := new(GPUMonitoringFeatureConfig) + out := new(GPUFeatureConfig) in.DeepCopyInto(out) return out } diff --git a/internal/controller/datadogagent/defaults/datadogagent_default.go b/internal/controller/datadogagent/defaults/datadogagent_default.go index 7b43579ba..26e118105 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default.go @@ -268,10 +268,10 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) { apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled) // GPU monitoring feature - if ddaSpec.Features.GPUMonitoring == nil { - ddaSpec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{} + if ddaSpec.Features.GPU == nil { + ddaSpec.Features.GPU = &v2alpha1.GPUFeatureConfig{} } - apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPUMonitoring.Enabled, defaultGPUMonitoringEnabled) + apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPU.Enabled, defaultGPUMonitoringEnabled) // APM Feature // APM is enabled by default diff --git a/internal/controller/datadogagent/defaults/datadogagent_default_test.go b/internal/controller/datadogagent/defaults/datadogagent_default_test.go index 52c9cb446..35978cfa6 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default_test.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default_test.go @@ -198,7 +198,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ 
-336,7 +336,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -429,7 +429,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -558,7 +558,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -708,7 +708,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -853,7 +853,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -998,7 +998,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: 
&v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -1152,7 +1152,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -1297,7 +1297,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -1445,7 +1445,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -1632,7 +1632,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, CSPM: &v2alpha1.CSPMFeatureConfig{ @@ -1750,7 +1750,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: 
apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -1896,7 +1896,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -2018,7 +2018,7 @@ func Test_defaultFeatures(t *testing.T) { OOMKill: &v2alpha1.OOMKillFeatureConfig{}, TCPQueueLength: &v2alpha1.TCPQueueLengthFeatureConfig{}, EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{}, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{}, + GPU: &v2alpha1.GPUFeatureConfig{}, ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{}, APM: &v2alpha1.APMFeatureConfig{}, ASM: &v2alpha1.ASMFeatureConfig{}, @@ -2064,7 +2064,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ @@ -2212,7 +2212,7 @@ func Test_defaultFeatures(t *testing.T) { ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultServiceDiscoveryEnabled), }, - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(defaultGPUMonitoringEnabled), }, APM: &v2alpha1.APMFeatureConfig{ diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index b0f712d69..8aebca212 100644 --- 
a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -35,7 +35,7 @@ func (f *gpuFeature) ID() feature.IDType { // Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. func (f *gpuFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.RequiredComponents) { - if dda.Spec.Features == nil || dda.Spec.Features.GPUMonitoring == nil || !apiutils.BoolValue(dda.Spec.Features.GPUMonitoring.Enabled) { + if dda.Spec.Features == nil || dda.Spec.Features.GPU == nil || !apiutils.BoolValue(dda.Spec.Features.GPU.Enabled) { return reqComp } @@ -44,13 +44,13 @@ func (f *gpuFeature) Configure(dda *v2alpha1.DatadogAgent) (reqComp feature.Requ Containers: []apicommon.AgentContainerName{apicommon.CoreAgentContainerName, apicommon.SystemProbeContainerName}, } - if dda.Spec.Features.GPUMonitoring.PodRuntimeClassName == nil { + if dda.Spec.Features.GPU.PodRuntimeClassName == nil { // Configuration option not set, so revert to the default f.podRuntimeClassName = defaultGPURuntimeClass } else { // Configuration option set, use the value. Note that here the value might be an empty // string, which tells us to not change the runtime class. 
- f.podRuntimeClassName = *dda.Spec.Features.GPUMonitoring.PodRuntimeClassName + f.podRuntimeClassName = *dda.Spec.Features.GPU.PodRuntimeClassName } return reqComp diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 7928a6a20..2202bf7dc 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -22,20 +22,20 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { ddaGPUMonitoringDisabled := v2alpha1.DatadogAgent{ Spec: v2alpha1.DatadogAgentSpec{ Features: &v2alpha1.DatadogFeatures{ - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(false), }, }, }, } ddaGPUMonitoringEnabled := ddaGPUMonitoringDisabled.DeepCopy() - ddaGPUMonitoringEnabled.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(true) + ddaGPUMonitoringEnabled.Spec.Features.GPU.Enabled = apiutils.NewBoolPointer(true) ddaGPUMonitoringEnabledAlternativeRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy() - ddaGPUMonitoringEnabledAlternativeRuntimeClass.Spec.Features.GPUMonitoring.PodRuntimeClassName = apiutils.NewStringPointer(alternativeRuntimeClass) + ddaGPUMonitoringEnabledAlternativeRuntimeClass.Spec.Features.GPU.PodRuntimeClassName = apiutils.NewStringPointer(alternativeRuntimeClass) ddaGPUMonitoringEnabledANoRuntimeClass := ddaGPUMonitoringEnabled.DeepCopy() - ddaGPUMonitoringEnabledANoRuntimeClass.Spec.Features.GPUMonitoring.PodRuntimeClassName = apiutils.NewStringPointer("") + ddaGPUMonitoringEnabledANoRuntimeClass.Spec.Features.GPU.PodRuntimeClassName = apiutils.NewStringPointer("") GPUMonitoringAgentNodeWantFunc := func(t testing.TB, mgrInterface feature.PodTemplateManagers, expectedRuntimeClass string) { mgr := mgrInterface.(*fake.PodTemplateManagers) diff --git a/internal/controller/testutils/agent.go 
b/internal/controller/testutils/agent.go index 83a9422cc..9bdf855fc 100644 --- a/internal/controller/testutils/agent.go +++ b/internal/controller/testutils/agent.go @@ -357,7 +357,7 @@ func NewDatadogAgentWithGPUMonitoring(namespace string, name string) v2alpha1.Da namespace, name, &v2alpha1.DatadogFeatures{ - GPUMonitoring: &v2alpha1.GPUMonitoringFeatureConfig{ + GPU: &v2alpha1.GPUFeatureConfig{ Enabled: apiutils.NewBoolPointer(true), }, }, diff --git a/pkg/testutils/builder.go b/pkg/testutils/builder.go index 1a0c21107..e049886a2 100644 --- a/pkg/testutils/builder.go +++ b/pkg/testutils/builder.go @@ -953,13 +953,13 @@ func (builder *DatadogAgentBuilder) WithFIPS(fipsConfig v2alpha1.FIPSConfig) *Da // GPU func (builder *DatadogAgentBuilder) initGPUMonitoring() { - if builder.datadogAgent.Spec.Features.GPUMonitoring == nil { - builder.datadogAgent.Spec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{} + if builder.datadogAgent.Spec.Features.GPU == nil { + builder.datadogAgent.Spec.Features.GPU = &v2alpha1.GPUFeatureConfig{} } } func (builder *DatadogAgentBuilder) WithGPUMonitoringEnabled(enabled bool) *DatadogAgentBuilder { builder.initGPUMonitoring() - builder.datadogAgent.Spec.Features.GPUMonitoring.Enabled = apiutils.NewBoolPointer(enabled) + builder.datadogAgent.Spec.Features.GPU.Enabled = apiutils.NewBoolPointer(enabled) return builder } From f2ac105a1dc34d3fd161acbdbe6a377f40808c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 30 Jan 2025 11:24:01 +0100 Subject: [PATCH 13/14] Fix comment --- internal/controller/datadogagent/feature/ids.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/datadogagent/feature/ids.go b/internal/controller/datadogagent/feature/ids.go index a73e5b82b..274599b08 100644 --- a/internal/controller/datadogagent/feature/ids.go +++ b/internal/controller/datadogagent/feature/ids.go @@ -71,6 +71,6 @@ const ( DummyIDType = "dummy" // ServiceDiscoveryType 
service discovery feature. ServiceDiscoveryType = "service_discovery" - // GPUIDType monitoring feature. + // GPUIDType GPU monitoring feature. GPUIDType = "gpu" ) From e44a1677aa6d67e9e0ca29664053a04299dfc1fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 30 Jan 2025 11:39:32 +0100 Subject: [PATCH 14/14] Generate code --- api/datadoghq/v2alpha1/zz_generated.deepcopy.go | 2 +- api/datadoghq/v2alpha1/zz_generated.openapi.go | 4 ++-- config/crd/bases/v1/datadoghq.com_datadogagents.yaml | 4 ++-- config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json | 4 ++-- docs/configuration.v2alpha1.md | 2 +- .../datadogagent/defaults/datadogagent_default_test.go | 2 +- internal/controller/datadogagent/feature/gpu/const.go | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go index 11fb7edbc..cc3d92e54 100644 --- a/api/datadoghq/v2alpha1/zz_generated.deepcopy.go +++ b/api/datadoghq/v2alpha1/zz_generated.deepcopy.go @@ -1565,7 +1565,7 @@ func (in *GPUFeatureConfig) DeepCopyInto(out *GPUFeatureConfig) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUMonitoringFeatureConfig. +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUFeatureConfig. 
func (in *GPUFeatureConfig) DeepCopy() *GPUFeatureConfig { if in == nil { return nil diff --git a/api/datadoghq/v2alpha1/zz_generated.openapi.go b/api/datadoghq/v2alpha1/zz_generated.openapi.go index ae577cbf1..f1aa9771c 100644 --- a/api/datadoghq/v2alpha1/zz_generated.openapi.go +++ b/api/datadoghq/v2alpha1/zz_generated.openapi.go @@ -678,7 +678,7 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R "gpu": { SchemaProps: spec.SchemaProps{ Description: "GPU monitoring", - Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUMonitoringFeatureConfig"), + Ref: ref("github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUFeatureConfig"), }, }, "eventCollection": { @@ -739,7 +739,7 @@ func schema_datadog_operator_api_datadoghq_v2alpha1_DatadogFeatures(ref common.R }, }, Dependencies: []string{ - "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUMonitoringFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, + "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.APMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ASMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AdmissionControllerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.AutoscalingFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CSPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.CWSFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ClusterChecksFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.DogstatsdFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EBPFCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.EventCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ExternalMetricsServerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.GPUFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.HelmCheckFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.KubeStateMetricsCoreFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveContainerCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LiveProcessCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.LogCollectionFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.NPMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OOMKillFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OTLPFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OrchestratorExplorerFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.OtelCollectorFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ProcessDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.PrometheusScrapeFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.RemoteConfigurationFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.SBOMFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.ServiceDiscoveryFeatureConfig", "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.TCPQueueLengthFeatureConfig", 
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1.USMFeatureConfig"}, } } diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml index effa55c25..383dc4188 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents.yaml +++ b/config/crd/bases/v1/datadoghq.com_datadogagents.yaml @@ -1030,7 +1030,7 @@ spec: requiredRuntimeClassName: description: |- PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - If left empty, the runtime class is not set. + If the value is an empty string, the runtime class is not set. Default: nvidia type: string type: object @@ -7909,7 +7909,7 @@ spec: requiredRuntimeClassName: description: |- PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. - If left empty, the runtime class is not set. + If the value is an empty string, the runtime class is not set. Default: nvidia type: string type: object diff --git a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json index d5922be92..324d0267b 100644 --- a/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json +++ b/config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json @@ -1074,7 +1074,7 @@ "type": "boolean" }, "requiredRuntimeClassName": { - "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class is not set.\nDefault: nvidia", + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf the value is an empty string, the runtime class is not set.\nDefault: nvidia", "type": "string" } }, @@ -7895,7 +7895,7 @@ "type": "boolean" }, "requiredRuntimeClassName": { - "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class is not 
set.\nDefault: nvidia", + "description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf the value is an empty string, the runtime class is not set.\nDefault: nvidia", "type": "string" } }, diff --git a/docs/configuration.v2alpha1.md b/docs/configuration.v2alpha1.md index 7a73f5c46..2932d7d5c 100644 --- a/docs/configuration.v2alpha1.md +++ b/docs/configuration.v2alpha1.md @@ -112,7 +112,7 @@ spec: | features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics CRD (allowing one to scale on arbitrary Datadog metric queries). Default: true | | features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false | | features.gpu.enabled | Enables GPU monitoring. Default: false | -| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class is not set. Default: nvidia | +| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If the value is an empty string, the runtime class is not set. Default: nvidia | | features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false | | features.helmCheck.enabled | Enables the Helm check. Default: false | | features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). 
Default: {} | diff --git a/internal/controller/datadogagent/defaults/datadogagent_default_test.go b/internal/controller/datadogagent/defaults/datadogagent_default_test.go index 35978cfa6..18310a116 100644 --- a/internal/controller/datadogagent/defaults/datadogagent_default_test.go +++ b/internal/controller/datadogagent/defaults/datadogagent_default_test.go @@ -2018,7 +2018,7 @@ func Test_defaultFeatures(t *testing.T) { OOMKill: &v2alpha1.OOMKillFeatureConfig{}, TCPQueueLength: &v2alpha1.TCPQueueLengthFeatureConfig{}, EBPFCheck: &v2alpha1.EBPFCheckFeatureConfig{}, - GPU: &v2alpha1.GPUFeatureConfig{}, + GPU: &v2alpha1.GPUFeatureConfig{}, ServiceDiscovery: &v2alpha1.ServiceDiscoveryFeatureConfig{}, APM: &v2alpha1.APMFeatureConfig{}, ASM: &v2alpha1.ASMFeatureConfig{}, diff --git a/internal/controller/datadogagent/feature/gpu/const.go b/internal/controller/datadogagent/feature/gpu/const.go index dbc41ee9f..fedde4d70 100644 --- a/internal/controller/datadogagent/feature/gpu/const.go +++ b/internal/controller/datadogagent/feature/gpu/const.go @@ -7,7 +7,7 @@ package gpu const ( nvidiaDevicesMountPath = "/var/run/nvidia-container-devices/all" - nvidiaDevicesVolumeName = "nvidia-devices" + nvidiaDevicesVolumeName = "nvidia-devices" devNullPath = "/dev/null" // used to mount the NVIDIADevicesHostPath to /dev/null in the container, it's just used as a "signal" to the nvidia runtime to use the nvidia devices // defaultGPURuntimeClass default runtime class for GPU pods