Skip to content

Commit

Permalink
Add GPU monitoring feature
Browse files Browse the repository at this point in the history
  • Loading branch information
gjulianm committed Jan 7, 2025
1 parent 87f8143 commit 82ba349
Show file tree
Hide file tree
Showing 13 changed files with 343 additions and 1 deletion.
7 changes: 7 additions & 0 deletions api/datadoghq/v2alpha1/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ const (
KubeServicesAndEndpointsListeners = "kube_services kube_endpoints"
EndpointsChecksConfigProvider = "endpointschecks"
ClusterAndEndpointsConfigProviders = "clusterchecks endpointschecks"

// DefaultGPUMonitoringRuntimeClass default runtime class for GPU pods
DefaultGPUMonitoringRuntimeClass = "nvidia"
)

// Labels
Expand Down Expand Up @@ -201,6 +204,10 @@ const (
FIPSProxyCustomConfigFileName = "datadog-fips-proxy.cfg"
FIPSProxyCustomConfigMapName = "%s-fips-config"
FIPSProxyCustomConfigMountPath = "/etc/datadog-fips-proxy/datadog-fips-proxy.cfg"

// Paths and volume name used to request NVIDIA devices from the NVIDIA container runtime.
NVIDIADevicesMountPath = "/var/run/nvidia-container-devices/all"
NVIDIADevicesVolumeName = "nvidia-devices"
DevNullPath = "/dev/null" // the mount carries no data; it is only a "signal" to the nvidia runtime to use the nvidia devices. NOTE(review): presumably /dev/null is the volume source mounted at NVIDIADevicesMountPath in the container — confirm against the feature code.
)

// Field paths
Expand Down
16 changes: 16 additions & 0 deletions api/datadoghq/v2alpha1/datadogagent_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ type DatadogFeatures struct {
SBOM *SBOMFeatureConfig `json:"sbom,omitempty"`
// ServiceDiscovery
ServiceDiscovery *ServiceDiscoveryFeatureConfig `json:"serviceDiscovery,omitempty"`
// GPU monitoring
GPUMonitoring *GPUMonitoringFeatureConfig `json:"gpu,omitempty"`

// Cluster-level features

Expand Down Expand Up @@ -498,6 +500,20 @@ type ServiceDiscoveryFeatureConfig struct {
Enabled *bool `json:"enabled,omitempty"`
}

// GPUMonitoringFeatureConfig contains the GPU monitoring configuration.
type GPUMonitoringFeatureConfig struct {
	// Enabled enables GPU monitoring.
	// Default: false
	// +optional
	Enabled *bool `json:"enabled,omitempty"`

	// PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.
	// If left empty, the runtime class will not be set.
	// Default: nvidia
	// +optional
	//
	// The serialized key is "requiredRuntimeClassName" (it intentionally differs from the Go field name).
	// omitempty only drops a nil pointer; a pointer to "" is still emitted, so an explicitly empty
	// value remains distinguishable from an unset one.
	PodRuntimeClassName *string `json:"requiredRuntimeClassName,omitempty"`
}

// DogstatsdFeatureConfig contains the Dogstatsd configuration parameters.
// +k8s:openapi-gen=true
type DogstatsdFeatureConfig struct {
Expand Down
30 changes: 30 additions & 0 deletions api/datadoghq/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion api/datadoghq/v2alpha1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions config/crd/bases/v1/datadoghq.com_datadogagents.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,21 @@ spec:
Default: false
type: boolean
type: object
gpu:
description: GPU monitoring
properties:
enabled:
description: |-
Enabled enables GPU monitoring.
Default: false
type: boolean
requiredRuntimeClassName:
description: |-
PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.
If left empty, the runtime class will not be set.
Default: nvidia
type: string
type: object
helmCheck:
description: HelmCheck configuration.
properties:
Expand Down Expand Up @@ -7882,6 +7897,21 @@ spec:
Default: false
type: boolean
type: object
gpu:
description: GPU monitoring
properties:
enabled:
description: |-
Enabled enables GPU monitoring.
Default: false
type: boolean
requiredRuntimeClassName:
description: |-
PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.
If left empty, the runtime class will not be set.
Default: nvidia
type: string
type: object
helmCheck:
description: HelmCheck configuration.
properties:
Expand Down
30 changes: 30 additions & 0 deletions config/crd/bases/v1/datadoghq.com_datadogagents_v2alpha1.json
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,21 @@
},
"type": "object"
},
"gpu": {
"additionalProperties": false,
"description": "GPU monitoring",
"properties": {
"enabled": {
"description": "Enabled enables GPU monitoring.\nDefault: false",
"type": "boolean"
},
"requiredRuntimeClassName": {
"description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia",
"type": "string"
}
},
"type": "object"
},
"helmCheck": {
"additionalProperties": false,
"description": "HelmCheck configuration.",
Expand Down Expand Up @@ -7870,6 +7885,21 @@
},
"type": "object"
},
"gpu": {
"additionalProperties": false,
"description": "GPU monitoring",
"properties": {
"enabled": {
"description": "Enabled enables GPU monitoring.\nDefault: false",
"type": "boolean"
},
"requiredRuntimeClassName": {
"description": "PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature.\nIf left empty, the runtime class will not be set.\nDefault: nvidia",
"type": "string"
}
},
"type": "object"
},
"helmCheck": {
"additionalProperties": false,
"description": "HelmCheck configuration.",
Expand Down
2 changes: 2 additions & 0 deletions docs/configuration.v2alpha1.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ spec:
| features.externalMetricsServer.registerAPIService | RegisterAPIService registers the External Metrics endpoint as an APIService Default: true |
| features.externalMetricsServer.useDatadogMetrics | UseDatadogMetrics enables usage of the DatadogMetrics CRD (allowing one to scale on arbitrary Datadog metric queries). Default: true |
| features.externalMetricsServer.wpaController | WPAController enables the informer and controller of the Watermark Pod Autoscaler. NOTE: The Watermark Pod Autoscaler controller needs to be installed. See also: https://github.com/DataDog/watermarkpodautoscaler. Default: false |
| features.gpu.enabled | Enables GPU monitoring. Default: false |
| features.gpu.requiredRuntimeClassName | PodRuntimeClassName specifies the runtime class name required for the GPU monitoring feature. If left empty, the runtime class will not be set. Default: nvidia |
| features.helmCheck.collectEvents | CollectEvents set to `true` enables event collection in the Helm check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) Default: false |
| features.helmCheck.enabled | Enables the Helm check. Default: false |
| features.helmCheck.valuesAsTags | ValuesAsTags collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). Default: {} |
Expand Down
2 changes: 2 additions & 0 deletions examples/datadogagent/datadog-agent-all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ spec:
enabled: true
serviceDiscovery:
enabled: true
gpu:
enabled: true
eventCollection:
collectKubernetesEvents: true
orchestratorExplorer:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ const (

defaultEBPFCheckEnabled bool = false

defaultGPUMonitoringEnabled bool = false

defaultServiceDiscoveryEnabled bool = false

defaultAPMEnabled bool = true
Expand Down Expand Up @@ -265,6 +267,12 @@ func defaultFeaturesConfig(ddaSpec *v2alpha1.DatadogAgentSpec) {
}
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.ServiceDiscovery.Enabled, defaultServiceDiscoveryEnabled)

// GPU monitoring feature
if ddaSpec.Features.GPUMonitoring == nil {
// GPUMonitoringFeatureConfig is declared in the v2alpha1 API package, so it must be package-qualified here.
ddaSpec.Features.GPUMonitoring = &v2alpha1.GPUMonitoringFeatureConfig{}

Check failure on line 272 in internal/controller/datadogagent/defaults/datadogagent_default.go

View workflow job for this annotation

GitHub Actions / build

undefined: GPUMonitoringFeatureConfig

Check failure on line 272 in internal/controller/datadogagent/defaults/datadogagent_default.go

View workflow job for this annotation

GitHub Actions / build

undefined: GPUMonitoringFeatureConfig

Check failure on line 272 in internal/controller/datadogagent/defaults/datadogagent_default.go

View workflow job for this annotation

GitHub Actions / build

undefined: GPUMonitoringFeatureConfig

Check failure on line 272 in internal/controller/datadogagent/defaults/datadogagent_default.go

View workflow job for this annotation

GitHub Actions / build

undefined: GPUMonitoringFeatureConfig
}
apiutils.DefaultBooleanIfUnset(&ddaSpec.Features.GPUMonitoring.Enabled, defaultGPUMonitoringEnabled)

// APM Feature
// APM is enabled by default
if ddaSpec.Features.APM == nil {
Expand Down
Loading

0 comments on commit 82ba349

Please sign in to comment.