Skip to content

Commit

Permalink
Refactor DCGM exporter constants and update metrics exporter labels
Browse files (browse the repository at this point in the history)
  • Loading branch information
gshaibi committed Dec 2, 2024
1 parent 67eb954 commit df53acc
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 7 deletions.
1 change: 1 addition & 0 deletions internal/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const (
LabelTopologyCMNodeName = "node-name"
LabelApp = "app"

DCGMExporterApp = "nvidia-dcgm-exporter"
KwokDCGMExporterApp = "kwok-nvidia-dcgm-exporter"

ReservationNs = "runai-reservation"
Expand Down
14 changes: 9 additions & 5 deletions internal/status-exporter/export/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ import (
)

const (
exporterPort = 9400
exporterContainerName = "nvidia-dcgm-exporter"
exporterPort = 9400

exporterJobName = constants.DCGMExporterApp
exporterServiceName = constants.DCGMExporterApp
exporterContainerName = constants.DCGMExporterApp
exporterHostnamePrefix = constants.DCGMExporterApp
)

type MetricsExporter struct {
Expand Down Expand Up @@ -116,7 +120,7 @@ func generateFakeHostname(nodeName string) string {
h := sha1.New()
h.Write([]byte(nodeName))
nodeNameSHA1 := h.Sum(nil)
nodeHostname := fmt.Sprintf("%s-%x", "nvidia-dcgm-exporter", nodeNameSHA1[:3])
nodeHostname := fmt.Sprintf("%s-%x", exporterHostnamePrefix, nodeNameSHA1[:3])
return nodeHostname
}

Expand All @@ -132,8 +136,8 @@ func (e *MetricsExporter) enrichWithPrometheusLabels(labels prometheus.Labels) p
labels["pod"] = viper.GetString(constants.EnvImpersonatePodName)

labels["instance"] = fmt.Sprintf("%s:%d", viper.GetString(constants.EnvImpersonatePodIP), exporterPort)
labels["job"] = "nvidia-dcgm-exporter"
labels["service"] = "nvidia-dcgm-exporter"
labels["job"] = exporterJobName
labels["service"] = exporterServiceName

return labels
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ import (
"k8s.io/utils/ptr"
)

const (
// dummyDcgmExporterPodTimeout is the deadline given to the context that
// getDummyDcgmExporterPod uses when watching for the DCGM exporter pod.
dummyDcgmExporterPodTimeout = 5 * time.Minute
)

func (p *NodeHandler) applyFakeNodeDeployments(node *v1.Node) error {
if !isFakeNode(node) {
return nil
Expand Down Expand Up @@ -145,10 +149,10 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De
}

func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error) {
labelSelector := "app=nvidia-dcgm-exporter"
labelSelector := fmt.Sprintf("%s=%s", constants.LabelApp, constants.DCGMExporterApp)
fieldSelector := fields.OneTermEqualSelector("spec.nodeName", nodeName).String()

ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), dummyDcgmExporterPodTimeout)
defer cancel()

watcher, err := p.kubeClient.CoreV1().Pods(v1.NamespaceAll).Watch(ctx, metav1.ListOptions{
Expand Down

0 comments on commit df53acc

Please sign in to comment.