Skip to content

Commit

Permalink
Merge pull request #91 from run-ai/erez/kwok-fake-node-updater-RUN-19999
Browse files Browse the repository at this point in the history
adding status updater / device plugin for kwok nodes
  • Loading branch information
enoodle authored Aug 20, 2024
2 parents 031a001 + 63ee0bb commit 1db44fa
Show file tree
Hide file tree
Showing 18 changed files with 541 additions and 43 deletions.
40 changes: 25 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM golang:1.22.1 as common-builder
FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
Expand All @@ -7,60 +7,70 @@ COPY Makefile .
COPY internal/common ./internal/common
ARG TARGETOS TARGETARCH

FROM common-builder as device-plugin-builder
FROM common-builder AS device-plugin-builder
COPY ./cmd/device-plugin/ ./cmd/device-plugin/
COPY ./internal/deviceplugin/ ./internal/deviceplugin/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=device-plugin

FROM common-builder as status-updater-builder
FROM common-builder AS status-updater-builder
COPY ./cmd/status-updater/ ./cmd/status-updater/
COPY ./internal/status-updater/ ./internal/status-updater/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-updater

FROM common-builder as status-exporter-builder
# Build stage for the kwok-gpu-device-plugin binary, based on common-builder.
# Besides its own cmd/ and internal/ directories it also copies
# internal/status-updater into the build context.
FROM common-builder AS kwok-gpu-device-plugin-builder
COPY ./cmd/kwok-gpu-device-plugin/ ./cmd/kwok-gpu-device-plugin/
COPY ./internal/status-updater/ ./internal/status-updater/
COPY ./internal/kwok-gpu-device-plugin/ ./internal/kwok-gpu-device-plugin/
# The Go build cache is mounted so repeated builds can reuse compiled packages.
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=kwok-gpu-device-plugin

FROM common-builder AS status-exporter-builder
COPY ./cmd/status-exporter/ ./cmd/status-exporter/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-exporter

FROM common-builder as topology-server-builder
FROM common-builder AS topology-server-builder
COPY ./cmd/topology-server/ ./cmd/topology-server/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=topology-server

FROM common-builder as nvidia-smi-builder
FROM common-builder AS nvidia-smi-builder
COPY ./cmd/nvidia-smi/ ./cmd/nvidia-smi/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=nvidia-smi

FROM common-builder as mig-faker-builder
FROM common-builder AS mig-faker-builder
COPY ./cmd/mig-faker/ ./cmd/mig-faker/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=mig-faker

FROM common-builder as preloader-builder
FROM common-builder AS preloader-builder
COPY ./cmd/preloader/ ./cmd/preloader/
RUN make build-preloader

FROM jupyter/minimal-notebook as jupyter-notebook
FROM jupyter/minimal-notebook AS jupyter-notebook
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/

FROM ubuntu as device-plugin
FROM ubuntu AS device-plugin
COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/memory/preloader.so
COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/pid/preloader.so
ENTRYPOINT ["/bin/device-plugin"]

FROM ubuntu as status-updater
FROM ubuntu AS status-updater
COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-updater /bin/
ENTRYPOINT ["/bin/status-updater"]

FROM ubuntu as status-exporter
FROM ubuntu AS status-exporter
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/
ENTRYPOINT ["/bin/status-exporter"]

FROM ubuntu as topology-server
FROM ubuntu AS topology-server
COPY --from=topology-server-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/topology-server /bin/
ENTRYPOINT ["/bin/topology-server"]

FROM ubuntu as mig-faker
FROM ubuntu AS mig-faker
COPY --from=mig-faker-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/mig-faker /bin/
ENTRYPOINT ["/bin/mig-faker"]
ENTRYPOINT ["/bin/mig-faker"]

# Runtime image for kwok-gpu-device-plugin: copies the binary produced by the
# kwok-gpu-device-plugin-builder stage into /bin and runs it as the entrypoint.
FROM ubuntu AS kwok-gpu-device-plugin
COPY --from=kwok-gpu-device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-gpu-device-plugin /bin/
ENTRYPOINT ["/bin/kwok-gpu-device-plugin"]
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ COMPONENT="$1"

DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator
DOCKER_REPO_FULL=${DOCKER_REPO_BASE}/${COMPONENT}
DOCKER_TAG=0.0.0-dev
DOCKER_TAG?=0.0.0-dev
DOCKER_IMAGE_NAME=${DOCKER_REPO_FULL}:${DOCKER_TAG}
NAMESPACE=gpu-operator

Expand Down Expand Up @@ -39,6 +39,7 @@ image: init-buildx
images:
make image COMPONENT=device-plugin
make image COMPONENT=status-updater
make image COMPONENT=kwok-gpu-device-plugin
make image COMPONENT=status-exporter
make image COMPONENT=topology-server
make image COMPONENT=mig-faker
Expand Down
16 changes: 16 additions & 0 deletions cmd/kwok-gpu-device-plugin/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package main

import (
"github.com/run-ai/fake-gpu-operator/internal/common/app"
"github.com/run-ai/fake-gpu-operator/internal/common/config"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
status_updater "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin"
)

func main() {
requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs}
config.ValidateConfig(requiredEnvVars)

appRunner := app.NewAppRunner(&status_updater.KWOKDevicePluginApp{})
appRunner.Run()
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Cluster-wide permissions named fake-kwok-gpu-device-plugin.
# Two rules, both in the core ("") API group:
#   1. nodes and nodes/status: update, list, get, watch, patch
#   2. configmaps:             get, update, create, list, delete
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: fake-kwok-gpu-device-plugin
rules:
# Rule 1: node objects and their status subresource.
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- update
- list
- get
- watch
- patch
# Rule 2: ConfigMaps in every namespace.
# NOTE(review): no `patch` verb here. Server-side apply
# (ConfigMaps().Apply, used by UpdateNodeTopologyCM) is sent as a PATCH,
# so confirm this component never applies ConfigMaps.
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- update
- create
- list
- delete
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the fake-kwok-gpu-device-plugin ClusterRole to the
# kwok-gpu-device-plugin ServiceAccount in the Helm release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: ClusterRole
apiGroup: rbac.authorization.k8s.io
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
# Rendered by Helm to the namespace the chart is installed into.
namespace: "{{ .Release.Namespace }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Single-replica Deployment that runs the kwok-gpu-device-plugin image.
# Image coordinates and resource requests/limits come from the
# kwokGpuDevicePlugin section of the chart values.
apiVersion: apps/v1
kind: Deployment
metadata:
name: kwok-gpu-device-plugin
annotations:
# SHA-256 of the rendered topology-cm.yml template.
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: kwok-gpu-device-plugin
spec:
selector:
matchLabels:
app: kwok-gpu-device-plugin
component: kwok-gpu-device-plugin
replicas: 1
template:
metadata:
annotations:
# Same checksum on the pod template, so the pod template changes whenever
# the rendered topology ConfigMap template changes.
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: kwok-gpu-device-plugin
component: kwok-gpu-device-plugin
spec:
containers:
- name: kwok-gpu-device-plugin
image: "{{ .Values.kwokGpuDevicePlugin.image.repository }}:{{ .Values.kwokGpuDevicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.kwokGpuDevicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.kwokGpuDevicePlugin.resources | nindent 12 }}
# Presumably the settings that cmd/kwok-gpu-device-plugin/main.go passes to
# config.ValidateConfig; verify against the constants package.
env:
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: FAKE_GPU_OPERATOR_NAMESPACE
value: "{{ .Release.Namespace }}"
restartPolicy: Always
serviceAccountName: kwok-gpu-device-plugin
# The pull secret name is fixed here rather than taken from values.
imagePullSecrets:
- name: gcr-secret
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Namespaced Role named fake-kwok-gpu-device-plugin.
# Grants read-only access (list, get, watch) to ConfigMaps in the core ("")
# API group. No namespace is set in metadata here.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: fake-kwok-gpu-device-plugin
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- list
- get
- watch
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the fake-kwok-gpu-device-plugin Role to the kwok-gpu-device-plugin
# ServiceAccount in the Helm release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: Role
apiGroup: rbac.authorization.k8s.io
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
# Rendered by Helm to the namespace the chart is installed into.
namespace: "{{ .Release.Namespace }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ServiceAccount referenced by the kwok-gpu-device-plugin Deployment
# (serviceAccountName) and by the fake-kwok-gpu-device-plugin bindings.
apiVersion: v1
kind: ServiceAccount
metadata:
name: kwok-gpu-device-plugin
15 changes: 14 additions & 1 deletion deploy/fake-gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@ statusExporter:
memory: "200Mi"
topologyMaxExportInterval: 10s

# Values consumed by the kwok-gpu-device-plugin Deployment template:
# image.repository / image.tag / image.pullPolicy select the container image,
# and resources is rendered verbatim into the container's resources field.
kwokGpuDevicePlugin:
image:
pullPolicy: Always
repository: gcr.io/run-ai-lab/fake-gpu-operator/kwok-gpu-device-plugin
tag: 0.0.1
resources:
requests:
cpu: "100m"
memory: "200Mi"
limits:
cpu: "200m"
memory: "400Mi"

migFaker:
image:
pullPolicy: Always
Expand All @@ -72,4 +85,4 @@ topology:
gpuCount: 2
gpuMemory: 11441
nodePoolLabelKey: run.ai/simulated-gpu-node-pool
migStrategy: mixed
migStrategy: mixed
24 changes: 17 additions & 7 deletions internal/common/topology/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/spf13/viper"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
apcorev1 "k8s.io/client-go/applyconfigurations/core/v1"
"k8s.io/client-go/kubernetes"
)

Expand All @@ -25,25 +26,31 @@ func GetNodeTopologyFromCM(kubeclient kubernetes.Interface, nodeName string) (*N
return FromNodeTopologyCM(cm)
}

// CreateNodeTopologyCM builds the topology ConfigMap for a node from
// nodeTopology via ToNodeTopologyCM and creates it through kubeclient in the
// namespace named by the constants.EnvTopologyCmNamespace viper setting.
// When the node carries the constants.AnnotationKwokNode annotation, its value
// is copied onto the ConfigMap before creation. Errors from ToNodeTopologyCM
// and from the Create call are returned unchanged.
//
// NOTE(review): this span is a rendered diff, so the pre-change and
// post-change forms of the signature and of the ToNodeTopologyCM call both
// appear below; the second of each pair is the post-change line.
func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error {
cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, node *corev1.Node) error {
cm, _, err := ToNodeTopologyCM(nodeTopology, node.Name)
if err != nil {
return err
}
// Propagate the KWOK marker annotation from the node to its ConfigMap.
if value, found := node.Annotations[constants.AnnotationKwokNode]; found {
// Annotations may be nil on the freshly built ConfigMap; writing to a nil
// map would panic, so allocate it first.
if cm.Annotations == nil {
cm.Annotations = make(map[string]string)
}
cm.Annotations[constants.AnnotationKwokNode] = value
}

_, err = kubeclient.CoreV1().ConfigMaps(
viper.GetString(constants.EnvTopologyCmNamespace)).Create(context.TODO(), cm, metav1.CreateOptions{})
return err
}

func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error {
cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
_, cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
if err != nil {
return err
}

_, err = kubeclient.CoreV1().ConfigMaps(
viper.GetString(constants.EnvTopologyCmNamespace)).Update(context.TODO(), cm, metav1.UpdateOptions{})
viper.GetString(constants.EnvTopologyCmNamespace)).Apply(context.TODO(), cm, metav1.ApplyOptions{})
return err
}

Expand Down Expand Up @@ -108,7 +115,7 @@ func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, e
return cm, nil
}

func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, error) {
func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, *apcorev1.ConfigMapApplyConfiguration, error) {
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: GetNodeTopologyCMName(nodeName),
Expand All @@ -120,15 +127,18 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf
},
Data: make(map[string]string),
}
cmApplyConfig := apcorev1.ConfigMap(cm.Name, cm.Namespace).WithLabels(cm.Labels)

topologyData, err := yaml.Marshal(nodeTopology)
if err != nil {
return nil, err
return nil, nil, err
}

cm.Data[cmTopologyKey] = string(topologyData)

return cm, nil
cmApplyConfig = cmApplyConfig.WithData(cm.Data)

return cm, cmApplyConfig, nil
}

func GetNodeTopologyCMName(nodeName string) string {
Expand Down
Loading

0 comments on commit 1db44fa

Please sign in to comment.