Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding status updater / device plugin for kwok nodes #91

Merged
merged 9 commits into from
Aug 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM golang:1.22.1 as common-builder
FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
@@ -7,60 +7,70 @@ COPY Makefile .
COPY internal/common ./internal/common
ARG TARGETOS TARGETARCH

FROM common-builder as device-plugin-builder
FROM common-builder AS device-plugin-builder
COPY ./cmd/device-plugin/ ./cmd/device-plugin/
COPY ./internal/deviceplugin/ ./internal/deviceplugin/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=device-plugin

FROM common-builder as status-updater-builder
FROM common-builder AS status-updater-builder
COPY ./cmd/status-updater/ ./cmd/status-updater/
COPY ./internal/status-updater/ ./internal/status-updater/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-updater

FROM common-builder as status-exporter-builder
FROM common-builder AS kwok-gpu-device-plugin-builder
COPY ./cmd/kwok-gpu-device-plugin/ ./cmd/kwok-gpu-device-plugin/
COPY ./internal/status-updater/ ./internal/status-updater/
COPY ./internal/kwok-gpu-device-plugin/ ./internal/kwok-gpu-device-plugin/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=kwok-gpu-device-plugin

FROM common-builder AS status-exporter-builder
COPY ./cmd/status-exporter/ ./cmd/status-exporter/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-exporter

FROM common-builder as topology-server-builder
FROM common-builder AS topology-server-builder
COPY ./cmd/topology-server/ ./cmd/topology-server/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=topology-server

FROM common-builder as nvidia-smi-builder
FROM common-builder AS nvidia-smi-builder
COPY ./cmd/nvidia-smi/ ./cmd/nvidia-smi/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=nvidia-smi

FROM common-builder as mig-faker-builder
FROM common-builder AS mig-faker-builder
COPY ./cmd/mig-faker/ ./cmd/mig-faker/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=mig-faker

FROM common-builder as preloader-builder
FROM common-builder AS preloader-builder
COPY ./cmd/preloader/ ./cmd/preloader/
RUN make build-preloader

FROM jupyter/minimal-notebook as jupyter-notebook
FROM jupyter/minimal-notebook AS jupyter-notebook
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/

FROM ubuntu as device-plugin
FROM ubuntu AS device-plugin
COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/memory/preloader.so
COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/pid/preloader.so
ENTRYPOINT ["/bin/device-plugin"]

FROM ubuntu as status-updater
FROM ubuntu AS status-updater
COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-updater /bin/
ENTRYPOINT ["/bin/status-updater"]

FROM ubuntu as status-exporter
FROM ubuntu AS status-exporter
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/
ENTRYPOINT ["/bin/status-exporter"]

FROM ubuntu as topology-server
FROM ubuntu AS topology-server
COPY --from=topology-server-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/topology-server /bin/
ENTRYPOINT ["/bin/topology-server"]

FROM ubuntu as mig-faker
FROM ubuntu AS mig-faker
COPY --from=mig-faker-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/mig-faker /bin/
ENTRYPOINT ["/bin/mig-faker"]
ENTRYPOINT ["/bin/mig-faker"]

FROM ubuntu AS kwok-gpu-device-plugin
COPY --from=kwok-gpu-device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-gpu-device-plugin /bin/
ENTRYPOINT ["/bin/kwok-gpu-device-plugin"]
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ COMPONENT="$1"

DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator
DOCKER_REPO_FULL=${DOCKER_REPO_BASE}/${COMPONENT}
DOCKER_TAG=0.0.0-dev
DOCKER_TAG?=0.0.0-dev
DOCKER_IMAGE_NAME=${DOCKER_REPO_FULL}:${DOCKER_TAG}
NAMESPACE=gpu-operator

@@ -39,6 +39,7 @@ image: init-buildx
images:
make image COMPONENT=device-plugin
make image COMPONENT=status-updater
make image COMPONENT=kwok-gpu-device-plugin
make image COMPONENT=status-exporter
make image COMPONENT=topology-server
make image COMPONENT=mig-faker
16 changes: 16 additions & 0 deletions cmd/kwok-gpu-device-plugin/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package main

import (
"github.com/run-ai/fake-gpu-operator/internal/common/app"
"github.com/run-ai/fake-gpu-operator/internal/common/config"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
status_updater "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin"
)

func main() {
requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs}
config.ValidateConfig(requiredEnvVars)

appRunner := app.NewAppRunner(&status_updater.KWOKDevicePluginApp{})
appRunner.Run()
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: fake-kwok-gpu-device-plugin
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- update
- list
- get
- watch
- patch
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- update
- create
- list
- delete
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: ClusterRole
apiGroup: rbac.authorization.k8s.io
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
namespace: "{{ .Release.Namespace }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: kwok-gpu-device-plugin
annotations:
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: kwok-gpu-device-plugin
spec:
selector:
matchLabels:
app: kwok-gpu-device-plugin
component: kwok-gpu-device-plugin
replicas: 1
template:
metadata:
annotations:
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: kwok-gpu-device-plugin
component: kwok-gpu-device-plugin
spec:
containers:
- name: kwok-gpu-device-plugin
image: "{{ .Values.kwokGpuDevicePlugin.image.repository }}:{{ .Values.kwokGpuDevicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.kwokGpuDevicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.kwokGpuDevicePlugin.resources | nindent 12 }}
env:
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: FAKE_GPU_OPERATOR_NAMESPACE
value: "{{ .Release.Namespace }}"
restartPolicy: Always
serviceAccountName: kwok-gpu-device-plugin
imagePullSecrets:
- name: gcr-secret
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: fake-kwok-gpu-device-plugin
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- list
- get
- watch
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: Role
apiGroup: rbac.authorization.k8s.io
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
namespace: "{{ .Release.Namespace }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kwok-gpu-device-plugin
15 changes: 14 additions & 1 deletion deploy/fake-gpu-operator/values.yaml
Original file line number Diff line number Diff line change
@@ -54,6 +54,19 @@ statusExporter:
memory: "200Mi"
topologyMaxExportInterval: 10s

kwokGpuDevicePlugin:
image:
pullPolicy: Always
repository: gcr.io/run-ai-lab/fake-gpu-operator/kwok-gpu-device-plugin
tag: 0.0.1
resources:
requests:
cpu: "100m"
memory: "200Mi"
limits:
cpu: "200m"
memory: "400Mi"

migFaker:
image:
pullPolicy: Always
@@ -72,4 +85,4 @@ topology:
gpuCount: 2
gpuMemory: 11441
nodePoolLabelKey: run.ai/simulated-gpu-node-pool
migStrategy: mixed
migStrategy: mixed
24 changes: 17 additions & 7 deletions internal/common/topology/kubernetes.go
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@ import (
"github.com/spf13/viper"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
apcorev1 "k8s.io/client-go/applyconfigurations/core/v1"
"k8s.io/client-go/kubernetes"
)

@@ -25,25 +26,31 @@ func GetNodeTopologyFromCM(kubeclient kubernetes.Interface, nodeName string) (*N
return FromNodeTopologyCM(cm)
}

// CreateNodeTopologyCM builds the topology ConfigMap for the given node from
// nodeTopology and creates it through kubeclient in the namespace read from
// viper under constants.EnvTopologyCmNamespace.
//
// When the node carries the constants.AnnotationKwokNode annotation, its value
// is copied onto the ConfigMap before creation. The function returns the error
// from ToNodeTopologyCM or from the Create call, and nil otherwise.
func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error {
cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
func CreateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, node *corev1.Node) error {
cm, _, err := ToNodeTopologyCM(nodeTopology, node.Name)
if err != nil {
return err
}
// Propagate the KWOK-node annotation from the node to its topology ConfigMap.
if value, found := node.Annotations[constants.AnnotationKwokNode]; found {
// The annotations map may be nil; writing to a nil map would panic.
if cm.Annotations == nil {
cm.Annotations = make(map[string]string)
}
cm.Annotations[constants.AnnotationKwokNode] = value
}

_, err = kubeclient.CoreV1().ConfigMaps(
viper.GetString(constants.EnvTopologyCmNamespace)).Create(context.TODO(), cm, metav1.CreateOptions{})
return err
}

func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTopology, nodeName string) error {
cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
_, cm, err := ToNodeTopologyCM(nodeTopology, nodeName)
if err != nil {
return err
}

_, err = kubeclient.CoreV1().ConfigMaps(
viper.GetString(constants.EnvTopologyCmNamespace)).Update(context.TODO(), cm, metav1.UpdateOptions{})
viper.GetString(constants.EnvTopologyCmNamespace)).Apply(context.TODO(), cm, metav1.ApplyOptions{})
return err
}

@@ -108,7 +115,7 @@ func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, e
return cm, nil
}

func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, error) {
func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.ConfigMap, *apcorev1.ConfigMapApplyConfiguration, error) {
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: GetNodeTopologyCMName(nodeName),
@@ -120,15 +127,18 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf
},
Data: make(map[string]string),
}
cmApplyConfig := apcorev1.ConfigMap(cm.Name, cm.Namespace).WithLabels(cm.Labels)

topologyData, err := yaml.Marshal(nodeTopology)
if err != nil {
return nil, err
return nil, nil, err
}

cm.Data[cmTopologyKey] = string(topologyData)

return cm, nil
cmApplyConfig = cmApplyConfig.WithData(cm.Data)

return cm, cmApplyConfig, nil
}

func GetNodeTopologyCMName(nodeName string) string {
Loading