Skip to content

Commit

Permalink
adding kwok fake gpu device plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
enoodle committed Aug 14, 2024
1 parent 031a001 commit da9a600
Show file tree
Hide file tree
Showing 13 changed files with 401 additions and 17 deletions.
40 changes: 25 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM golang:1.22.1 as common-builder
FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
Expand All @@ -7,60 +7,70 @@ COPY Makefile .
COPY internal/common ./internal/common
ARG TARGETOS TARGETARCH

FROM common-builder as device-plugin-builder
FROM common-builder AS device-plugin-builder
COPY ./cmd/device-plugin/ ./cmd/device-plugin/
COPY ./internal/deviceplugin/ ./internal/deviceplugin/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=device-plugin

FROM common-builder as status-updater-builder
FROM common-builder AS status-updater-builder
COPY ./cmd/status-updater/ ./cmd/status-updater/
COPY ./internal/status-updater/ ./internal/status-updater/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-updater

FROM common-builder as status-exporter-builder
FROM common-builder AS kwok-gpu-device-plugin-builder
COPY ./cmd/kwok-gpu-device-plugin/ ./cmd/kwok-gpu-device-plugin/
COPY ./internal/status-updater/ ./internal/status-updater/
COPY ./internal/kwok-gpu-device-plugin/ ./internal/kwok-gpu-device-plugin/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=kwok-gpu-device-plugin

FROM common-builder AS status-exporter-builder
COPY ./cmd/status-exporter/ ./cmd/status-exporter/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=status-exporter

FROM common-builder as topology-server-builder
FROM common-builder AS topology-server-builder
COPY ./cmd/topology-server/ ./cmd/topology-server/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=topology-server

FROM common-builder as nvidia-smi-builder
FROM common-builder AS nvidia-smi-builder
COPY ./cmd/nvidia-smi/ ./cmd/nvidia-smi/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=nvidia-smi

FROM common-builder as mig-faker-builder
FROM common-builder AS mig-faker-builder
COPY ./cmd/mig-faker/ ./cmd/mig-faker/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENT=mig-faker

FROM common-builder as preloader-builder
FROM common-builder AS preloader-builder
COPY ./cmd/preloader/ ./cmd/preloader/
RUN make build-preloader

FROM jupyter/minimal-notebook as jupyter-notebook
FROM jupyter/minimal-notebook AS jupyter-notebook
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/

FROM ubuntu as device-plugin
FROM ubuntu AS device-plugin
COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/memory/preloader.so
COPY --from=preloader-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/preloader /shared/pid/preloader.so
ENTRYPOINT ["/bin/device-plugin"]

FROM ubuntu as status-updater
FROM ubuntu AS status-updater
COPY --from=status-updater-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-updater /bin/
ENTRYPOINT ["/bin/status-updater"]

FROM ubuntu as status-exporter
FROM ubuntu AS status-exporter
COPY --from=status-exporter-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/status-exporter /bin/
ENTRYPOINT ["/bin/status-exporter"]

FROM ubuntu as topology-server
FROM ubuntu AS topology-server
COPY --from=topology-server-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/topology-server /bin/
ENTRYPOINT ["/bin/topology-server"]

FROM ubuntu as mig-faker
FROM ubuntu AS mig-faker
COPY --from=mig-faker-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/mig-faker /bin/
ENTRYPOINT ["/bin/mig-faker"]
ENTRYPOINT ["/bin/mig-faker"]

FROM ubuntu AS kwok-gpu-device-plugin
COPY --from=kwok-gpu-device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-gpu-device-plugin /bin/
ENTRYPOINT ["/bin/kwok-gpu-device-plugin"]
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ COMPONENT="$1"

DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator
DOCKER_REPO_FULL=${DOCKER_REPO_BASE}/${COMPONENT}
DOCKER_TAG=0.0.0-dev
DOCKER_TAG?=0.0.0-dev
DOCKER_IMAGE_NAME=${DOCKER_REPO_FULL}:${DOCKER_TAG}
NAMESPACE=gpu-operator

Expand Down Expand Up @@ -39,6 +39,7 @@ image: init-buildx
images:
make image COMPONENT=device-plugin
make image COMPONENT=status-updater
make image COMPONENT=kwok-gpu-device-plugin
make image COMPONENT=status-exporter
make image COMPONENT=topology-server
make image COMPONENT=mig-faker
Expand Down
16 changes: 16 additions & 0 deletions cmd/kwok-gpu-device-plugin/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package main

import (
"github.com/run-ai/fake-gpu-operator/internal/common/app"
"github.com/run-ai/fake-gpu-operator/internal/common/config"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
status_updater "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin"
)

func main() {
requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs}
config.ValidateConfig(requiredEnvVars)

appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
appRunner.Run()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Cluster-wide permissions for the kwok GPU device plugin:
#   * nodes and nodes/status: update, list, get, watch, patch
#   * configmaps:             get, update, create, list, delete
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: fake-kwok-gpu-device-plugin
rules:
# Core API group ("") - node objects and their status subresource.
- apiGroups:
- ""
resources:
- nodes
- nodes/status
verbs:
- update
- list
- get
- watch
- patch
# Core API group ("") - configmaps. Note that "watch" is not in this list.
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- update
- create
- list
- delete
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the ClusterRole "fake-kwok-gpu-device-plugin" to the
# "kwok-gpu-device-plugin" ServiceAccount in the Helm release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: ClusterRole
apiGroup: rbac.authorization.k8s.io
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
# Rendered by Helm to the namespace the chart is installed into.
namespace: "{{ .Release.Namespace }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Single-replica Deployment that runs the kwok-gpu-device-plugin container
# under the "kwok-gpu-device-plugin" ServiceAccount.
apiVersion: apps/v1
kind: Deployment
metadata:
name: kwok-gpu-device-plugin
annotations:
# SHA-256 of the rendered topology-cm.yml template.
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: kwok-gpu-device-plugin
spec:
# Both labels below must be present on the pod template for the selector to match.
selector:
matchLabels:
app: kwok-gpu-device-plugin
component: kwok-gpu-device-plugin
replicas: 1
template:
metadata:
annotations:
# Same checksum on the pod template; presumably so that a change to the
# topology ConfigMap template rolls the pods - confirm intent.
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: kwok-gpu-device-plugin
component: kwok-gpu-device-plugin
spec:
containers:
- name: kwok-gpu-device-plugin
# Image coordinates and resources come from the kwokGpuDevicePlugin section of values.yaml.
image: "{{ .Values.kwokGpuDevicePlugin.image.repository }}:{{ .Values.kwokGpuDevicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.kwokGpuDevicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.kwokGpuDevicePlugin.resources | nindent 12 }}
# Environment variables passed to the container; the two namespace values
# are both set to the release namespace.
env:
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: FAKE_GPU_OPERATOR_NAMESPACE
value: "{{ .Release.Namespace }}"
restartPolicy: Always
serviceAccountName: kwok-gpu-device-plugin
imagePullSecrets:
- name: gcr-secret
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Namespaced read access (list/get/watch) to ConfigMaps for the kwok GPU
# device plugin.
#
# The metadata.name must be "fake-kwok-gpu-device-plugin": the chart's
# RoleBinding of the same name points its roleRef at a Role with exactly this
# name, and a RoleBinding that references a non-existent Role grants nothing.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: fake-kwok-gpu-device-plugin
rules:
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - list
      - get
      - watch
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the namespaced Role "fake-kwok-gpu-device-plugin" to the
# "kwok-gpu-device-plugin" ServiceAccount in the Helm release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: fake-kwok-gpu-device-plugin
roleRef:
kind: Role
apiGroup: rbac.authorization.k8s.io
# Must match the metadata.name of a Role in the same namespace.
name: fake-kwok-gpu-device-plugin
subjects:
- kind: ServiceAccount
name: kwok-gpu-device-plugin
namespace: "{{ .Release.Namespace }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ServiceAccount named "kwok-gpu-device-plugin". No namespace is set in this
# manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
name: kwok-gpu-device-plugin
15 changes: 14 additions & 1 deletion deploy/fake-gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@ statusExporter:
memory: "200Mi"
topologyMaxExportInterval: 10s

# Default image coordinates and container resources for the
# kwok-gpu-device-plugin component.
kwokGpuDevicePlugin:
image:
pullPolicy: Always
repository: gcr.io/run-ai-lab/fake-gpu-operator/kwok-gpu-device-plugin
tag: 0.0.1
# Container resource requests and limits.
resources:
requests:
cpu: "100m"
memory: "200Mi"
limits:
cpu: "200m"
memory: "400Mi"

migFaker:
image:
pullPolicy: Always
Expand All @@ -72,4 +85,4 @@ topology:
gpuCount: 2
gpuMemory: 11441
nodePoolLabelKey: run.ai/simulated-gpu-node-pool
migStrategy: mixed
migStrategy: mixed
77 changes: 77 additions & 0 deletions internal/kwok-gpu-device-plugin/app.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package status_updater

import (
"sync"

"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/spf13/viper"
ctrl "sigs.k8s.io/controller-runtime"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
cmcontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-gpu-device-plugin/controllers/configmap"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers"
)

// Factory hooks for the cluster configuration and the Kubernetes clients.
// They are package-level variables rather than direct calls, so they can be
// reassigned - presumably by tests; confirm against callers.
var (
	// InClusterConfigFn returns the REST configuration used to talk to the
	// cluster. It defaults to controller-runtime's GetConfigOrDie.
	InClusterConfigFn = ctrl.GetConfigOrDie

	// KubeClientFn builds a typed Kubernetes client from c.
	KubeClientFn = func(c *rest.Config) kubernetes.Interface {
		return kubernetes.NewForConfigOrDie(c)
	}

	// DynamicClientFn builds a dynamic Kubernetes client from c.
	DynamicClientFn = func(c *rest.Config) dynamic.Interface {
		return dynamic.NewForConfigOrDie(c)
	}
)

// StatusUpdaterAppConfiguration is the configuration shape returned by
// StatusUpdaterApp.GetConfig. Both fields carry a mapstructure key and are
// tagged validate:"required".
type StatusUpdaterAppConfiguration struct {
// TopologyCmName is mapped from the TOPOLOGY_CM_NAME key.
TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"`
// TopologyCmNamespace is mapped from the TOPOLOGY_CM_NAMESPACE key.
TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"`
}

// StatusUpdaterApp is the application object for the kwok GPU device plugin.
// Init populates its fields; Run starts every controller and blocks until
// all of them have returned.
type StatusUpdaterApp struct {
// Controllers are the controllers started by Run, one goroutine each.
Controllers []controllers.Interface
// kubeClient is the Kubernetes client created in Init.
kubeClient kubernetes.Interface
// stopCh is handed to every controller's Run method.
stopCh chan struct{}
// wg tracks the controller goroutines started by Run.
wg *sync.WaitGroup
}

// Run starts every registered controller in its own goroutine, passing each
// one the app's stop channel, and blocks until all of them have returned.
// Init must have been called first so that the wait group is set.
func (app *StatusUpdaterApp) Run() {
	total := len(app.Controllers)
	app.wg.Add(total)

	for idx := 0; idx < total; idx++ {
		// The controller is passed as an argument so each goroutine works on
		// its own value.
		go func(c controllers.Interface) {
			defer app.wg.Done()
			c.Run(app.stopCh)
		}(app.Controllers[idx])
	}

	app.wg.Wait()
}

// Init prepares the app for Run: it stores stopCh, obtains the cluster REST
// configuration with QPS 100 and Burst 200, creates the wait group and the
// Kubernetes client, and registers a ConfigMap controller for the namespace
// read from the topology ConfigMap namespace setting.
func (app *StatusUpdaterApp) Init(stopCh chan struct{}) {
	app.stopCh = stopCh

	restCfg := InClusterConfigFn()
	restCfg.QPS, restCfg.Burst = 100, 200

	app.wg = new(sync.WaitGroup)
	app.kubeClient = KubeClientFn(restCfg)

	topologyNs := viper.GetString(constants.EnvTopologyCmNamespace)
	cmController := cmcontroller.NewConfigMapController(app.kubeClient, topologyNs)
	app.Controllers = append(app.Controllers, cmController)
}

// Name returns the application's name.
//
// NOTE(review): the reported name is "StatusUpdater" although this package
// lives under kwok-gpu-device-plugin - confirm whether this is intentional.
func (app *StatusUpdaterApp) Name() string {
	const appName = "StatusUpdater"
	return appName
}

// GetConfig returns a zero-valued StatusUpdaterAppConfiguration, by value
// rather than by pointer.
func (app *StatusUpdaterApp) GetConfig() interface{} {
	return StatusUpdaterAppConfiguration{}
}
Loading

0 comments on commit da9a600

Please sign in to comment.