Skip to content

Commit

Permalink
feat: improve gpu-provisioner based on sigs.k8s.io/karpenter
Browse files Browse the repository at this point in the history
1. upgrade CRD: Machine to NodeClaim
2. update aws/karpenter-core to sigs.k8s.io/karpenter
3. add webhook for v1beta1.NodeClaim and v1.NodeClaim conversion
4. add instance garbage collection controller for cleaning up leaked cloud provider instances and nodes.
5. remove unused files like sku, pricing, instancetype, etc.
6. improve nodeclaim launch error cases: if the returned error is InvalidParameterError, LocationRestrictionError or InsufficientCapacityError, the [nodeclaim launch] controller publishes a warning event and then deletes the nodeclaim, because these errors are not recoverable and it is not necessary to retry creating the agentpool.
7. add unit test cases

Signed-off-by: rambohe-ch <[email protected]>
  • Loading branch information
rambohe-ch committed Dec 8, 2024
1 parent 46e16e0 commit 3304c0b
Show file tree
Hide file tree
Showing 2,750 changed files with 245,484 additions and 391,909 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/markdown-link-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,6 @@ jobs:
use-quiet-mode: 'yes'
# this will show detailed HTTP status for checked links
use-verbose-mode: 'yes'
config-file: '.github/markdown.links.config.json'
folder-path: '.'
check-modified-files-only: 'yes'
base-branch: 'main'
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ _output/
/hack/tools/bin/
/coverage.txt
/coverage.out
gpu-provisioner-values.yaml
9 changes: 5 additions & 4 deletions .golangci.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
run:
deadline: 10m
timeout: 15m

linters:
disable-all: true
enable:
- deadcode
- gosimple
- govet
- ineffassign
- misspell
- nakedret
- nilerr
- prealloc
- structcheck
- typecheck
- unused
- varcheck
- gci
- gofmt
- goimports
- staticcheck
# Run with --fast=false for more extensive checks
fast: true
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
FROM --platform=$BUILDPLATFORM golang:1.22 as builder
ARG TARGETOS
ARG TARGETARCH
ARG KARPENTERVER

WORKDIR /workspace
# Copy the Go Modules manifests
Expand All @@ -27,7 +28,7 @@ COPY vendor/ vendor/
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
RUN --mount=type=cache,target=${GOCACHE} \
--mount=type=cache,id=gpu-provisioner-controller,sharing=locked,target=/go/pkg/mod \
CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} GO111MODULE=on go build -a -o manager cmd/main.go
CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} GO111MODULE=on go build -a -o manager -ldflags "-X sigs.k8s.io/karpenter/pkg/operator.Version=${KARPENTERVER}" cmd/main.go

# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
Expand Down
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ endif
TOOLS_DIR := hack/tools
TOOLS_BIN_DIR := $(abspath $(TOOLS_DIR)/bin)

GOLANGCI_LINT_VER := v1.54.1
GOLANGCI_LINT_VER := v1.61.0
GOLANGCI_LINT_BIN := golangci-lint
GOLANGCI_LINT := $(abspath $(TOOLS_BIN_DIR)/$(GOLANGCI_LINT_BIN)-$(GOLANGCI_LINT_VER))

Expand All @@ -38,7 +38,9 @@ BUILD_DATE_VAR := $(REPO_PATH)/pkg/version.BuildDate
BUILD_DATE := $$(date +%Y-%m-%d-%H:%M)
GIT_VAR := $(REPO_PATH)/pkg/version.GitCommit
GIT_HASH := $$(git rev-parse --short HEAD)
LDFLAGS ?= "-X $(BUILD_DATE_VAR)=$(BUILD_DATE) -X $(BUILD_VERSION_VAR)=$(IMAGE_VERSION) -X $(GIT_VAR)=$(GIT_HASH)"
KARPENTER_VERSION_KEY := sigs.k8s.io/karpenter/pkg/operator.Version
KARPENTER_VERSION_VAL := $(shell git describe --tags --always | cut -d"v" -f2)
LDFLAGS ?= "-X $(BUILD_DATE_VAR)=$(BUILD_DATE) -X $(BUILD_VERSION_VAR)=$(IMAGE_VERSION) -X $(GIT_VAR)=$(GIT_HASH) -X $(KARPENTER_VERSION_KEY)=$(KARPENTER_VERSION_VAL)"

# AKS INT/Staging Test
AZURE_SUBSCRIPTION_ID ?= ff05f55d-22b5-44a7-b704-f9a8efd493ed
Expand Down Expand Up @@ -138,6 +140,7 @@ docker-build: docker-buildx
--output=$(OUTPUT_TYPE) \
--platform="linux/$(ARCH)" \
--pull \
--build-arg="KARPENTERVER=$(KARPENTER_VERSION_VAL)" \
--tag $(REGISTRY)/$(IMG_NAME):$(IMG_TAG) .


Expand Down
293 changes: 0 additions & 293 deletions charts/gpu-provisioner/crds/karpenter.sh_machines.yaml

This file was deleted.

825 changes: 825 additions & 0 deletions charts/gpu-provisioner/crds/karpenter.sh_nodeclaims.yaml

Large diffs are not rendered by default.

37 changes: 26 additions & 11 deletions charts/gpu-provisioner/templates/clusterrole-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,37 +29,52 @@ metadata:
{{- end }}
rules:
# Read
- apiGroups: [ "karpenter.sh" ]
resources: [ "provisioners" ]
verbs: [ "get", "list", "watch" ]
- apiGroups: ["karpenter.sh"]
resources: ["machines", "machines/status"]
resources: ["nodeclaims", "nodeclaims/status"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["pods", "nodes", "persistentvolumes", "persistentvolumeclaims", "replicationcontrollers", "namespaces"]
resources: ["pods", "nodes", "persistentvolumes", "persistentvolumeclaims", "replicationcontrollers", "namespaces", "configmaps", "secrets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses", "csinodes"]
resources: ["storageclasses", "csinodes", "volumeattachments"]
verbs: ["get", "watch", "list"]
- apiGroups: ["apps"]
resources: ["daemonsets", "deployments", "replicasets", "statefulsets"]
verbs: ["list", "watch"]
- apiGroups: [ "policy" ]
resources: [ "poddisruptionbudgets" ]
verbs: [ "get", "list", "watch" ]
- apiGroups: ["policy"]
resources: ["poddisruptionbudgets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apiextensions.k8s.io"]
resources: ["customresourcedefinitions"]
verbs: ["list", "watch"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["list", "watch"]
# Write
- apiGroups: ["karpenter.sh"]
resources: ["machines", "machines/status"]
resources: ["nodeclaims", "nodeclaims/status"]
verbs: ["create", "delete", "update", "patch"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["patch", "delete"]
verbs: ["patch", "delete", "update"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["delete"]
- apiGroups: ["apiextensions.k8s.io"]
resources: ["customresourcedefinitions"]
verbs: ["update", "patch"]
- apiGroups: [""]
resources: ["secrets"]
verbs: ["update"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["delete"]
{{- with .Values.additionalClusterRoleRules -}}
{{ toYaml . | nindent 2 }}
{{- end -}}
Expand Down
40 changes: 0 additions & 40 deletions charts/gpu-provisioner/templates/configmap-logging.yaml

This file was deleted.

13 changes: 0 additions & 13 deletions charts/gpu-provisioner/templates/configmap.yaml

This file was deleted.

11 changes: 10 additions & 1 deletion charts/gpu-provisioner/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ spec:
{{- with .Values.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
checksum/settings: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
Expand Down Expand Up @@ -74,6 +73,16 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: KARPENTER_SERVICE
value: {{ include "gpu-provisioner.fullname" . }}
- name: METRICS_PORT
value: "{{ .Values.controller.metrics.port }}"
- name: HEALTH_PROBE_PORT
value: "{{ .Values.controller.healthProbe.port }}"
- name: WEBHOOK_PORT
value: "{{ .Values.controller.webhook.port }}"
- name: WEBHOOK_METRICS_PORT
value: "{{ .Values.controller.webhookMetrics.port }}"
{{- with .Values.controller.env }}
{{- toYaml . | nindent 12 }}
{{- end }}
Expand Down
34 changes: 0 additions & 34 deletions charts/gpu-provisioner/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,7 @@ rules:
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "watch"]
- apiGroups: [""]
resources: ["configmaps", "namespaces", "secrets"]
verbs: ["get", "list", "watch"]
# Write
- apiGroups: [""]
resources: ["secrets"]
verbs: ["update"]
resourceNames: ["{{ include "gpu-provisioner.fullname" . }}-cert"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["update", "patch", "delete"]
resourceNames:
- gpu-provisioner-global-settings
- config-logging
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["patch", "update"]
Expand All @@ -38,24 +25,3 @@ rules:
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "gpu-provisioner.fullname" . }}-dns
namespace: kube-system
labels:
{{- include "gpu-provisioner.labels" . | nindent 4 }}
{{- with .Values.additionalAnnotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
rules:
# Read
- apiGroups: [""]
resources: ["services"]
resourceNames: ["kube-dns"]
verbs: ["get"]
20 changes: 0 additions & 20 deletions charts/gpu-provisioner/templates/rolebinding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,6 @@ roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "gpu-provisioner.fullname" . }}
subjects:
- kind: ServiceAccount
name: gpu-provisioner
namespace: {{ .Release.Namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "gpu-provisioner.fullname" . }}-dns
namespace: kube-system
labels:
{{- include "gpu-provisioner.labels" . | nindent 4 }}
{{- with .Values.additionalAnnotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "gpu-provisioner.fullname" . }}-dns
subjects:
- kind: ServiceAccount
name: gpu-provisioner
Expand Down
5 changes: 5 additions & 0 deletions charts/gpu-provisioner/templates/secret.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
apiVersion: v1
kind: Secret
metadata:
name: {{ include "gpu-provisioner.fullname" . }}-cert
namespace: {{ .Release.Namespace }}
30 changes: 30 additions & 0 deletions charts/gpu-provisioner/templates/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "gpu-provisioner.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "gpu-provisioner.labels" . | nindent 4 }}
{{- if or .Values.additionalAnnotations .Values.service.annotations }}
annotations:
{{- with .Values.additionalAnnotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.service.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
spec:
type: ClusterIP
ports:
- name: http-metrics
port: {{ .Values.controller.metrics.port }}
protocol: TCP
- name: webhook
port: {{ .Values.controller.webhook.port }}
protocol: TCP
- name: webhook-metrics
port: {{ .Values.controller.webhookMetrics.port }}
protocol: TCP
selector:
{{- include "gpu-provisioner.selectorLabels" . | nindent 4 }}
11 changes: 10 additions & 1 deletion charts/gpu-provisioner/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ additionalAnnotations: {}
imagePullPolicy: IfNotPresent
# -- Image pull secrets for Docker images.
imagePullSecrets: []
service:
# -- Additional annotations for the Service.
annotations: {}
serviceAccount:
# -- Specifies if a ServiceAccount should be created.
create: true
Expand Down Expand Up @@ -142,10 +145,16 @@ controller:
logEncoding: ""
metrics:
# -- The container port to use for metrics.
port: 8000
port: 8080
healthProbe:
# -- The container port to use for http health probe.
port: 8081
webhook:
# -- The container port to use for the webhook.
port: 8443
webhookMetrics:
# -- The container port to use for webhook metrics.
port: 8001
# -- Global log level
logLevel: debug
# -- Global log encoding
Expand Down
Loading

0 comments on commit 3304c0b

Please sign in to comment.