From f44efb8115ab89db76bd7f606b80751187436fa3 Mon Sep 17 00:00:00 2001 From: Rafael Felix Correa Date: Thu, 14 Nov 2019 18:34:06 +0100 Subject: [PATCH] initial commit --- .envrc | 3 + .gitignore | 4 ++ Makefile | 103 ++++++++++++++++++++++++++++++++ README.md | 56 +++++++++++++++++ bin/.gitkeep | 0 generate_clustermode_podspec.sh | 64 ++++++++++++++++++++ get_image_name.sh | 4 ++ registry-values.yaml | 3 + tmp/.gitkeep | 0 9 files changed, 237 insertions(+) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 bin/.gitkeep create mode 100755 generate_clustermode_podspec.sh create mode 100755 get_image_name.sh create mode 100644 registry-values.yaml create mode 100644 tmp/.gitkeep diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..104fa72 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +export KUBECONFIG=$(pwd)/kubeconfig +export HELM_HOME=$(pwd)/.helm +export SPARK_HOME=$(pwd)/tmp/spark diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6adcf65 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +tmp/ +bin/ +kubeconfig +.helm/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..141b342 --- /dev/null +++ b/Makefile @@ -0,0 +1,103 @@ +SPARK_VERSION ?= 2.4.4 +SPARK_VERSION_SUFFIX ?= -bin-hadoop2.7 +K8S_VERSION ?= v1.15.4 +HELM_VERSION ?= v2.14.2 +MINIKUBE_VERSION ?= latest +MINIKUBE_VMDRIVER ?= virtualbox +MIRROR ?= archive.apache.org + +OS ?= $(shell uname -s | tr '[:upper:]' '[:lower:]') +ARCH ?= amd64 + +.PHONY: all +all: k8s-tooling start-minikube helm-init start-registry + +################# +## k8s tooling ## +################# + +bin/kubectl: + curl -Lo bin/kubectl https://storage.googleapis.com/kubernetes-release/release/$(K8S_VERSION)/bin/$(OS)/$(ARCH)/kubectl + chmod +x bin/kubectl + +tmp/helm: + curl -Lo tmp/helm.tar.gz https://get.helm.sh/helm-$(HELM_VERSION)-$(OS)-$(ARCH).tar.gz + tar xvzf tmp/helm.tar.gz + mv $(OS)-$(ARCH) tmp/helm + 
rm -f tmp/helm.tar.gz + +bin/helm: tmp/helm + cp -a tmp/helm/helm bin/helm + chmod +x bin/helm + +bin/tiller: tmp/helm + cp -a tmp/helm/tiller bin/tiller + chmod +x bin/tiller + +bin/minikube: + curl -Lo bin/minikube https://storage.googleapis.com/minikube/releases/$(MINIKUBE_VERSION)/minikube-$(OS)-$(ARCH) + chmod +x bin/minikube + +.PHONY: helm-init +helm-init: bin/helm bin/tiller + ./bin/helm init --wait + +.PHONY: k8s-tooling +k8s-tooling: bin/kubectl bin/helm bin/tiller bin/minikube + +############## +## Minikube ## +############## + +.PHONY: start-minikube +start-minikube: bin/minikube + ./bin/minikube start --cpus=4 --memory=4000mb --vm-driver=$(MINIKUBE_VMDRIVER) --kubernetes-version=$(K8S_VERSION) + +.PHONY: stop-minikube +stop-minikube: bin/minikube + ./bin/minikube stop + +##################### +## Docker registry ## +##################### + +.PHONY: start-registry +start-registry: + ./bin/helm upgrade --install --wait registry -f registry-values.yaml stable/docker-registry + echo "Registry successfully deployed in minikube. Make sure you add $(shell minikube ip):30000 to your insecure registries before continuing. Check https://docs.docker.com/registry/insecure/ for more information on how to do it in your platform." 
+ +.PHONY: stop-registry +stop-registry: + ./bin/helm delete --purge registry + +############################################################################### +## Spark docker image building ## +## see: https://github.com/apache/spark/blob/master/bin/docker-image-tool.sh ## +############################################################################### + +tmp/spark.tgz: + curl -Lo tmp/spark.tgz https://${MIRROR}/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}${SPARK_VERSION_SUFFIX}.tgz + +# preventing issue https://issues.apache.org/jira/browse/SPARK-28921 from happening +.PHONY: patch-SPARK-28921 +patch-SPARK-28921: + curl -Lo tmp/kubernetes-model-4.4.2.jar https://repo1.maven.org/maven2/io/fabric8/kubernetes-model/4.4.2/kubernetes-model-4.4.2.jar + curl -Lo tmp/kubernetes-model-common-4.4.2.jar https://repo1.maven.org/maven2/io/fabric8/kubernetes-model-common/4.4.2/kubernetes-model-common-4.4.2.jar + curl -Lo tmp/kubernetes-client-4.4.2.jar https://repo1.maven.org/maven2/io/fabric8/kubernetes-client/4.4.2/kubernetes-client-4.4.2.jar + +tmp/spark: tmp/spark.tgz patch-SPARK-28921 + cd tmp && tar xvzf spark.tgz && mv spark-${SPARK_VERSION}${SPARK_VERSION_SUFFIX} spark && rm -rfv spark/jars/kubernetes-*.jar && cp -av kubernetes-*.jar spark/jars/ + +.PHONY: docker-build +docker-build: tmp/spark + cd tmp/spark && ./bin/docker-image-tool.sh -r $(shell minikube ip):30000 -t latest build + +.PHONY: docker-push +docker-push: + cd tmp/spark && ./bin/docker-image-tool.sh -r $(shell minikube ip):30000 -t latest push + +.PHONY: clean +clean: + echo "Make sure you remove $(shell minikube ip):30000 from your list of insecure registries." 
+ ./bin/minikube delete + rm -rf tmp/* bin/* diff --git a/README.md b/README.md new file mode 100644 index 0000000..4a60f07 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Spark on K8s + +## Objective + +A practical example on how to run Spark on kubernetes + +Reference: + +## Pre-requisites + +- [docker](https://docs.docker.com/install/) +- [direnv](https://direnv.net/docs/installation.html) +- [make](https://www.gnu.org/software/make/) +- [curl](https://curl.haxx.se/) +- [tar](https://www.gnu.org/software/tar/) + +A hypervisor for running minikube. Check possibilities [here](https://minikube.sigs.k8s.io/docs/reference/drivers/). The recommended one is [VirtualBox](https://www.virtualbox.org/wiki/Downloads). + +## Getting started + +```bash +# this will install k8s tooling locally, start minikube, initialize helm and deploy a docker registry chart to your minikube +make + +# if everything goes well, you should see a message like this: Registry successfully deployed in minikube. Make sure you add 192.168.99.105:30000 to your insecure registries before continuing. Check https://docs.docker.com/registry/insecure/ for more information on how to do it in your platform. + +# build the spark images +make docker-build + +# push the spark images to our private docker registry +make docker-push +# HINT: if you see "Get https://192.168.99.105:30000/v2/: http: server gave HTTP response to HTTPS client" go back and check whether you have it listed in your insecure registries + +# once your images are pushed, let's run a sample spark job (first on client mode) +$SPARK_HOME/bin/spark-submit \ --master k8s://https://$(minikube ip):8443 \ --deploy-mode client \ --conf spark.kubernetes.container.image=$(./get_image_name.sh spark) \ --class org.apache.spark.examples.SparkPi \ $SPARK_HOME/examples/jars/spark-examples_2.11-2.4.4.jar + +# ... 
and now, the same job but from within a pod in cluster mode +./generate_clustermode_podspec.sh +./bin/kubectl apply -f clustermode-podspec-with-rbac.yaml # make sure you check the contents of this file to understand better how it works + +# in case you want to rerun the example above, make sure you delete the pod first +./bin/kubectl delete pod spark-submit-example + +# check the executor pods in another terminal window while running +./bin/kubectl get pods -w + +# ... + +# deletes minikube and clean up downloaded tools +make clean +``` diff --git a/bin/.gitkeep b/bin/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/generate_clustermode_podspec.sh b/generate_clustermode_podspec.sh new file mode 100755 index 0000000..fa8d42b --- /dev/null +++ b/generate_clustermode_podspec.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +cat > clustermode-podspec-with-rbac.yaml << EOF +apiVersion: v1 +kind: ServiceAccount +metadata: + name: spark +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: spark-cluster-role +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "watch", "list", "create", "delete"] +- apiGroups: [""] # "" indicates the core API group + resources: ["services"] + verbs: ["get", "create", "delete"] +- apiGroups: [""] # "" indicates the core API group + resources: ["configmaps"] + verbs: ["get", "create", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: spark-cluster-role-binding +subjects: +- kind: ServiceAccount + name: spark + namespace: default +roleRef: + kind: ClusterRole + name: spark-cluster-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: v1 +kind: Pod +metadata: + name: spark-submit-example +spec: + serviceAccountName: spark + containers: + - name: spark-submit-example + args: + - /opt/spark/bin/spark-submit + - --master + - 
k8s://https://\$(KUBERNETES_PORT_443_TCP_ADDR):\$(KUBERNETES_PORT_443_TCP_PORT) + - --deploy-mode + - cluster + - --conf + - spark.kubernetes.container.image=$(./get_image_name.sh spark) + - --conf + - spark.kubernetes.authenticate.driver.serviceAccountName=spark + - --class + - org.apache.spark.examples.SparkPi + - local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar + env: + - name: SPARK_HOME + value: /opt/spark + resources: {} + image: $(./get_image_name.sh spark):latest + imagePullPolicy: Always +EOF diff --git a/get_image_name.sh b/get_image_name.sh new file mode 100755 index 0000000..9aca0a5 --- /dev/null +++ b/get_image_name.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# run this script after running make, otherwise it will fail +./bin/kubectl get svc registry-docker-registry -o=jsonpath='{.spec.clusterIP}':5000/$1 diff --git a/registry-values.yaml b/registry-values.yaml new file mode 100644 index 0000000..d26efdb --- /dev/null +++ b/registry-values.yaml @@ -0,0 +1,3 @@ +service: + type: NodePort + nodePort: 30000 diff --git a/tmp/.gitkeep b/tmp/.gitkeep new file mode 100644 index 0000000..e69de29