initial commit
jpugliesi committed May 25, 2021
0 parents commit 87eedd7
Showing 6 changed files with 579 additions and 0 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/docker.yaml
@@ -0,0 +1,31 @@
name: Build and Push Docker Images

on: [push]

env:
  DOCKER_BUILDKIT: 1

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Login to Docker Hub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Download latest earthly
        run: "sudo /bin/sh -c 'wget https://github.com/earthly/earthly/releases/download/v0.5.13/earthly-linux-amd64 -O /usr/local/bin/earthly && chmod +x /usr/local/bin/earthly'"
      - name: Earthly version
        run: earthly --version
      - name: Build and Push All Images
        # Make this step dependent on the success of the previous one (Docker Hub credentials)
        if: success()
        run: |
          earthly --push --ci +build-spark-image
      # Only push latest tags on master branch
      - name: Push All Latest Images
        if: success() && github.ref == 'refs/heads/master'
        run: |
          earthly --push --ci +build-spark-image --build-arg container_tag_suffix=""
177 changes: 177 additions & 0 deletions Earthfile
@@ -0,0 +1,177 @@
ARG spark_version=3.1.1
ARG hadoop_version=3.2.0
ARG hive_version=2.3.7
ARG maven_version=3.6.3

# Spark runtime build options
ARG scala_version=2.12
ARG aws_java_sdk_version=1.11.797

ARG container_repo=viaductai/spark
ARG container_tag_prefix="${latest:+${latest}}spark-${spark_version}_hadoop-${hadoop_version}_hive-${hive_version}_k8s_aws"
ARG container_tag_suffix="-${EARTHLY_TARGET_TAG_DOCKER}"
ARG container_tag="${container_tag_prefix}${container_tag_suffix}"

build-deps:
    # use python base image because spark build process requires python
    FROM python:3.7-slim-stretch

    # TODO: use openjdk-11
    RUN echo "deb http://ftp.us.debian.org/debian sid main" >> /etc/apt/sources.list \
        && apt-get update \
        && mkdir -p /usr/share/man/man1 \
        && apt-get install -y git curl wget openjdk-8-jdk patch \
        && rm -rf /var/cache/apt/*

    # maven
    RUN cd /opt \
        && wget https://downloads.apache.org/maven/maven-3/${maven_version}/binaries/apache-maven-${maven_version}-bin.tar.gz \
        && tar zxvf /opt/apache-maven-${maven_version}-bin.tar.gz \
        && rm apache-maven-${maven_version}-bin.tar.gz

    ENV PATH=/opt/apache-maven-${maven_version}/bin:$PATH
    ENV MAVEN_HOME /opt/apache-maven-${maven_version}

    # configure the pentaho nexus repo to prevent build errors
    # similar to the following: https://github.com/apache/hudi/issues/2479
    COPY ./maven-settings.xml ${MAVEN_HOME}/conf/settings.xml
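    # (maven-settings.xml is expected to reroute the unreachable Pentaho nexus repository
    #  via a <mirror>/<repository> override; the exact URL lives in that file, not shown here)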

build-glue-hive-client:
    FROM +build-deps

    # Download and extract Apache hive source
    RUN wget https://github.com/apache/hive/archive/rel/release-${hive_version}.tar.gz -O hive.tar.gz
    RUN mkdir hive && tar xzf hive.tar.gz --strip-components=1 -C hive

    ## Build patched hive 2.3.7
    # https://github.com/awslabs/aws-glue-data-catalog-client-for-apache-hive-metastore/issues/26
    WORKDIR /hive
    # Patch copied from: https://issues.apache.org/jira/secure/attachment/12958418/HIVE-12679.branch-2.3.patch
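    # (HIVE-12679 allows a custom IMetaStoreClient implementation to be specified via HiveConf,
    #  which the Glue catalog client built below relies on)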
    COPY ./aws-glue-spark-hive-client/HIVE-12679.branch-2.3.patch hive.patch
    RUN patch -p0 <hive.patch && \
        mvn clean install -DskipTests

    # Now with hive patched and installed, build the glue client
    RUN git clone https://github.com/viaduct-ai/aws-glue-data-catalog-client-for-apache-hive-metastore /catalog

    WORKDIR /catalog

    RUN mvn clean package \
        -DskipTests \
        -Dhive2.version=${hive_version} \
        -Dhadoop.version=${hadoop_version} \
        -Daws.sdk.version=${aws_java_sdk_version} \
        -pl -aws-glue-datacatalog-hive2-client

build-spark:
    FROM +build-glue-hive-client

    ENV MAKEFLAGS -j 4

    # Build spark
    WORKDIR /

    RUN git clone https://github.com/apache/spark.git --branch v${spark_version} --single-branch && \
        cd /spark && \
        ./dev/make-distribution.sh \
            --name spark \
            --pip \
            -DskipTests \
            -Pkubernetes \
            -Phadoop-cloud \
            -P"hadoop-${hadoop_version%.*}" \
            -Dhadoop.version="${hadoop_version}" \
            -Dhive.version="${hive_version}" \
            -Phive \
            -Phive-thriftserver

    # copy the glue client jars to spark jars directory
    RUN find /catalog -name "*.jar" | grep -Ev "test|original" | xargs -I{} cp {} /spark/dist/jars/

    # swap the bundled aws-java-sdk for the pinned, newer version (needed for IRSA support)
    RUN rm /spark/dist/jars/aws-java-sdk-bundle-*.jar
    RUN wget --quiet https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_java_sdk_version}/aws-java-sdk-bundle-${aws_java_sdk_version}.jar -P /spark/dist/jars/
    RUN chmod 0644 /spark/dist/jars/aws-java-sdk-bundle*.jar

    # replace with guava version compatible with latest aws-java-sdk-bundle
    RUN rm -f /spark/dist/jars/guava-14.0.1.jar
    RUN wget --quiet https://repo1.maven.org/maven2/com/google/guava/guava/23.0/guava-23.0.jar -P /spark/dist/jars/
    RUN chmod 0644 /spark/dist/jars/guava-23.0.jar

    # save the final spark distribution
    SAVE ARTIFACT /spark/dist

# Build the spark docker images according to the official Docker images published with the project
# (https://github.com/apache/spark/tree/master/resource-managers/kubernetes/docker/src/main/dockerfiles/spark)
# This ensures that the layout of the container is "correct", which is
# important for spark features like graceful node decommissioning (/opt/decom.sh)
build-spark-image:
    FROM openjdk:8-jre-slim

    ARG spark_uid=185
    # Before building the docker image, first build and make a Spark distribution following
    # the instructions in http://spark.apache.org/docs/latest/building-spark.html.
    # If this docker file is being used in the context of building your images from a Spark
    # distribution, the docker build command should be invoked from the top level directory
    # of the Spark distribution. E.g.:
    # docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile .

    RUN set -ex && \
        sed -i 's/http:\/\/deb.\(.*\)/https:\/\/deb.\1/g' /etc/apt/sources.list && \
        apt-get update && \
        ln -s /lib /lib64 && \
        apt install -y bash tini libc6 libpam-modules krb5-user libnss3 procps && \
        mkdir -p /opt/spark && \
        mkdir -p /opt/spark/examples && \
        mkdir -p /opt/spark/work-dir && \
        mkdir -p /opt/spark/conf && \
        touch /opt/spark/RELEASE && \
        rm /bin/sh && \
        ln -sv /bin/bash /bin/sh && \
        echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
        chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
        rm -rf /var/cache/apt/*

    COPY +build-spark/dist/jars /opt/spark/jars
    COPY +build-spark/dist/bin /opt/spark/bin
    COPY +build-spark/dist/sbin /opt/spark/sbin
    COPY +build-spark/dist/kubernetes/dockerfiles/spark/entrypoint.sh /opt/
    COPY +build-spark/dist/kubernetes/dockerfiles/spark/decom.sh /opt/
    COPY +build-spark/dist/examples /opt/spark/examples
    COPY +build-spark/dist/kubernetes/tests /opt/spark/tests
    COPY +build-spark/dist/data /opt/spark/data

    # configure aws glue data catalog as the Hive Metastore client
    COPY ./conf/hive-site.xml /opt/spark/conf/hive-site.xml
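    # (hive-site.xml is expected to set hive.metastore.client.factory.class to
    #  com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory;
    #  the exact contents live in conf/hive-site.xml, which is not shown in this commit view)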

    ENV SPARK_HOME /opt/spark

    WORKDIR /opt/spark/work-dir
    RUN chmod g+w /opt/spark/work-dir
    RUN chmod a+x /opt/decom.sh

    ENTRYPOINT [ "/opt/entrypoint.sh" ]

    # Install python
    # Reset to root to run installation tasks
    USER 0

    RUN mkdir ${SPARK_HOME}/python
    RUN apt-get update && \
        apt install -y python3 python3-pip && \
        pip3 install --upgrade pip setuptools && \
        # Removed the .cache to save space
        rm -r /root/.cache && rm -rf /var/cache/apt/*

    COPY +build-spark/dist/python/pyspark ${SPARK_HOME}/python/pyspark
    COPY +build-spark/dist/python/lib ${SPARK_HOME}/python/lib

    ENV PATH "${PATH}:${SPARK_HOME}/bin"

    WORKDIR /opt/spark/work-dir
    ENTRYPOINT [ "/opt/entrypoint.sh" ]

    # Specify the User that the actual main process will run as
    ARG spark_uid=185
    USER ${spark_uid}

    SAVE IMAGE --push ${container_repo}:${container_tag}
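    # given the ARGs above, the resulting tag looks roughly like:
    #   viaductai/spark:spark-3.1.1_hadoop-3.2.0_hive-2.3.7_k8s_aws-<git branch or tag>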
30 changes: 30 additions & 0 deletions README.md
@@ -0,0 +1,30 @@
# spark-k8s-aws

Build for a Spark-on-Kubernetes-ready Docker image configured with notable AWS dependencies, including:
* An up-to-date AWS SDK capable of supporting [IRSA](https://aws.amazon.com/blogs/opensource/introducing-fine-grained-iam-roles-service-accounts/)
* [AWS Glue Data Catalog client for Hive Metastore](https://github.com/viaduct-ai/aws-glue-data-catalog-client-for-apache-hive-metastore)
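
The Earthfile bakes a `conf/hive-site.xml` into the image (at `/opt/spark/conf/hive-site.xml`) so that Spark's Hive client talks to the Glue Data Catalog. A minimal sketch of such a file (the exact contents shipped in this repo may differ):

```xml
<!-- minimal hive-site.xml sketch; the file shipped in conf/ may set additional properties -->
<configuration>
  <property>
    <name>hive.metastore.client.factory.class</name>
    <value>com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory</value>
  </property>
</configuration>
```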

## Build the Docker Image
Builds are managed using https://earthly.dev

```
earthly --use-inline-cache +build-spark-image
```
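
The image repository and tag can be overridden through the `ARG`s declared at the top of the Earthfile, e.g. `earthly --push +build-spark-image --build-arg container_repo=<your-repo>` (the CI workflow overrides `container_tag_suffix` the same way).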

Use in your own Earthfile build:
```
my-image:
    FROM github.com/viaduct-ai/docker-spark-k8s-aws+build-spark-image
    # ...
```

## Why?
If you've ever tried building a spark distribution/image with the AWS Glue Data
Catalog Client for Hive, you know it's a PITA.

This project aims to open source a working Docker image, built using the
amazing [Earthly](https://earthly.dev) project, to democratize an integrated
Spark-on-Kubernetes-on-AWS experience until someone develops a Spark
DataSourceV2 API-compliant Glue Data Catalog implementation (which won't
require the absolute hack of patching Hive and building Spark from source).
