From a8c07cff82e20a8b0f8dc9c54491e41cbe206d1a Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 18 Jun 2024 14:37:54 -0500 Subject: [PATCH] Update RAPIDS accelerated UDF Dockerfile to better match spark-rapids-jni Signed-off-by: Jason Lowe --- .../RAPIDS-accelerated-UDFs/Dockerfile | 98 ++++++++----------- .../RAPIDS-accelerated-UDFs/README.md | 10 +- 2 files changed, 45 insertions(+), 63 deletions(-) diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile index b5ef1cc0c..f1c252fb5 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile @@ -16,72 +16,54 @@ # A container that can be used to build UDF native code against libcudf ARG CUDA_VERSION=11.8.0 -ARG LINUX_VERSION=ubuntu18.04 +ARG LINUX_VERSION=rockylinux8 FROM nvidia/cuda:${CUDA_VERSION}-devel-${LINUX_VERSION} -ARG DEBIAN_FRONTEND=noninteractive +ARG TOOLSET_VERSION=11 +ENV TOOLSET_VERSION=11 ARG PARALLEL_LEVEL=10 ENV PARALLEL_LEVEL=10 -RUN GCC_VERSION=$(bash -c '\ -CUDA_VERSION=$(nvcc --version | head -n4 | tail -n1 | cut -d" " -f5 | cut -d"," -f1); \ -CUDA_VERSION_MAJOR=$(echo $CUDA_VERSION | tr -d '.' | cut -c 1-2); \ -CUDA_VERSION_MINOR=$(echo $CUDA_VERSION | tr -d '.' | cut -c 3); \ - if [[ "$CUDA_VERSION_MAJOR" == 9 ]]; then echo "7"; \ - elif [[ "$CUDA_VERSION_MAJOR" == 10 ]]; then echo "8"; \ - elif [[ "$CUDA_VERSION_MAJOR" == 11 ]]; then echo "9"; \ - else echo "10"; \ - fi') \ -&& apt update -y \ -&& apt install -y software-properties-common \ -&& add-apt-repository -y ppa:git-core/ppa \ -&& add-apt-repository -y ppa:ubuntu-toolchain-r/test \ -&& add-apt-repository ppa:deadsnakes/ppa \ -&& apt update -y \ -&& apt install -y \ - build-essential git rsync wget \ - gcc-${GCC_VERSION} g++-${GCC_VERSION} \ - openjdk-8-jdk maven tzdata \ - # CMake dependencies - curl libssl-dev libcurl4-openssl-dev zlib1g-dev \ -&& apt autoremove -y \ -&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ -&& update-alternatives \ - --install /usr/bin/gcc gcc /usr/bin/gcc-${GCC_VERSION} 100 \ -&& update-alternatives \ - --install /usr/bin/g++ g++ /usr/bin/g++-${GCC_VERSION} 100 \ -# Set gcc-${GCC_VERSION} as the default gcc -&& update-alternatives --set gcc /usr/bin/gcc-${GCC_VERSION} \ -# Set gcc-${GCC_VERSION} as the default g++ -&& update-alternatives --set g++ /usr/bin/g++-${GCC_VERSION} \ -# Set JDK8 as the default Java -&& update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java +### Install basic requirements +RUN dnf --enablerepo=powertools install -y \ + gcc-toolset-${TOOLSET_VERSION} \ + git \ + java-1.8.0-openjdk \ + maven \ + ninja-build \ + patch \ + python39 \ + scl-utils \ + tar \ + wget \ + zlib-devel \ + && alternatives --set python /usr/bin/python3 +# 3.22.3: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache ARG CMAKE_VERSION=3.26.4 +# default x86_64 from x86 build, aarch64 cmake for arm build +ARG CMAKE_ARCH=x86_64 +RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ + tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ + rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz +ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH -# Install CMake -RUN cd /tmp \ - && curl -fsSLO --compressed "https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION.tar.gz" -o /tmp/cmake-$CMAKE_VERSION.tar.gz \ - && tar -xvzf /tmp/cmake-$CMAKE_VERSION.tar.gz && cd /tmp/cmake-$CMAKE_VERSION \ - && /tmp/cmake-$CMAKE_VERSION/bootstrap \ - --system-curl \ - --parallel=${PARALLEL_LEVEL} \ - && make install -j${PARALLEL_LEVEL} \ - && cd /tmp && rm -rf /tmp/cmake-$CMAKE_VERSION* - -# Install ccache +# ccache for interactive builds ARG CCACHE_VERSION=4.6 RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \ - tar zxf ccache-${CCACHE_VERSION}.tar.gz && \ - rm ccache-${CCACHE_VERSION}.tar.gz && \ - cd ccache-${CCACHE_VERSION} && \ - mkdir build && \ - cd build && \ - cmake .. \ - -DCMAKE_BUILD_TYPE=Release \ - -DZSTD_FROM_INTERNET=ON \ - -DREDIS_STORAGE_BACKEND=OFF && \ - cmake --build . --parallel ${PARALLEL_LEVEL} --target install && \ - cd ../.. && \ - rm -rf ccache-${CCACHE_VERSION} + tar zxf ccache-${CCACHE_VERSION}.tar.gz && \ + rm ccache-${CCACHE_VERSION}.tar.gz && \ + cd ccache-${CCACHE_VERSION} && \ + mkdir build && \ + cd build && \ + scl enable gcc-toolset-${TOOLSET_VERSION} \ + "cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DZSTD_FROM_INTERNET=ON \ + -DREDIS_STORAGE_BACKEND=OFF && \ + cmake --build . --parallel ${PARALLEL_LEVEL} --target install" && \ + cd ../.. && \ + rm -rf ccache-${CCACHE_VERSION} + +ENTRYPOINT /usr/bin/scl enable gcc-toolset-${TOOLSET_VERSION} -- bash diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md index 7ab7d2fe3..579f718c8 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md @@ -122,8 +122,8 @@ Run the following commands to build and start a docker ```bash cd spark-rapids-examples/examples/UDF-Examples/RAPIDS-accelerated-UDFs -docker build -t my-local:my-udf-example-ubuntu . -nvidia-docker run -it my-local:my-udf-example-ubuntu +docker build -t my-local:my-udf-example . +nvidia-docker run -it my-local:my-udf-example ``` ### Build the udf-examples jar @@ -139,7 +139,7 @@ export CCACHE_DIR="$LOCAL_CCACHE_DIR" export CMAKE_C_COMPILER_LAUNCHER="ccache" export CMAKE_CXX_COMPILER_LAUNCHER="ccache" export CMAKE_CUDA_COMPILER_LAUNCHER="ccache" -export CMAKE_CXX_LINKER_LAUNCHER="ccache +export CMAKE_CXX_LINKER_LAUNCHER="ccache" mvn clean package -Pudf-native-examples ``` @@ -206,9 +206,9 @@ $SPARK_HOME/bin/pyspark --master local[*] \ ### Test native based UDF -Input the following commands to test wordcount JIN UDF +Input the following commands to test wordcount JNI UDF -```bash +```python from pyspark.sql.types import * schema = StructType([ StructField("c1", StringType()),