Skip to content

Commit

Permalink
[HugeCTR]Add a new base for hugectr (#1098)
Browse files Browse the repository at this point in the history
* Add ctr base image and install ctr components in merlin-hugectr

* Use same version for pytorch base image

* Correct torch python folder name for new version

* Add merlin and test script for ctr-base

* Update test script for new ctr-base and merlin-hugectr

* Fix typo

* Remove some packages hugectr may not use

* Add back keras since SOK uses it

* Refactor dockerfiles

* Correct relative path

* Upgrade upstream image to 24.03

* Remove libboost, which is not in the Triton container

* Add libhdf5-dev

* Add execution privilege for test scripts

* Remove unused test script

* correct for base version
  • Loading branch information
EmmaQiaoCh authored Jun 14, 2024
1 parent c0f43d6 commit c4eb92d
Show file tree
Hide file tree
Showing 4 changed files with 424 additions and 14 deletions.
20 changes: 20 additions & 0 deletions ci/container_hugectr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Runs the HugeCTR-specific unit tests for a container.
#
# Usage: container_hugectr.sh <container> <devices>
#   container  Name of the container under test. Tests are only run when
#              this is exactly "merlin-hugectr"; any other value is a no-op.
#   devices    Device selection, forwarded unchanged as the second argument
#              of /hugectr/ci/test_unit.sh.
#
# Exit status: 1 if any invoked test_unit.sh run failed, 0 otherwise
# (including the case where no tests were run).

container=$1
devices=$2

echo "##############"
echo "# Unit tests #"
echo "##############"

exit_code=0

## Test HugeCTR
if [[ "$container" == "merlin-hugectr" ]]; then
  echo "Run unit tests for HugeCTR"
  # Arguments are quoted so each one reaches test_unit.sh as exactly one
  # word, even if it contains whitespace or glob characters.
  # A failure is recorded but does not stop the second run below.
  /hugectr/ci/test_unit.sh "$container" "$devices" || exit_code=1
  echo "Run unit tests for merlin-sok"
  # The merlin-sok run invokes test_unit.sh under the "merlin-tensorflow" name.
  /hugectr/ci/test_unit.sh "merlin-tensorflow" "$devices" || exit_code=1
fi

exit "$exit_code"
9 changes: 7 additions & 2 deletions ci/test_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ if [ $container != 'merlin-ci-runner' ]; then
fi

${ci_script_dir}container_software.sh $container $devices
${ci_script_dir}container_integration.sh $container $devices $suppress_failures
${ci_script_dir}container_unit.sh $container $devices

# Choose which test scripts to run based on the container name:
#   - 'merlin-hugectr' runs only the HugeCTR-specific script;
#   - 'ctr-base' runs none of the scripts in this block;
#   - any other container runs the integration and unit scripts.
if [ $container == 'merlin-hugectr' ]; then
${ci_script_dir}container_hugectr.sh $container $devices
elif [ $container != 'ctr-base' ]; then
# Only the integration script receives $suppress_failures.
${ci_script_dir}container_integration.sh $container $devices $suppress_failures
${ci_script_dir}container_unit.sh $container $devices
fi

108 changes: 96 additions & 12 deletions docker/dockerfile.ctr
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# syntax=docker/dockerfile:1.2
ARG MERLIN_VERSION=23.06
ARG TRITON_VERSION=23.06
ARG MERLIN_VERSION=24.06
ARG TRITON_VERSION=24.03

ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/merlin-base:${MERLIN_VERSION}
ARG BASE_IMAGE=nvcr.io/nvstaging/merlin/ctr-base:${MERLIN_VERSION}

FROM ${BASE_IMAGE} as base

ARG HUGECTR_VER=main
ARG HUGECTR_BACKEND_VER=main

RUN pip install --no-cache-dir --upgrade notebook ipython
RUN pip install --no-cache-dir mpi4py
RUN pip install --no-cache-dir --upgrade notebook ipython mpi4py

# Install CUDA-Aware hwloc
ARG HWLOC_VER=2.4.1
Expand Down Expand Up @@ -45,22 +44,86 @@ ENV SHARP_COLL_NUM_COLL_GROUP_RESOURCE_ALLOC_THRESHOLD=0
ENV SHARP_COLL_LOCK_ON_COMM_INIT=1
ENV SHARP_COLL_LOG_LEVEL=3
ENV HCOLL_ENABLE_MCAST=0
ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
SOK_COMPILE_UNIT_TEST=ON

# link sub modules expected by hugectr cmake
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')

# Install HugeCTR
# Optional dependency: Build and install protocol buffers and Hadoop/HDFS.
ARG INSTALL_HDFS=false
# Env for HDFS
ENV HADOOP_HOME=/opt/hadoop
ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \
HDFS_NAMENODE_USER=root \
HDFS_SECONDARYNAMENODE_USER=root \
HDFS_DATANODE_USER=root \
YARN_RESOURCEMANAGER_USER=root \
YARN_NODEMANAGER_USER=root \
# Works around ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057
LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \
# Works around the JVM setting error signals that the UCX library checks (GitLab issue #425).
UCX_ERROR_SIGNALS='' \
CLASSPATH=${CLASSPATH}:\
${HADOOP_HOME}/etc/hadoop/*:\
${HADOOP_HOME}/share/hadoop/common/*:\
${HADOOP_HOME}/share/hadoop/common/lib/*:\
${HADOOP_HOME}/share/hadoop/hdfs/*:\
${HADOOP_HOME}/share/hadoop/hdfs/lib/*:\
${HADOOP_HOME}/share/hadoop/mapreduce/*:\
${HADOOP_HOME}/share/hadoop/yarn/*:\
${HADOOP_HOME}/share/hadoop/yarn/lib/*

# Install Inference and HPS Backend
ARG HUGECTR_DEV_MODE=false
ARG HUGECTR_VER=main
ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
ARG HUGECTR_BACKEND_VER=main
ARG _CI_JOB_TOKEN=""
ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
ARG HUGECTR_HOME=/usr/local/hugectr
RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
ARG TRITON_VERSION

ENV PATH=$PATH:${HUGECTR_HOME}/bin \
CPATH=$CPATH:${HUGECTR_HOME}/include \
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib

RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \
# Install HugeCTR inference which is dependency for hps_backend
git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
cd /hugectr && \
git submodule update --init --recursive && \
mkdir build && \
cd build && \
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON .. \
; else \
cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;75;80;90" -DENABLE_INFERENCE=ON -DENABLE_HDFS=ON .. \
; fi && \
make -j$(nproc) && \
make install && \
rm -rf ./* && \
# Install hps_backend
git clone --branch ${HUGECTR_BACKEND_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_BACKEND_REPO} /repos/hugectr_triton_backend && \
mkdir /repos/hugectr_triton_backend/hps_backend/build && \
cd /repos/hugectr_triton_backend/hps_backend/build && \
cmake \
-DCMAKE_INSTALL_PREFIX:PATH=${HUGECTR_HOME} \
-DTRITON_COMMON_REPO_TAG="r${TRITON_VERSION}" \
-DTRITON_CORE_REPO_TAG="r${TRITON_VERSION}" \
-DTRITON_BACKEND_REPO_TAG="r${TRITON_VERSION}" .. && \
make -j$(nproc) && \
make install && \
chmod +x ${HUGECTR_HOME}/lib/*.so ${HUGECTR_HOME}/backends/hps/*.so && \
cd ../../.. && \
rm -rf hugectr_triton_backend && \
# Remove the incompatible gmock and gtest installed by hps_backend
rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \
rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \
# Install HugeCTR multinode
cd /hugectr/build && \
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/:$LD_LIBRARY_PATH && \
export PATH=$PATH:/usr/local/cuda-$(echo $CUDA_VERSION | awk -F'.' '{print $1"."$2}')/compat && \
if [[ "${INSTALL_HDFS}" == "false" ]]; then \
Expand All @@ -70,13 +133,34 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
; fi && \
make -j$(nproc) && \
make install && \
rm -rf ./* && \
chmod +x ${HUGECTR_HOME}/bin/* ${HUGECTR_HOME}/lib/*.so && \
cd ../onnx_converter && \
# Install HPS trt plugin
cd ../hps_trt && \
mkdir build && \
cd build && \
cmake -DSM="70;75;80;90" .. && \
make -j$(nproc) && \
make install && \
cd ../../onnx_converter && \
python setup.py install && \
mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
pip --no-cache-dir install ninja tf2onnx && \
# Install SOK
cd ../sparse_operation_kit && \
python setup.py install && \
# Install HPS TF plugin
cd ../hps_tf && \
python setup.py install && \
# Install hps_torch
cd ../hps_torch/ && \
TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 9.0" python setup.py install && \
mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
rm -rf /hugectr && mkdir -p /hugectr /hugectr/sparse_operation_kit && \
mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit/sparse_operation_kit && \
chmod +x /hugectr/ci/* /hugectr/sparse_operation_kit/sparse_operation_kit/* \
; fi

RUN ln -s ${HUGECTR_HOME}/backends/hps /opt/tritonserver/backends/hps

ENV PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib

# Clean up
Expand Down
Loading

0 comments on commit c4eb92d

Please sign in to comment.