Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Scripts and docker files for Huawei experiments. #8

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions scripts/benchmarks/resnet-50.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ OPTS="-server -XX:+UseConcMarkSweepGC -XX:NewRatio=2 -XX:SurvivorRatio=16 -Xms48
CLASS="uk.ac.imperial.lsds.crossbow.ResNetv1"
CLASSFILE="${TESTS}/`echo ${CLASS} | tr '.' '/'`.class"

resultdir="results/"
#resultdir="results/"
resultdir="/cache/train_dir"
[ ! -d ${resultdir} ] && mkdir -p ${resultdir}

crossbowDirExists ${resultdir}

datadir="$CROSSBOW_HOME/data/imagenet/"
# datadir="$CROSSBOW_HOME/data/imagenet/"
datadir="/cache/data_dir/"
crossbowDirExists ${datadir}

layers=50
Expand Down Expand Up @@ -142,8 +144,7 @@ NCCL_DEBUG=WARN java $OPTS -cp $JCP $CLASS \
--layers ${layers} \
--reuse-memory true \
--direct-scheduling false \
--data-directory ${datadir} \
&> ${resultdir}/${resultfile}
--data-directory ${datadir}

echo "Done"

Expand Down
15 changes: 11 additions & 4 deletions scripts/datasets/imagenet/parse-records.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import math
import numpy as np
import argparse

import tensorflow as tf

Expand Down Expand Up @@ -59,14 +60,20 @@ def _parse(record):
return image, height, width, label, xmin, ymin, xmax, ymax, features['image/class/text']

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-dir', type=str, default='/data/tensorflow/imagenet/train')
parser.add_argument('--output-dir', type=str, default='/data/crossbow/imagenet/train')
args = parser.parse_args()

with tf.Session() as session:
subset = "train"
maxrecordsperfile = 2048
N = 0 # Expect 1251 records in 1 file (for training)
# Number of records per file...
mx = 0
directory = "/data/tensorflow/imagenet/train"
pattern = os.path.join(directory, '%s-*-of-*' % subset)
input_dir = args.input_dir
output_dir = args.output_dir
pattern = os.path.join(input_dir, '%s-*-of-*' % subset)
files = gfile.Glob(pattern)
if not files:
raise ValueError()
Expand Down Expand Up @@ -101,7 +108,7 @@ def _parse(record):
img_checksums = []

filecounter = 1
filename = "crossbow-%s.records.%d" % (subset, filecounter)
filename = "%s/crossbow-%s.records.%d" % (output_dir, subset, filecounter)
f = open(filename, "wb")
# Write number of records as a file header
recordsinfile = 0
Expand Down Expand Up @@ -178,7 +185,7 @@ def _parse(record):
remaining = N - totalrecordswritten
if remaining > 0:
filecounter += 1
filename = "crossbow-%s.records.%d" % (subset, filecounter)
filename = "%s/crossbow-%s.records.%d" % (output_dir, subset, filecounter)
f = open(filename, "wb")
# Write file header
recordsinfile = 0
Expand Down
15 changes: 15 additions & 0 deletions scripts/huawei/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env python

from __future__ import print_function

import moxing as mox
import time
import os

if __name__ == '__main__':
data_dir = '/cache/data_dir'
start = time.time()
data_url = os.environ['DLS_DATA_URL']
print('INFO: Start copying data from the blob storage ' + data_url + ' into SSD under ' + data_dir)
mox.file.copy_parallel(data_url, data_dir)
print('INFO: Copying completes! The copy task takes: ' + str(time.time() - start) + ' seconds')
11 changes: 11 additions & 0 deletions scripts/huawei/runner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#
# ModelArts job entry point: stage the input dataset onto the node-local
# SSD, run the ImageNet preparation script over it, then upload the result
# back to blob storage.

# Fail fast: without this, a failed download would silently be followed by
# preparation and upload of missing/incomplete data.
set -e

CROSSBOW_HOME=/home/work/user-job-dir/Crossbow

# /cache is the node-local SSD; -p makes this idempotent and safe under set -e.
mkdir -p /cache/train_dir

# Copy the dataset from blob storage (DLS_DATA_URL) into /cache/data_dir.
python $CROSSBOW_HOME/scripts/huawei/download_data.py

# Prepare the ImageNet data: reads /cache/data_dir, writes /cache/train_dir
# (script not shown here -- see scripts/datasets/imagenet).
bash $CROSSBOW_HOME/scripts/datasets/imagenet/prepare-imagenet.sh /cache/data_dir /cache/train_dir

# Copy the prepared output back to blob storage (DLS_TRAIN_URL).
python $CROSSBOW_HOME/scripts/huawei/upload_data.py
12 changes: 12 additions & 0 deletions scripts/huawei/train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#
# ModelArts training entry point: stage the dataset, install the ImageNet
# metadata files expected by the benchmark script, run ResNet-50 training,
# and upload the results back to blob storage.

# Fail fast: without this, a failed download or mv would silently be
# followed by training on missing data and uploading an empty result.
set -e

# Crossbow is baked into the Docker image at /crossbow (see Dockerfile.huawei).
CROSSBOW_HOME=/crossbow

# Copy the dataset from blob storage (DLS_DATA_URL) into /cache/data_dir.
python $CROSSBOW_HOME/scripts/huawei/download_data.py

# Install the dataset metadata where the benchmark script looks for it.
# NOTE(review): assumes $CROSSBOW_HOME/data/imagenet already exists in the
# image -- confirm, otherwise these mv commands abort the job.
mv /home/work/user-job-dir/Crossbow-scripts/imagenet-test.metadata $CROSSBOW_HOME/data/imagenet/imagenet-test.metadata
mv /home/work/user-job-dir/Crossbow-scripts/imagenet-train.metadata $CROSSBOW_HOME/data/imagenet/imagenet-train.metadata

# Run the ResNet-50 benchmark (writes its results under /cache/train_dir).
bash /home/work/user-job-dir/Crossbow-scripts/resnet-50.sh

# Copy the training output back to blob storage (DLS_TRAIN_URL).
python $CROSSBOW_HOME/scripts/huawei/upload_data.py
12 changes: 12 additions & 0 deletions scripts/huawei/upload_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python

from __future__ import print_function

import moxing as mox
import os

if __name__ == '__main__':
train_dir = '/cache/train_dir'
train_url = os.environ['DLS_TRAIN_URL']
print('INFO: Copy trained model to ' + train_url)
mox.file.copy_parallel(train_dir, train_url)
90 changes: 90 additions & 0 deletions tools/dockerfiles/dockerfiles/Dockerfile.huawei
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Custom GPU image for running Crossbow experiments on Huawei ModelArts.
# ModelArts example: https://github.com/huawei-clouds/modelarts-example/blob/master/CustomImage
FROM swr.cn-north-1.myhuaweicloud.com/eiwizard/custom-gpu-cuda9-inner-moxing-cp36:1.1 as base

# The pip source has been pre-configured to an internal source. Roll back to public sources.
RUN rm $HOME/.pip/pip.conf

# Fix the source lists: the base image points apt at an internal Huawei mirror.
RUN sed -i 's/cmc-cd-mirror.rnd.huawei.com/security.ubuntu.com/g' /etc/apt/sources.list

# Replace the standard ubuntu source with Aliyun sources if building in mainland China
RUN sed -i s/archive.ubuntu.com/mirrors.aliyun.com/g /etc/apt/sources.list \
    && sed -i s/security.ubuntu.com/mirrors.aliyun.com/g /etc/apt/sources.list

# Add the NVIDIA package repo and fetch key
# Reference: https://gitlab.com/nvidia/cuda/blob/ubuntu16.04/9.0/base/Dockerfile#L4
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
    rm -rf /var/lib/apt/lists/* && \
    NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
    NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
    apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
    echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

# Use apt-get (not apt): the apt front-end has no stable CLI and warns in
# non-interactive builds.
# NOTE(review): the package name 'cuda9.0' looks unusual -- the CUDA 9.0
# meta-package is normally named 'cuda-9-0'; confirm it resolves in this repo.
RUN apt-get update && apt-get install -y --no-install-recommends \
    apt-utils \
    build-essential \
    cuda9.0 \
    cuda-cublas-9-0 \
    cuda-cufft-9-0 \
    cuda-curand-9-0 \
    cuda-cusolver-9-0 \
    cuda-cusparse-9-0 \
    libcudnn7=7.2.1.38-1+cuda9.0 \
    libcudnn7-dev=7.2.1.38-1+cuda9.0 \
    libnccl2=2.2.13-1+cuda9.0 \
    libnccl-dev=2.2.13-1+cuda9.0 \
    cuda-command-line-tools-9-0 \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng12-dev \
    libzmq3-dev \
    pkg-config \
    software-properties-common \
    unzip \
    git \
    wget \
    openjdk-8-jdk \
    maven \
    libboost-all-dev \
    graphviz \
    cmake \
    nasm \
    && rm -rf /var/lib/apt/lists/*

ENV CUDA_HOME /usr/local/cuda

# OpenBLAS (TODO: install using apt install)
# 'make install' uses OpenBLAS's default prefix, which matches BLAS_HOME below.
RUN git clone --progress https://github.com/xianyi/OpenBLAS.git openblas \
    && cd openblas \
    && make -j $(nproc) \
    && make install
ENV BLAS_HOME /opt/OpenBLAS
ENV LD_LIBRARY_PATH $BLAS_HOME/lib:$LD_LIBRARY_PATH

# libjpeg-turbo (TODO: install using apt install)
# In-source cmake build: the shared libraries end up in the source root,
# hence the second LD_LIBRARY_PATH entry; the $JPEG_HOME/lib entry is kept
# for safety but is presumably unused -- confirm.
RUN git clone --progress https://github.com/libjpeg-turbo/libjpeg-turbo.git \
    && cd libjpeg-turbo \
    && cmake -G"Unix Makefiles" && make -j $(nproc)
ENV JPEG_HOME /libjpeg-turbo
ENV LD_LIBRARY_PATH $JPEG_HOME/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH $JPEG_HOME:$LD_LIBRARY_PATH

# Crossbow: build the Java package, the multi-GPU native library, and the
# project's own build script, in that order.
RUN git clone https://github.com/lsds/Crossbow.git crossbow
ENV CROSSBOW_HOME /crossbow
RUN cd crossbow \
    && mvn package \
    && cd clib-multigpu \
    && ./genmakefile.sh \
    && make -j $(nproc) \
    && cd ../ \
    && ./scripts/build.sh

# Install tensorflow-gpu 1.12.0 in the conda environment (pip has been redirected to conda pip)
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tensorflow-gpu==1.12.0 # Run this if in the mainland China
# RUN pip install tensorflow-gpu==1.12.0

WORKDIR /