Skip to content

Commit

Permalink
Support for running in a Ray cluster (#737)
Browse files Browse the repository at this point in the history
  • Loading branch information
dlwh authored Sep 24, 2024
1 parent 9fa3aaa commit 2b42bfb
Show file tree
Hide file tree
Showing 15 changed files with 1,187 additions and 109 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ ledger.json
/checkpoints
*.jaxpr

# local execution commands
local_*.sh

# aider
.aider*

.benchmarks
74 changes: 74 additions & 0 deletions docker/tpu/Dockerfile.cluster
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Builds the image used when Ray manages TPU slices.
# The base image is parameterized; both ARGs are declared before FROM so they
# can be referenced in the FROM line. Override with, e.g.:
#   docker build --build-arg IMAGE=<repo> --build-arg TAG=<tag> ...
ARG TAG=latest
ARG IMAGE=ghcr.io/stanford-crfm/levanter-base

FROM ${IMAGE}:${TAG}

# Docker-in-docker: install the docker.io package so the docker tooling is
# available inside the container. This step only installs it; nothing here
# starts the daemon.
RUN apt-get update \
    && apt-get install -y docker.io

# Environment baked into the image (visible at build time and at runtime).
# Each pair must be separated by whitespace before the line continuation,
# otherwise the joined line collapses into a single KEY=value token.
#  - TENSORSTORE_CURL_LOW_SPEED_*: presumably tensorstore's curl low-speed
#    thresholds (60 seconds / 1024 bytes) -- confirm against tensorstore docs.
#  - RAY_USAGE_STATS_ENABLED=0: set to 0 for Ray usage stats.
#  - PATH: put the Levanter virtualenv's bin directory ahead of the inherited PATH.
#  - PYTHONPATH: Levanter checkout directories plus the relative entries "src" and ".".
#  - HOME: home directory of the levanter user created later in this file.
ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60 \
    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024 \
    RAY_USAGE_STATS_ENABLED=0 \
    PATH=/opt/levanter/.venv/bin:$PATH \
    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests:src:. \
    HOME=/home/levanter
# Install system dependencies.
# `apt-get update` runs in the same layer as `apt-get install` so the install
# never uses a stale, separately cached package index. The package lists are
# removed in this same layer so they do not add to the image size.
# Packages are listed one per line, sorted alphabetically.
RUN apt-get update \
    && apt-get install -y \
        cmake \
        g++ \
        git \
        gnupg \
        libjemalloc-dev \
        netbase \
        openssh-client \
        rsync \
        screen \
        sudo \
        tmux \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Python packages installed with pip (no pip cache is kept in the layer).
# Notes on individual pins:
#  - numpy>=1.20: necessary for Dataset to work properly.
#  - setuptools==71.1.0: a recent setuptools is required for compatibility
#    with python 3.12+.
RUN pip install --no-cache-dir \
        flatbuffers \
        cython==0.29.37 \
        "numpy>=1.20" \
        psutil \
        setuptools==71.1.0 \
        "google-api-python-client==1.7.8" \
        "google-oauth"


# Install gcloud so we can get secrets (maybe we should just curl?)
# Download, extract, install and clean up happen in ONE layer: deleting the
# tarball in a later RUN would not shrink the image, because the earlier layer
# would still contain it.
# curl flags: -f fails the build on an HTTP error instead of saving the error
# page as the "tarball"; -S still prints errors; -L follows redirects.
RUN mkdir -p /usr/local/gcloud \
    && curl -fSL -o /tmp/google-cloud-sdk.tar.gz \
        https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz \
    && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
    && /usr/local/gcloud/google-cloud-sdk/install.sh \
    && rm -f /tmp/google-cloud-sdk.tar.gz

# Make the gcloud tools available on PATH (appended after the existing entries).
ENV PATH=$PATH:/usr/local/gcloud/google-cloud-sdk/bin

# GCP doesn't like it when root ssh's into a machine, so create an unprivileged
# "levanter" user to run as. All of the account setup happens in a single layer:
#  - create the user with a home directory and bash as the login shell
#  - grant passwordless sudo
#  - add the user to the "docker" group (once; the group is expected to exist
#    already -- presumably created by the docker.io package, confirm)
#  - make sure $HOME/.bashrc exists and append the build-time PATH to it; $PATH
#    is expanded when this RUN executes, so the literal value is written
#  - hand ownership of $HOME and /opt/levanter to the levanter user
RUN useradd -m -s /bin/bash levanter \
    && echo "levanter ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers \
    && usermod -aG docker levanter \
    && mkdir -p "$HOME" \
    && touch "$HOME/.bashrc" \
    && echo "export PATH=$PATH" >> "$HOME/.bashrc" \
    && chown -R levanter "$HOME" /opt/levanter

# Everything from here on runs as the unprivileged levanter user.
USER levanter

# HACK until https://github.com/ray-project/ray/issues/47769 is resolved:
# install the pinned Ray release, then overwrite its TPU command runner with
# the version from the tpu_docker_2.34 branch of the dlwh/ray fork.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir 'ray[default,gcp]==2.34.0'

# The fork is only needed for this one file, so it is cloned to a temporary
# directory and removed in the same layer instead of being left in the image.
# NOTE(review): the destination hard-codes python3.10; it must match the Python
# version of the virtualenv in the base image -- confirm when bumping the base.
RUN git clone https://github.com/dlwh/ray.git /tmp/ray --branch tpu_docker_2.34 --depth 1 \
    && cp /tmp/ray/python/ray/autoscaler/_private/gcp/tpu_command_runner.py \
        /opt/levanter/.venv/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/tpu_command_runner.py \
    && rm -rf /tmp/ray


WORKDIR /opt/levanter
Loading

0 comments on commit 2b42bfb

Please sign in to comment.