-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support for running in a Ray cluster (#737)
- Loading branch information
Showing
15 changed files
with
1,187 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -150,6 +150,9 @@ ledger.json | |
/checkpoints | ||
*.jaxpr | ||
|
||
# local execution commands | ||
local_*.sh | ||
|
||
# aider | ||
.aider* | ||
|
||
.benchmarks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# This dockerfile is used to build the docker image for using Ray to manage TPU slices. | ||
ARG IMAGE=ghcr.io/stanford-crfm/levanter-base | ||
ARG TAG=latest | ||
|
||
FROM ${IMAGE}:${TAG} | ||
|
||
# install docker in docker, but don't start it | ||
RUN apt-get update && apt-get install -y docker.io | ||
|
||
ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ | ||
TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ | ||
RAY_USAGE_STATS_ENABLED=0\ | ||
PATH=/opt/levanter/.venv/bin:$PATH\ | ||
PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests:src:.\ | ||
HOME=/home/levanter | ||
# Install dependencies | ||
|
||
RUN apt-get install -y \ | ||
sudo \ | ||
git \ | ||
libjemalloc-dev \ | ||
wget \ | ||
cmake \ | ||
g++ \ | ||
zlib1g-dev \ | ||
tmux \ | ||
screen \ | ||
rsync \ | ||
netbase \ | ||
openssh-client \ | ||
gnupg | ||
|
||
RUN pip install --no-cache-dir \ | ||
flatbuffers \ | ||
cython==0.29.37 \ | ||
# Necessary for Dataset to work properly. | ||
numpy\>=1.20 \ | ||
psutil \ | ||
# Required a recent version of setuptools to be compatible with python 3.12+. | ||
setuptools==71.1.0 \ | ||
"google-api-python-client==1.7.8" \ | ||
"google-oauth" | ||
|
||
|
||
# Install gcloud so we can get secrets (maybe we should just curl?) | ||
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz | ||
|
||
RUN mkdir -p /usr/local/gcloud \ | ||
&& tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \ | ||
&& /usr/local/gcloud/google-cloud-sdk/install.sh \ | ||
&& rm -f /tmp/google-cloud-sdk.tar.gz | ||
|
||
# Adding the package path to local | ||
ENV PATH=$PATH:/usr/local/gcloud/google-cloud-sdk/bin | ||
|
||
# GCP doesn't like it when root ssh's into a machine | ||
RUN useradd -m -s /bin/bash levanter | ||
RUN echo "levanter ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers | ||
RUN usermod -aG docker levanter | ||
RUN mkdir -p $HOME && touch $HOME/.bashrc && chown -R levanter $HOME | ||
RUN echo "export PATH=$PATH" >> $HOME/.bashrc | ||
RUN adduser levanter docker | ||
|
||
RUN chown -R levanter /opt/levanter | ||
|
||
USER levanter | ||
|
||
# HACK until https://github.com/ray-project/ray/issues/47769 is resolved | ||
RUN pip install 'ray[default,gcp]==2.34.0' | ||
RUN git clone https://github.com/dlwh/ray.git ~/ray --branch tpu_docker_2.34 --depth 1 | ||
RUN cp ~/ray/python/ray/autoscaler/_private/gcp/tpu_command_runner.py /opt/levanter/.venv/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/tpu_command_runner.py | ||
|
||
|
||
WORKDIR /opt/levanter |
Oops, something went wrong.