Skip to content

Commit

Permalink
update to main
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmeda14960 committed Sep 11, 2024
2 parents ec4ba08 + a91ef81 commit 00ecb15
Show file tree
Hide file tree
Showing 131 changed files with 9,200 additions and 4,993 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

scratch
cache
new-cache
wandb
checkpoints

Expand Down
44 changes: 44 additions & 0 deletions .github/workflows/docker-base-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Builds the TPU Docker images (base, then incremental) and pushes them to the
# GitHub Container Registry on every push to main. Each image is tagged both
# `latest` and with the build date (YYYYMMDD).
name: Build and Push Docker TPU Images

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      # NOTE(review): neither build command below passes --cache-from/--cache-to
      # pointing at this path, so this cache appears to be unused -- confirm
      # whether it should be wired into the builds or removed.
      - name: Cache Docker layers
        uses: actions/cache@v3
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-buildx-${{ github.sha }}
          restore-keys: |
            ${{ runner.os }}-buildx-

      # Exports DATE to the job environment; later steps read it as env.DATE.
      - name: Get current date
        id: date
        run: echo "DATE=$(date +'%Y%m%d')" >> "$GITHUB_ENV"

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.DOCKER_PUSH_TOKEN }}

      - name: Build and Push Docker image
        run: |
          docker buildx build \
            --file docker/tpu/Dockerfile.base \
            --tag ghcr.io/${{ github.repository_owner }}/levanter-base:latest \
            --tag ghcr.io/${{ github.repository_owner }}/levanter-base:${{ env.DATE }} \
            --push .

      - name: Build and Push Incremental Docker image
        run: |
          docker buildx build \
            --file docker/tpu/Dockerfile.incremental \
            --tag ghcr.io/${{ github.repository_owner }}/levanter-tpu:latest \
            --tag ghcr.io/${{ github.repository_owner }}/levanter-tpu:${{ env.DATE }} \
            --push .
67 changes: 67 additions & 0 deletions .github/workflows/publish_dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Builds a `<version>.dev<N>` package (N = number of commits reachable from
# HEAD) and publishes it. Runs after the "Run Tests" workflow completes on
# main, or when dispatched manually.
name: Publish Dev Build

on:
  workflow_run:
    workflows: ["Run Tests"]
    types:
      - completed
    branches: [main]
  workflow_dispatch:

jobs:
  build-package:
    runs-on: ubuntu-latest
    # Manual dispatches always build; workflow_run triggers only build when the
    # triggering run succeeded.
    if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success'}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Full history is required so `git rev-list --count HEAD` below
          # counts every commit rather than just the shallow checkout.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Calculate Version and Build Number
        run: |
          PROJECT_VERSION=$(sed -n 's/__version__ = "\(.*\)"/\1/p' src/levanter/__init__.py)
          BUILD_NUMBER=$(git rev-list --count HEAD)
          FULL_VERSION="${PROJECT_VERSION}.dev${BUILD_NUMBER}"
          echo "FULL_VERSION=${FULL_VERSION}" >> "$GITHUB_ENV"
          echo "Calculated version with build number: $FULL_VERSION"

      - name: Update pyproject.toml version
        run: |
          # Replace the project version in pyproject.toml. The pattern is
          # anchored to the start of the line so that other keys ending in
          # "version" (e.g. target-version, minimum_version) are left alone.
          sed -i "s/^version = \".*\"/version = \"$FULL_VERSION\"/" pyproject.toml

      - name: Build package
        run: |
          python -m pip install --upgrade pip
          pip install build
          python -m build

      - name: Upload package
        uses: actions/upload-artifact@v4
        with:
          name: package
          path: dist/


  # cf https://test.pypi.org/manage/project/levanter/settings/publishing/
  # NOTE(review): the publish step below sets no `repository-url`, so the
  # action uploads to its default index; the link above points at
  # test.pypi.org -- confirm which index is intended.
  publish-dev:
    runs-on: ubuntu-latest
    needs:
      - build-package
    permissions:
      # Required for trusted publishing (OIDC) by the publish action.
      id-token: write
    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: package
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1


6 changes: 3 additions & 3 deletions .github/workflows/run_entry_tests.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name: Run entry tests

on: [push]
on: [push, pull_request]

jobs:
build:

if: github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository)
runs-on: ubuntu-latest
strategy:
matrix:
Expand All @@ -21,7 +21,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
pip install .[test] "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
pip install soundfile librosa
- name: Run entry tests with pytest
run: |
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/run_pre_commit.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
name: Pre-Commit

on: [push]
on: [push, pull_request]

jobs:
build:
if: github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository)

runs-on: ubuntu-latest
strategy:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/run_ray_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
pip install .[test] "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
pip install soundfile librosa
- name: Run ray tests with pytest
run: |
XLA_FLAGS=--xla_force_host_platform_device_count=8 PYTHONPATH=$(pwd)/tests:$(pwd)/src:$(pwd):. pytest tests -m ray
PYTHONPATH=$(pwd)/tests:$(pwd)/src:$(pwd):. pytest tests -m ray
5 changes: 3 additions & 2 deletions .github/workflows/run_tests.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
name: Run tests

on: [push]
on: [push, pull_request]

jobs:
build:
if: github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository)

runs-on: ubuntu-latest
strategy:
Expand All @@ -20,7 +21,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
pip install .[test] "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
pip install -r ./tests/requirements.txt
- name: Test with pytest
run: |
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/tpu_unit_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on: [pull_request]

jobs:
test:
if: (github.event.pull_request.head.repo.full_name == github.repository)
runs-on: ubuntu-latest
env:
TPU_ZONE: "us-central2-b"
Expand Down Expand Up @@ -38,7 +39,7 @@ jobs:
- name: Run most tests
run: |
export TPU_NAME=ci-run-${{ github.run_id }}
gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests CI=1 bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
# Something's wrong with these
#
# - name: Run forked tests
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ repos:
hooks:
- id: mypy
args: [--ignore-missing-imports]
additional_dependencies: [wandb, types-PyYAML]
additional_dependencies: [wandb==0.17.8, types-PyYAML]
2 changes: 1 addition & 1 deletion config/data/dolma_olmo_paloma.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ train_weights:
dolma-algebraic-stack: 12.6 # 12.6 * 1.0
dolma-arxiv: 28.0 # 28.0 * 1.0
dolma-gutenberg: 5.3 # 5.3 * 1.0
dolma-c4: 69.2 # 138.4 * 0.5
dolma-c4: 124.95 # 249.9 * 0.5
dolma-cc: 597.75 # 1,195.5 * 0.5
dolma-cc-news: 14.3 # 1.0
dolma-falcon: 456.4 # 1.0, refined web
Expand Down
124 changes: 124 additions & 0 deletions config/data/fineweb_llama_txt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Data mixture: FineWeb (CC-MAIN-2020-10 shards) for training, with the Paloma
# validation sets attached for evaluation only (train weight 0.0).
cache_dir: gs://marin-data/tokenized/fineweb/llama2_tokenizer/txt
tokenizer: meta-llama/Llama-2-7b-hf
stop_strategy: restart
configs:
  fineweb:
    train_urls:
      # Wildcard form, currently disabled in favour of the explicit shard list:
      # - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-*/*/*_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00000/{0..257}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00001/{0..258}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00002/{0..260}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00003/{0..261}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00004/{0..262}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00005/{0..262}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00006/{0..263}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00007/{0..263}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00008/{0..263}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00009/{0..263}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00010/{0..263}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00011/{0..265}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00012/{0..265}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00013/{0..266}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00014/{0..265}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00015/{0..265}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00016/{0..266}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00017/{0..266}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00018/{0..267}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00019/{0..266}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00020/{0..267}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00021/{0..267}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00022/{0..269}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00023/{0..267}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00024/{0..268}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00025/{0..268}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00026/{0..269}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00027/{0..269}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00028/{0..269}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00029/{0..269}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00030/{0..270}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00031/{0..270}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00032/{0..270}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00033/{0..271}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00034/{0..271}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00035/{0..271}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00036/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00037/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00038/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00039/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00040/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00041/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00042/{0..273}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00043/{0..272}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00044/{0..273}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00045/{0..274}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00046/{0..274}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00047/{0..273}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00048/{0..274}_processed.jsonl.gz
      - gs://marin-data/processed/fineweb/fw-v1.0/text_fw/CC-MAIN-2020-10/000_00049/{0..275}_processed.jsonl.gz
  # Evaluation-only sources: each defines validation_urls and no train_urls.
  paloma/4chan:
    validation_urls:
      - gs://levanter-data/paloma/4chan_meta_sep/val/val*.jsonl.gz
  paloma/c4_100_domains:
    validation_urls:
      - gs://levanter-data/paloma/c4_100_domains/val/val*.jsonl.gz
  paloma/c4_en:
    validation_urls:
      - gs://levanter-data/paloma/c4_en/val/val*.jsonl.gz
  paloma/dolma-v1_5:
    validation_urls:
      - gs://levanter-data/paloma/dolma-v1_5/val/val*.jsonl.gz
  paloma/dolma_100_programing_languages:
    validation_urls:
      - gs://levanter-data/paloma/dolma_100_programing_languages/val/val*.jsonl.gz
  paloma/dolma_100_subreddits:
    validation_urls:
      - gs://levanter-data/paloma/dolma_100_subreddits/val/val*.jsonl.gz
  paloma/falcon-refinedweb:
    validation_urls:
      - gs://levanter-data/paloma/falcon-refinedweb/val/val*.jsonl.gz
  paloma/gab:
    validation_urls:
      - gs://levanter-data/paloma/gab/val/val*.jsonl.gz
  paloma/m2d2_s2orc_unsplit:
    validation_urls:
      - gs://levanter-data/paloma/m2d2_s2orc_unsplit/val/val*.jsonl.gz
  paloma/m2d2_wikipedia_unsplit:
    validation_urls:
      - gs://levanter-data/paloma/m2d2_wikipedia_unsplit/val/val*.jsonl.gz
  paloma/manosphere_meta_sep:
    validation_urls:
      - gs://levanter-data/paloma/manosphere_meta_sep/val/val*.jsonl.gz
  paloma/mc4:
    validation_urls:
      - gs://levanter-data/paloma/mc4/val/val*.jsonl.gz
  paloma/ptb:
    validation_urls:
      - gs://levanter-data/paloma/ptb/val/val*.jsonl.gz
  paloma/redpajama:
    validation_urls:
      - gs://levanter-data/paloma/redpajama/val/val*.jsonl.gz
  paloma/twitterAAE_HELM_fixed:
    validation_urls:
      - gs://levanter-data/paloma/twitterAAE_HELM_fixed/val/val*.jsonl.gz
  paloma/wikitext_103:
    validation_urls:
      - gs://levanter-data/paloma/wikitext_103/val/val*.jsonl.gz
# Only fineweb carries a non-zero weight; every Paloma source is set to 0.0.
train_weights:
  fineweb: 1.0
  paloma/4chan: 0.0
  paloma/c4_100_domains: 0.0
  paloma/c4_en: 0.0
  paloma/dolma-v1_5: 0.0
  paloma/dolma_100_programing_languages: 0.0
  paloma/dolma_100_subreddits: 0.0
  paloma/falcon-refinedweb: 0.0
  paloma/gab: 0.0
  paloma/m2d2_s2orc_unsplit: 0.0
  paloma/m2d2_wikipedia_unsplit: 0.0
  paloma/manosphere_meta_sep: 0.0
  paloma/mc4: 0.0
  paloma/ptb: 0.0
  paloma/redpajama: 0.0
  paloma/twitterAAE_HELM_fixed: 0.0
  paloma/wikitext_103: 0.0
1 change: 0 additions & 1 deletion config/data/redpajama_1b_source.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ cache_dir: gs://levanter-data/tokenized/redpajama-sample/
tokenizer: EleutherAI/gpt-neox-20b
splits:
- train
rows_per_chunk: 32768
1 change: 0 additions & 1 deletion config/data/redpajama_1t_source.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ cache_dir: gs://levanter-data/tokenized/redpajama/
tokenizer: EleutherAI/gpt-neox-20b
splits:
- train
rows_per_chunk: 4096
1 change: 0 additions & 1 deletion config/data/rpv1_llama.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
cache_dir: gs://levanter-data/tokenized/redpajama_v1_llama_mixture
rows_per_chunk: 4096
tokenizer: "meta-llama/Llama-2-7b-hf"
configs:
arxiv:
Expand Down
2 changes: 1 addition & 1 deletion config/gpt2_nano_mixture.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ data:
id: dlwh/wikitext_103_detokenized
train_weights:
wikitext: 1.0
w2: 0
w2: 1.0
model:
type: gpt2
hidden_dim: 32
Expand Down
Loading

0 comments on commit 00ecb15

Please sign in to comment.