# Workflow file for the run "Adds dclm fasttext classifier #1002".

# Continuous-integration workflow: triggers, token permissions and the
# environment variables shared by every job.
name: CI

# Runs on pushes to main/master and on any tag, on pull requests that target
# main/master, and on manual dispatch.
on:
  push:
    branches: [main, master]
    tags: ['*']
  pull_request:
    branches: [main, master]
  workflow_dispatch:

# The workflow token only gets read access to repository contents.
permissions:
  contents: read

env:
  # 'true' when the AWS_ACCESS_KEY_ID secret is empty, 'false' otherwise.
  DOLMA_TESTS_SKIP_AWS: ${{ secrets.AWS_ACCESS_KEY_ID == '' && 'true' || 'false' }}
  DOLMA_TEST_S3_PREFIX: s3://dolma-tests
  # Rust toolchain channel handed to rustup when the virtualenv is built.
  RUST_CHANNEL: stable
jobs:
  # Diagnostic job: prints the ref, event name, commit and pull-request
  # coordinates of the current run.
  info:
    name: Run info
    runs-on: ubuntu-latest
    steps:
      - name: Echo environment variables
        # The ref and the pull-request fields can carry text chosen by the
        # person opening the PR (for example a fork's branch name). They are
        # passed to the shell as environment variables instead of being
        # expanded into the script source, so they are printed as data and
        # cannot be interpreted as shell commands.
        env:
          GIT_REF: ${{ github.ref }}
          PR_BASE_REPO: ${{ github.event.pull_request.base.repo.full_name }}
          PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
          PR_HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
          PR_HEAD_REF: ${{ github.event.pull_request.head.ref }}
        run: |
          echo "reference: ${GIT_REF}"
          echo "event name: ${{ github.event_name }}"
          echo "run tests: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}"
          echo "is main: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' }}"
          echo "is release: ${{ startsWith(github.ref, 'refs/tags/') }}"
          echo "skip AWS: ${{ env.DOLMA_TESTS_SKIP_AWS }}"
          echo "commit: ${{ github.sha }}"
          echo "PR base repo: ${PR_BASE_REPO}/tree/${PR_BASE_REF}"
          echo "PR head repo: ${PR_HEAD_REPO}/tree/${PR_HEAD_REF}"
# Job "should_build": decides whether the wheel and sdist build jobs run.
  should_build:
    name: "Check if build"
    runs-on: ubuntu-latest
    # Exposed to the build jobs as needs.should_build.outputs.should_build;
    # the value is the string 'true' or 'false'.
    outputs:
      should_build: ${{ steps.check_version.outputs.should_build }}
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          # Fetch the full history rather than a shallow clone; the diff
          # below compares against the pull request's base commit.
          fetch-depth: 0
          ref: ${{ github.ref }}
      - name: List branches and tags
        run: |
          git branch -a
          git tag -l
          git log | head -n 1000
      - id: check_version
        shell: bash
        env:
          # Empty on events that are not pull requests.
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          IS_MAIN_OR_RELEASE: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/') }}
        run: |
          # Build when a pyproject.toml or Cargo.toml differs from the PR
          # base commit, or when running on main/master or on a tag.
          has_updated=''
          if [[ -n "${BASE_SHA}" ]]; then
            # `|| true`: grep exits non-zero when nothing matches, and a
            # failed diff must count as "nothing changed", not abort the step.
            has_updated=$(git diff --name-only "${BASE_SHA}" | grep -E '(^|/)(pyproject|Cargo)\.toml$' || true)
          fi
          if [[ -n "${has_updated}" ]] || [[ "${IS_MAIN_OR_RELEASE}" == 'true' ]]; then
            echo "should_build=true" >> "$GITHUB_OUTPUT"
          else
            echo "should_build=false" >> "$GITHUB_OUTPUT"
          fi
# Jobs "prepare-venv" and "tests": build a cached virtualenv, then run each
# check inside it. Both jobs restore the cache with the same key.
  prepare-venv:
    name: "Prepare Virtual Env"
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Cache Virtual Env
        uses: actions/cache@v3
        # Step id referenced by the `cache-hit` conditions below.
        id: cache-venv
        with:
          # What we cache: the virtualenv.
          path: ./.venv/
          # The key depends on pyproject.toml, Cargo.toml and Cargo.lock, plus
          # the Python and Rust sources. Each glob is its own hashFiles()
          # argument: a single 'a, b' string is one pattern that matches no
          # file. Keep this key identical to the one in the `tests` job.
          key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml', '**/Cargo.toml', '**/Cargo.lock') }}--${{ hashFiles('python/**', 'src/**') }}
      # Every step below is skipped when the virtualenv came from the cache.
      - name: Setup system libraries
        if: steps.cache-venv.outputs.cache-hit != 'true'
        run: |
          sudo apt-get update
          sudo apt-get install --yes --upgrade build-essential cmake protobuf-compiler libssl-dev glibc-source musl-tools
      - name: Install Rust toolchain
        if: steps.cache-venv.outputs.cache-hit != 'true'
        run: |
          rustup update ${{ env.RUST_CHANNEL }}
          rustup component add --toolchain ${{ env.RUST_CHANNEL }} rustfmt rust-src
          rustup default ${{ env.RUST_CHANNEL }}
      - name: Install Python
        if: steps.cache-venv.outputs.cache-hit != 'true'
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"
          architecture: "x64"
      - name: Create a new Python environment & install maturin
        if: steps.cache-venv.outputs.cache-hit != 'true'
        run: |
          python -m venv .venv
          source .venv/bin/activate
          pip install -U pip
          pip install maturin
      - name: Install dolma wheels
        if: steps.cache-venv.outputs.cache-hit != 'true'
        run: |
          source .venv/bin/activate
          maturin build --release -i $(which python) --out dist
          wheel_path=$(ls dist/*.whl)
          pip install "${wheel_path}[all]"

  tests:
    name: "${{ matrix.task.name }}"
    runs-on: ubuntu-latest
    needs: prepare-venv
    # Only pull_request and push events run the checks.
    if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    strategy:
      fail-fast: true
      matrix:
        # One matrix entry per check; `run` is the script that the final step
        # executes inside the virtualenv.
        task:
          - name: Check Python style
            run: |
              set -e
              isort --check --verbose .
              black --check --verbose .
          - name: Check Rust style
            run: |
              rustfmt --edition 2021 src/*.rs --check
          - name: Lint Python
            run: |
              flake8 tests/python/ && flake8 python/
          - name: Types Python
            run: |
              mypy tests/python/ && mypy python/
          - name: Run Python tests
            run: |
              pytest -vs --color=yes tests/python/
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Cache Virtual Env
        uses: actions/cache@v3
        id: cache-venv
        with:
          # What we restore: the virtualenv built by `prepare-venv`.
          path: ./.venv/
          # Keep this key identical to the one in the `prepare-venv` job.
          key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml', '**/Cargo.toml', '**/Cargo.lock') }}--${{ hashFiles('python/**', 'src/**') }}
      - name: ${{ matrix.task.name }}
        run: |
          source .venv/bin/activate
          ${{ matrix.task.run }}
# Jobs "build-linux", "build-windows", "build-macos", "sdist" and "release":
# build the distributions and, on tags, upload them with maturin.
#
# The artifact actions are on v4 because v3 has been retired on github.com.
# v4 rejects a second upload under an existing artifact name, so every
# producer uploads under its own "wheels-*" name and `release` downloads them
# all into one directory.
  build-linux:
    name: "Build Linux (${{ matrix.target }})"
    needs: should_build
    if: ${{ needs.should_build.outputs.should_build == 'true' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # fails to build on x86, so removing for now; low use anyway.
        # target: [x86_64, x86, aarch64, armv7]
        target: [x86_64, aarch64, armv7]
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install 32bit version of libc
        # Never true for the current matrix; kept for when x86 is re-enabled.
        if: ${{ matrix.target == 'x86' || contains(matrix.target, 'i686') }}
        run: |
          sudo apt-get update
          sudo apt-get install --yes --upgrade libc6-dev-i386
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --find-interpreter
          manylinux: manylinux_2_28
          container: "ghcr.io/rust-cross/manylinux_2_28-cross:${{ matrix.target }}"
          before-script-linux: |
            sudo apt-get update
            sudo apt-get install --yes --upgrade build-essential cmake protobuf-compiler libssl-dev glibc-source musl-tools
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-${{ matrix.target }}
          path: dist

  build-windows:
    name: "Build Windows (${{ matrix.target }})"
    needs: should_build
    if: ${{ needs.should_build.outputs.should_build == 'true' }}
    runs-on: windows-latest
    strategy:
      matrix:
        target: [x64, x86]
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          # The interpreter architecture follows the wheel target.
          architecture: ${{ matrix.target }}
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --find-interpreter
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-windows-${{ matrix.target }}
          path: dist

  build-macos:
    name: "Build macOS (${{ matrix.target }})"
    needs: should_build
    if: ${{ needs.should_build.outputs.should_build == 'true' }}
    runs-on: macos-latest
    strategy:
      matrix:
        target: [x86_64, aarch64]
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.target }}
          args: --release --out dist --find-interpreter
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-macos-${{ matrix.target }}
          path: dist

  sdist:
    runs-on: ubuntu-latest
    needs: should_build
    if: ${{ needs.should_build.outputs.should_build == 'true' }}
    steps:
      - uses: actions/checkout@v3
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist
      - name: Upload sdist
        uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: dist

  release:
    name: Release
    runs-on: ubuntu-latest
    # Only tag refs publish.
    if: "startsWith(github.ref, 'refs/tags/')"
    needs: [build-linux, build-windows, build-macos, sdist]
    steps:
      - uses: actions/download-artifact@v4
        with:
          # Fetch every per-platform artifact and flatten them into the
          # working directory, where the `*` glob below picks them up.
          pattern: wheels-*
          merge-multiple: true
      - name: Publish to PyPI
        uses: PyO3/maturin-action@v1
        env:
          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
        with:
          command: upload
          args: --skip-existing *