Simplify tokenization pipeline, make it work with large numbers of shards again, (re)add configuration metadata to cache #3460

	name: Pre-Commit

	on: [push, pull_request]

	jobs:
	build:
	if: github.event_name == 'push' \|\| (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository)

	runs-on: ubuntu-latest
	strategy:
	matrix:
	python-version: ["3.10"]
	jax-version: ["0.4.14"]

	steps:
	- uses: actions/checkout@v3
	- name: Set up Python ${{ matrix.python-version }}
	uses: actions/setup-python@v4
	with:
	python-version: ${{ matrix.python-version }}
	- name: Install dependencies
	run: \|
	python -m pip install --upgrade pip
	pip install flake8 pytest pre-commit
	pip install . "jax[cpu]==${{ matrix.jax-version }}" "jaxlib==${{ matrix.jax-version }}"
	- name: "Run Pre-commit"
	run: \|
	pre-commit run --all-files --show-diff-on-failure

Provide feedback