Event Index #75
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Event Index

on:
  # Allow the workflow to be started manually.
  workflow_dispatch:
  schedule:
    # <minute [0,59]> <hour [0,23]> <day of the month [1,31]> <month of the year [1,12]> <day of the week [0,6]>
    # https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07
    # Run every Thursday at 03:26:00 UTC
    # (Wednesday at 19:26:00 PST, since PST is UTC-8)
    # We offset from the hour and half hour to go easy on the servers :)
    - cron: '26 3 * * 4'
# We doubly fan out:
# We first generate indexes for unigrams, bigrams, and trigrams with a matrix.
# Each index is split into chunks of 50,000 grams.
# Then we fan out by every chunk and upload.
jobs: | |
generate-index-chunks:
  # Runs the index generation once per n-gram size (matrix) and publishes the
  # list of files found in python/index/ as a JSON array job output.
  runs-on: ubuntu-latest
  strategy:
    matrix:
      n-gram: [1, 2, 3]
    fail-fast: false
  outputs:
    # Each matrix leg sets only the output that matches its own n-gram size.
    ngram-1-chunks: ${{ steps.output-index-chunks.outputs.ngram-1-chunks }}
    ngram-2-chunks: ${{ steps.output-index-chunks.outputs.ngram-2-chunks }}
    ngram-3-chunks: ${{ steps.output-index-chunks.outputs.ngram-3-chunks }}
  steps:
    # Setup Runner
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Setup GCloud / Creds
    # Current setup-gcloud releases no longer accept `service_account_key` /
    # `export_default_credentials`; the auth action takes the key and exports
    # the default credentials for later steps instead.
    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v2
      with:
        credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }}
    - name: Setup gcloud
      uses: google-github-actions/setup-gcloud@v2
      with:
        project_id: cdp-colorado-school-boards-zfi
    # Write the key to python/google-creds.json with plain shell. The secret is
    # passed through the environment so it is never spliced into the script.
    # NOTE(review): this replaces a third-party create-json action; confirm
    # nothing relied on extra behaviour of that action.
    - name: Dump Credentials to JSON
      env:
        GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS }}
      run: |
        printf '%s' "$GOOGLE_CREDENTIALS" > python/google-creds.json

    # Installs
    - name: Install Python Dependencies
      run: |
        cd python/
        pip install .

    # Index
    - name: Index Events ${{ matrix.n-gram }}-grams
      run: |
        cd python/
        run_cdp_event_index_generation event-index-config.json \
          --n_grams ${{ matrix.n-gram }} \
          --store_remote \
          --parallel

    # Store generated files to step output
    - name: Store Index Fileset to Outputs
      id: output-index-chunks
      run: |
        cd python/index/
        # Sorted so the emitted JSON array is deterministic between runs.
        output=$(python -c 'import os, json; print(json.dumps(sorted(os.listdir("."))))')
        # `::set-output` is deprecated; append to the GITHUB_OUTPUT file instead.
        echo "ngram-${{ matrix.n-gram }}-chunks=$output" >> "$GITHUB_OUTPUT"
combine-matrix-ngram-chunks:
  # Concatenates the three per-n-gram file lists produced by
  # generate-index-chunks into one JSON array exposed as `all-chunks`.
  needs: generate-index-chunks
  runs-on: ubuntu-latest
  outputs:
    all-chunks: ${{ steps.combine-index-chunks.outputs.combined-chunks }}
  steps:
    # Setup Runner
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Process
    - name: Combine Chunks
      id: combine-index-chunks
      # The upstream outputs are handed over as environment variables and
      # parsed as JSON, rather than being pasted into generated Python source.
      env:
        NGRAM_1_CHUNKS: ${{ needs.generate-index-chunks.outputs.ngram-1-chunks }}
        NGRAM_2_CHUNKS: ${{ needs.generate-index-chunks.outputs.ngram-2-chunks }}
        NGRAM_3_CHUNKS: ${{ needs.generate-index-chunks.outputs.ngram-3-chunks }}
      run: |
        python - <<'PY' >> "$GITHUB_OUTPUT"
        import json
        import os

        chunks = []
        for key in ("NGRAM_1_CHUNKS", "NGRAM_2_CHUNKS", "NGRAM_3_CHUNKS"):
            # An output that was never set arrives as an empty string;
            # treat it as an empty list instead of failing.
            chunks.extend(json.loads(os.environ.get(key) or "[]"))

        # Emit strict JSON (double-quoted strings) because the value is later
        # read with fromJson().
        print(f"combined-chunks={json.dumps(chunks)}")
        PY
upload-index-chunks:
  # Fans out over every file name in `all-chunks` and runs the chunk
  # processing command once per file, at most six at a time.
  needs: combine-matrix-ngram-chunks
  runs-on: ubuntu-latest
  strategy:
    max-parallel: 6
    matrix:
      filename: ${{ fromJson(needs.combine-matrix-ngram-chunks.outputs.all-chunks) }}
    fail-fast: false
  steps:
    # Setup Runner
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Setup GCloud / Creds
    # Current setup-gcloud releases no longer accept `service_account_key` /
    # `export_default_credentials`; the auth action takes the key and exports
    # the default credentials for later steps instead.
    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v2
      with:
        credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }}
    - name: Setup gcloud
      uses: google-github-actions/setup-gcloud@v2
      with:
        project_id: cdp-colorado-school-boards-zfi
    # Write the key to python/google-creds.json with plain shell. The secret is
    # passed through the environment so it is never spliced into the script.
    # NOTE(review): this replaces a third-party create-json action; confirm
    # nothing relied on extra behaviour of that action.
    - name: Dump Credentials to JSON
      env:
        GOOGLE_CREDENTIALS: ${{ secrets.GOOGLE_CREDENTIALS }}
      run: |
        printf '%s' "$GOOGLE_CREDENTIALS" > python/google-creds.json

    # Installs
    - name: Install Python Dependencies
      run: |
        cd python/
        pip install .

    # Upload Index Chunk
    - name: Process Upload
      # The file name goes through the environment and is quoted, so names
      # containing spaces or shell metacharacters reach the command intact.
      env:
        CHUNK_FILENAME: ${{ matrix.filename }}
      run: |
        cd python/
        process_cdp_event_index_chunk event-index-config.json \
          "$CHUNK_FILENAME" \
          --parallel