ci: replace build-artifact-s3 with new workflow, add local tpch benches #3864

Open · wants to merge 3 commits into base: main
93 changes: 93 additions & 0 deletions .github/ci-scripts/local_tpch.py
@@ -0,0 +1,93 @@
"""Run TPC-H benchmarks with native runner on local Parquet data and upload results to Google sheets.

Expects tables as Parquet files in "/tmp/tpch-data/"
"""

import os
import time
from datetime import datetime, timezone

import gspread

import daft
import daft.context
from benchmarking.tpch import answers
from daft.sql import SQLCatalog


def get_df(name):
    return daft.read_parquet(f"/tmp/tpch-data/{name}/*")


def run_benchmark():
    table_names = [
        "part",
        "supplier",
        "partsupp",
        "customer",
        "orders",
        "lineitem",
        "nation",
        "region",
    ]

    # The TPC-H SQL queries reference lowercase column names, so normalize
    # each table's columns before registering it in the catalog.
    def lowercase_column_names(df):
        return df.select(*[daft.col(name).alias(name.lower()) for name in df.column_names])

    catalog = SQLCatalog({tbl: lowercase_column_names(get_df(tbl)) for tbl in table_names})

    results = {}

    for q in range(1, 23):
        if q == 21:
            # TODO: remove this once we support q21
            daft_df = answers.q21(get_df)
        else:
            with open(f"benchmarking/tpch/queries/{q:02}.sql") as query_file:
                query = query_file.read()
            daft_df = daft.sql(query, catalog=catalog)

        # Time full query execution; collect() materializes the result.
        start = time.perf_counter()
        daft_df.collect()
        end = time.perf_counter()

        results[q] = end - start

    return results


def gather_metadata():
    return {
        "started at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S.%f"),
        "github ref": os.getenv("GITHUB_REF"),
        "github sha": os.getenv("GITHUB_SHA"),
    }


def upload_to_google_sheets(data):
    # gspread reads credentials from ~/.config/gspread/service_account.json
    # by default; the workflow below writes that file from a repo secret.
    gc = gspread.service_account()

    sh = gc.open_by_url(
        "https://docs.google.com/spreadsheets/d/1d6pXsIsBkjjM93GYtoiF83WXvJXR4vFgFQdmG05u8eE/edit?gid=0#gid=0"
    )
    ws = sh.worksheet("Local TPC-H")
    ws.append_row(data)


def main():
    daft.context.set_runner_native()

    metadata = gather_metadata()

    results = run_benchmark()

    data_dict = {**metadata, **results}

    print("Results:")
    print(data_dict)

    upload_to_google_sheets(list(data_dict.values()))


if __name__ == "__main__":
    main()
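
For ad-hoc testing outside CI, the on-disk layout the script expects can be produced with Daft itself. A minimal sketch, using a tiny hypothetical fixture rather than the 100SF tables the workflow downloads from S3:

import daft

# Hypothetical fixture: write one tiny table ("region") into the layout
# local_tpch.py reads from, i.e. /tmp/tpch-data/<table>/*.parquet.
df = daft.from_pydict(
    {
        "R_REGIONKEY": [0, 1],
        "R_NAME": ["AFRICA", "AMERICA"],
        "R_COMMENT": ["", ""],
    }
)
df.write_parquet("/tmp/tpch-data/region/")

The uppercase column names here mimic dbgen-style output; the script's lowercase_column_names helper normalizes them before the SQL queries run.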
71 changes: 71 additions & 0 deletions .github/workflows/benchmark-local-tpch.yml
@@ -0,0 +1,71 @@
name: benchmark-local-tpch

on:
  workflow_dispatch:
  workflow_call:
    inputs:
      daft_index_url:
        description: The index URL of the Daft build to benchmark
        type: string
        default: ''

env:
  DAFT_INDEX_URL: ${{ inputs.daft_index_url != '' && inputs.daft_index_url || format('https://d1p3klp2t5517h.cloudfront.net/builds/dev/{0}', github.sha) }}
  AWS_REGION: us-west-2

  DAFT_ANALYTICS_ENABLED: '0'
  UV_SYSTEM_PYTHON: 1
  PYTHON_VERSION: '3.9'

jobs:
  build:
    name: Build and publish wheels if necessary
    if: ${{ inputs.daft_index_url == '' }}
    uses: ./.github/workflows/publish-dev-s3.yml

  benchmark:
    needs: build
    if: ${{ !failure() && !cancelled() }}
    runs-on: [self-hosted, linux, arm64, benchmark-local-tpch]
    permissions:
      id-token: write
      contents: read

    steps:
      - name: Mount local SSD to /tmp
        run: |
          # `run` steps execute with `bash -e`, so test findmnt's exit status
          # inline instead of letting a non-zero exit abort the step.
          if ! findmnt /tmp 1> /dev/null; then
            sudo mkfs.ext4 /dev/nvme0n1
            sudo mount -t ext4 /dev/nvme0n1 /tmp
            sudo chmod 777 /tmp
          fi
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Assume GitHub Actions AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: ${{ env.AWS_REGION }}
          role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
          role-session-name: DaftLocalTpchGitHubWorkflow
      - name: Download TPC-H data (100SF)
        run: aws s3 cp s3://eventual-dev-benchmarking-fixtures/uncompressed/tpch-dbgen/100_0/32/parquet /tmp/tpch-data --recursive

      - name: Install uv
        uses: astral-sh/setup-uv@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install Daft and dev dependencies
        run: |
          rm -rf daft
          uv pip install getdaft --pre --extra-index-url ${{ env.DAFT_INDEX_URL }}
          uv pip install gspread
      - name: Write service account secret file
        run: |
          # Ensure the default gspread config directory exists; quote the
          # heredoc delimiter so bash does not expand anything in the JSON.
          mkdir -p ~/.config/gspread
          cat << 'EOF' > ~/.config/gspread/service_account.json
          ${{ secrets.GOOGLE_SHEETS_SERVICE_ACCOUNT }}
          EOF
      - name: Run benchmark and upload results to Google Sheets
        run: python .github/ci-scripts/local_tpch.py
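
For reference, the DAFT_INDEX_URL expression above uses the GitHub Actions `cond && a || b` ternary idiom. A rough Python equivalent of the resolution logic (hypothetical helper, for illustration only):

def resolve_index_url(daft_index_url: str, sha: str) -> str:
    # Use the explicitly provided index URL when the input is non-empty;
    # otherwise fall back to the per-commit dev build that the `build`
    # job publishes via publish-dev-s3.yml.
    if daft_index_url != "":
        return daft_index_url
    return f"https://d1p3klp2t5517h.cloudfront.net/builds/dev/{sha}"

The `&& ... ||` idiom only works as a ternary here because a non-empty URL is always truthy; the input's default of '' is what selects the fallback.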
102 changes: 0 additions & 102 deletions .github/workflows/build-artifact-s3.yml

This file was deleted.

42 changes: 33 additions & 9 deletions .github/workflows/nightlies-tests.yml
@@ -1,13 +1,8 @@
-name: Verify platforms nightly wheels
+name: Test and benchmark suite on nightly build
 
 on:
-  schedule:
-  - cron: 0 13 * * *
   workflow_dispatch:
-  workflow_run:
-    workflows: [daft-publish]
-    types:
-    - completed
+  workflow_call:
 
 env:
   DAFT_ANALYTICS_ENABLED: '0'
@@ -44,7 +39,7 @@ jobs:

       - name: Install Daft and dev dependencies
         run: |
-          uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://pypi.anaconda.org/daft-nightly/simple --force-reinstall
+          uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://d1p3klp2t5517h.cloudfront.net/builds/nightly --force-reinstall
           rm -rf daft
       - uses: actions/cache@v4
         env:
@@ -114,7 +109,7 @@ jobs:
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install Daft and dev dependencies
run: |
uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://pypi.anaconda.org/daft-nightly/simple --force-reinstall
uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://d1p3klp2t5517h.cloudfront.net/builds/nightly --force-reinstall
rm -rf daft
- name: Prepare tmpdirs for IO services
run: |
@@ -155,3 +150,32 @@ jobs:
         env:
           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
           SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
+
+  benchmark-local-tpch:
+    uses: ./.github/workflows/benchmark-local-tpch.yml
+    with:
+      daft_index_url: https://d1p3klp2t5517h.cloudfront.net/builds/nightly
+
+  on-local-tpch-failure:
+    name: Send Slack notification on failure
+    runs-on: ubuntu-latest
+    needs: benchmark-local-tpch
+    if: ${{ failure() }}
+
+    steps:
+      - uses: slackapi/[email protected]
+        with:
+          payload: |
+            {
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": ":rotating_light: [CI] Local TPC-H benchmarks on nightly wheel <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED* :rotating_light:"
+                  }
+                }
+              ]
+            }
+          webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+          webhook-type: incoming-webhook
5 changes: 5 additions & 0 deletions .github/workflows/nightly-publish-s3.yml
@@ -103,3 +103,8 @@ jobs:
             }
           webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
           webhook-type: incoming-webhook
+
+  tests:
+    name: Run tests on nightly build
+    needs: publish
+    uses: ./.github/workflows/nightlies-tests.yml