Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workload #53

Merged
merged 2 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions benchmark/tpch/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import click

from benchmark.constants import DEFAULT_SCALE_FACTOR
from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run
from util.workspace import (
Expand All @@ -15,8 +16,6 @@
link_result,
)

NUM_TPCH_QUERIES = 22


@click.group(name="tpch")
@click.pass_obj
Expand All @@ -38,13 +37,13 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float) -> None:
@click.option(
"--seed-start",
type=int,
default=15721,
default=DEFAULT_TPCH_SEED,
help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).",
)
@click.option(
"--seed-end",
type=int,
default=15721,
default=DEFAULT_TPCH_SEED,
help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).",
)
@click.option(
Expand Down
2 changes: 2 additions & 0 deletions benchmark/tpch/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
DEFAULT_TPCH_SEED = 15721
NUM_TPCH_QUERIES = 22
24 changes: 10 additions & 14 deletions env/integtest_pg_conn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

import psycopg

from env.integtest_util import IntegtestWorkspace
from env.integtest_util import (
INTEGTEST_BENCHMARK,
INTEGTEST_SCALE_FACTOR,
IntegtestWorkspace,
)
from env.pg_conn import PostgresConn
from util.pg import (
DEFAULT_POSTGRES_PORT,
Expand All @@ -12,19 +16,13 @@
)
from util.workspace import (
DEFAULT_BOOT_CONFIG_FPATH,
DBGymConfig,
default_dbdata_parent_dpath,
default_pgbin_path,
default_pristine_dbdata_snapshot_path,
)

BENCHMARK = "tpch"
SCALE_FACTOR = 0.01


class PostgresConnTests(unittest.TestCase):
dbgym_cfg: DBGymConfig

@staticmethod
def setUpClass() -> None:
IntegtestWorkspace.set_up_workspace()
Expand All @@ -36,16 +34,14 @@ def setUp(self) -> None:
+ "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.",
)
self.pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path(
IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path,
BENCHMARK,
SCALE_FACTOR,
IntegtestWorkspace.get_workspace_path(),
INTEGTEST_BENCHMARK,
INTEGTEST_SCALE_FACTOR,
)
self.dbdata_parent_dpath = default_dbdata_parent_dpath(
IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path
)
self.pgbin_dpath = default_pgbin_path(
IntegtestWorkspace.get_dbgym_cfg().dbgym_workspace_path
IntegtestWorkspace.get_workspace_path()
)
self.pgbin_dpath = default_pgbin_path(IntegtestWorkspace.get_workspace_path())

# The reason we restart Postgres every time is to ensure a "clean" starting point
# so that all tests are independent of each other.
Expand Down
6 changes: 5 additions & 1 deletion env/integtest_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@

from util.workspace import DBGymConfig

# These are the values used by set_up_env_integtests.sh.
# TODO: make set_up_env_integtests.sh take in these values directly as envvars.
INTEGTEST_BENCHMARK = "tpch"
INTEGTEST_SCALE_FACTOR = 0.01


class IntegtestWorkspace:
"""
Expand Down Expand Up @@ -40,4 +45,3 @@ def get_dbgym_cfg() -> DBGymConfig:
def get_workspace_path() -> Path:
with open(IntegtestWorkspace.ENV_INTEGTESTS_DBGYM_CONFIG_FPATH) as f:
return Path(yaml.safe_load(f)["dbgym_workspace_path"])
assert False
50 changes: 50 additions & 0 deletions env/integtest_workload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import unittest

from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES
from env.integtest_util import (
INTEGTEST_BENCHMARK,
INTEGTEST_SCALE_FACTOR,
IntegtestWorkspace,
)
from env.workload import Workload
from util.workspace import (
default_workload_path,
fully_resolve_path,
get_default_workload_name_suffix,
get_workload_name,
)


class WorkloadTests(unittest.TestCase):
    """Integration test that loads the default integtest workload via Workload."""

    @staticmethod
    def setUpClass() -> None:
        # Kept as a staticmethod to match the other integration test classes in env/.
        IntegtestWorkspace.set_up_workspace()

    def test_workload(self) -> None:
        """
        Load the default workload for the integtest benchmark and scale factor, then check
        the query ID order and sanity check the text of each query.
        """
        # The helper calls below happen in the same order as they would if written as a
        # single nested expression.
        dbgym_cfg = IntegtestWorkspace.get_dbgym_cfg()
        workspace_path = IntegtestWorkspace.get_workspace_path()
        workload_name = get_workload_name(
            INTEGTEST_SCALE_FACTOR,
            get_default_workload_name_suffix(INTEGTEST_BENCHMARK),
        )
        workload_dpath = fully_resolve_path(
            dbgym_cfg,
            default_workload_path(workspace_path, INTEGTEST_BENCHMARK, workload_name),
        )

        workload = Workload(IntegtestWorkspace.get_dbgym_cfg(), workload_dpath)

        # Check the order of query IDs: a single seed, with queries numbered from 1.
        expected_query_order = [
            f"S{DEFAULT_TPCH_SEED}-Q{i}" for i in range(1, NUM_TPCH_QUERIES + 1)
        ]
        self.assertEqual(workload.get_query_order(), expected_query_order)

        # Sanity check all queries. assertIn (rather than assertTrue on an `in` expression)
        # makes a failure print the query text that did not contain "select".
        for query in workload.get_queries_in_order():
            self.assertIn("select", query.lower())


if __name__ == "__main__":
unittest.main()
2 changes: 2 additions & 0 deletions env/set_up_env_integtests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,7 @@ export DBGYM_CONFIG_PATH=env/env_integtests_dbgym_config.yaml # Note that this e
WORKSPACE_PATH=$(grep 'dbgym_workspace_path:' $DBGYM_CONFIG_PATH | sed 's/dbgym_workspace_path: //')

python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR
python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR

python3 task.py dbms postgres build
python3 task.py dbms postgres dbdata $BENCHMARK --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE
35 changes: 35 additions & 0 deletions env/workload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from pathlib import Path

from util.workspace import DBGymConfig, is_fully_resolved, open_and_save


class Workload:
    """
    An ordered set of queries loaded from a workload directory.

    The directory must contain a file named "order.txt". Each non-blank line of that file
    has the form "<query id>,<path to query file>". All query files are read in __init__.
    """

    def __init__(self, dbgym_cfg: DBGymConfig, workload_dpath: Path) -> None:
        """
        Read order.txt in workload_dpath and load the contents of every query it lists.

        Args:
            dbgym_cfg: Passed to open_and_save() for every file this constructor opens.
            workload_dpath: Directory containing order.txt. Must pass is_fully_resolved().

        Raises:
            AssertionError: If workload_dpath or any listed query path is not fully resolved,
                or if order.txt does not exist.
            ValueError: If a non-blank line of order.txt does not contain a comma.
        """
        self.dbgym_cfg = dbgym_cfg
        self.workload_dpath = workload_dpath
        assert is_fully_resolved(self.workload_dpath)

        # Maps query ID -> full text of the query file.
        self.queries: dict[str, str] = {}
        # Query IDs in the order they appear in order.txt.
        self.query_order: list[str] = []

        order_fpath = self.workload_dpath / "order.txt"
        assert order_fpath.exists()

        with open_and_save(self.dbgym_cfg, order_fpath) as f:
            for line in f:
                entry = self._parse_order_line(line)
                if entry is None:
                    # Blank line (e.g. trailing whitespace at the end of the file).
                    continue

                qid, qpath = entry
                assert is_fully_resolved(qpath)

                with open_and_save(self.dbgym_cfg, qpath) as qf:
                    self.queries[qid] = qf.read()
                self.query_order.append(qid)

    @staticmethod
    def _parse_order_line(line: str) -> tuple[str, Path] | None:
        """
        Parse one line of order.txt into (query ID, query file path).

        Only the first comma separates the two fields, so the path itself may contain
        commas. Returns None for a line that is empty after stripping whitespace.

        Raises:
            ValueError: If a non-blank line does not contain a comma.
        """
        stripped = line.strip()
        if not stripped:
            return None

        qid, separator, raw_qpath = stripped.partition(",")
        if not separator:
            raise ValueError(
                f"Malformed line in order.txt (expected '<qid>,<path>'): {stripped!r}"
            )
        return qid, Path(raw_qpath)

    def get_query(self, qid: str) -> str:
        """Return the text of the query with ID qid. Raises KeyError if qid is unknown."""
        return self.queries[qid]

    def get_query_order(self) -> list[str]:
        """Return the query IDs in the order they were listed in order.txt."""
        return self.query_order

    def get_queries_in_order(self) -> list[str]:
        """Return the text of every query, in the order given by get_query_order()."""
        return [self.queries[qid] for qid in self.query_order]
3 changes: 2 additions & 1 deletion scripts/run_protox_e2e_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import yaml

from benchmark.constants import DEFAULT_SCALE_FACTOR
from benchmark.tpch.constants import DEFAULT_TPCH_SEED
from util.pg import get_is_postgres_running
from util.workspace import (
default_embedder_path,
Expand Down Expand Up @@ -72,7 +73,7 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
if benchmark_name == "tpch":
scale_factor = 0.01
query_subset = "all"
workload_name_suffix = f"15721_15721_{query_subset}"
workload_name_suffix = f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_{query_subset}"
embedding_datagen_args = "--override-sample-limits lineitem,32768"
embedding_train_args = "--iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2"
tune_hpo_args = "--num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01"
Expand Down
3 changes: 2 additions & 1 deletion util/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import redis
import yaml

from benchmark.tpch.constants import DEFAULT_TPCH_SEED
from util.log import DBGYM_LOGGER_NAME
from util.shell import subprocess_run

Expand Down Expand Up @@ -96,7 +97,7 @@ def get_workload_name(scale_factor: float | str, suffix: str) -> str:

def get_default_workload_name_suffix(benchmark_name: str) -> str:
if benchmark_name == "tpch":
return "15721_15721_all"
return f"{DEFAULT_TPCH_SEED}_{DEFAULT_TPCH_SEED}_all"
if benchmark_name == "job":
return "all"
else:
Expand Down
Loading