diff --git a/README.md b/README.md
index c1bb8574..7989c40a 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,86 @@
-# Database Gym
\ No newline at end of file
+# 🛢️ Database Gym 🏋️
+[\[Slides\]](http://www.cidrdb.org/cidr2023/slides/p27-lim-slides.pdf) [\[Paper\]](https://www.cidrdb.org/cidr2023/papers/p27-lim.pdf)
+
+*An end-to-end research vehicle for the field of self-driving DBMSs.*
+
+## Quickstart
+
+These steps were tested on a fresh clone of this repository on Ubuntu 22.04.
+
+```
+# Set up dependencies.
+# You may want to create a Python virtual environment (e.g., with conda) before doing this.
+./dependency/install_dependencies.sh
+
+# Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune.
+./scripts/quickstart.sh postgres tpch 0.01 protox
+```
+
+## Overview
+
+Autonomous DBMS research often involves more engineering than research.
+As new advances in state-of-the-art technology are made, it is common to find that researchers have
+reimplemented the database tuning pipeline from scratch: workload capture, database setup,
+training data collection, model creation, model deployment, and more.
+Moreover, these bespoke pipelines make it difficult to combine different techniques even when they
+should be independent (e.g., using a different operator latency model in a tuning algorithm).
+
+The database gym project is our attempt at standardizing the APIs between these disparate tasks,
+allowing researchers to mix and match the different pipeline components.
+It draws inspiration from the Farama Foundation's Gymnasium (formerly OpenAI Gym), which
+accelerates the development and comparison of reinforcement learning algorithms by providing a set
+of agents, environments, and a standardized API for communicating between them.
+Through the database gym, we hope to save others time and reimplementation effort by
+providing an extensible open-source platform for autonomous DBMS research.
+
+This project is under active development.
+Currently, we decompose the database tuning pipeline into the following components:
+
+1. Workload: collection, forecasting, synthesis
+2. Database: database loading, instrumentation, orchestrating workload execution
+3. Agent: identifying tuning actions, suggesting an action
+
+## Repository Structure
+
+`task.py` is the entrypoint for all tasks.
+The tasks are grouped into categories that correspond to the top-level directories of the repository:
+
+- `benchmark` - tasks to generate data and queries for different benchmarks (e.g., TPC-H, JOB)
+- `dbms` - tasks to build and start DBMSs (e.g., PostgreSQL)
+- `tune` - tasks to train autonomous database tuning agents
+
+## Credits
+
+The Database Gym project rose from the ashes of the [NoisePage](https://db.cs.cmu.edu/projects/noisepage/) self-driving DBMS project.
+
+The first prototype was written by [Patrick Wang](https://github.com/wangpatrick57), integrating [Boot (VLDB 2024)](https://github.com/lmwnshn/boot) and [Proto-X (VLDB 2024)](https://github.com/17zhangw/protox) into a cohesive system.
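To make the Gymnasium analogy in the Overview concrete, here is a minimal editorial sketch of the standardized reset/step loop that the gym-style API refers to. It is not part of this repository or of the diff above: `ToyTuningEnv`, its single knob, and the latency-based reward are hypothetical stand-ins, and the sketch assumes the `gymnasium` and `numpy` packages are installed.

```python
# A minimal, self-contained sketch of a Gymnasium-style tuning loop.
# Hypothetical illustration only; this is not the dbgym or Proto-X API.
import gymnasium as gym
import numpy as np


class ToyTuningEnv(gym.Env):
    """Toy environment: the 'DBMS' is a quadratic bowl over one knob."""

    def __init__(self) -> None:
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-10.0, high=10.0, shape=(1,), dtype=np.float32)
        self._knob = np.zeros(1, dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        self._knob = np.zeros(1, dtype=np.float32)
        return self._knob.copy(), {}

    def step(self, action):
        # Apply the knob delta and report a pretend latency (lower is better).
        self._knob = np.clip(self._knob + action, -10.0, 10.0)
        latency = float(self._knob[0] ** 2)
        reward = -latency
        return self._knob.copy(), reward, False, False, {}


env = ToyTuningEnv()
obs, info = env.reset(seed=0)
for _ in range(5):
    action = env.action_space.sample()  # a real agent would choose the action
    obs, reward, terminated, truncated, info = env.step(action)
    print(f"knob={obs[0]:+.2f} reward={reward:+.3f}")
```

The point of the sketch is only the interface: any agent that speaks `reset()`/`step()` can be paired with any environment that exposes them, which is the mix-and-match property the Overview describes.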
+ +## Citing This Repository + +If you use this repository in an academic paper, please cite: + +``` +@inproceedings{lim23, + author = {Lim, Wan Shen and Butrovich, Matthew and Zhang, William and Crotty, Andrew and Ma, Lin and Xu, Peijing and Gehrke, Johannes and Pavlo, Andrew}, + title = {Database Gyms}, + booktitle = {{CIDR} 2023, Conference on Innovative Data Systems Research}, + year = {2023}, + url = {https://db.cs.cmu.edu/papers/2023/p27-lim.pdf}, + } +``` + +Additionally, please cite any module-specific paper that is relevant to your use. + +**Accelerating Training Data Generation** + +``` +(citation pending) +Boot, appearing at VLDB 2024. +``` + +**Simultaneously Tuning Multiple Configuration Spaces with Proto Actions** + +``` +(citation pending) +Proto-X, appearing at VLDB 2024. +``` diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index d5c8c407..82adeff5 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig): @tpch_group.command(name="data") @click.argument("scale-factor", type=float) @click.pass_obj -# The reason generate-data is separate from create-pgdata is because generate-data is generic -# to all DBMSs while create-pgdata is specific to Postgres. +# The reason generate data is separate from create dbdata is because generate-data is generic +# to all DBMSs while create dbdata is specific to a single DBMS. def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float): _clone(dbgym_cfg) _generate_data(dbgym_cfg, scale_factor) diff --git a/dbms/postgres/build_repo.sh b/dbms/postgres/build_repo.sh index 16774edd..271f7056 100755 --- a/dbms/postgres/build_repo.sh +++ b/dbms/postgres/build_repo.sh @@ -4,34 +4,34 @@ set -euxo pipefail REPO_REAL_PARENT_DPATH="$1" -# download and make postgres from the boot repository +# Download and make postgres from the boot repository. mkdir -p "${REPO_REAL_PARENT_DPATH}" cd "${REPO_REAL_PARENT_DPATH}" -git clone git@github.com:lmwnshn/boot.git --single-branch --branch boot --depth 1 +git clone git@github.com:lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1 cd ./boot ./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres" make clean make install-world-bin -j4 -# download and make bytejack -cd ./cmudb/extension/bytejack_rs/ +# Download and make boot. +cd ./cmudb/extension/boot_rs/ cargo build --release -cbindgen . -o target/bytejack_rs.h --lang c +cbindgen . -o target/boot_rs.h --lang c cd "${REPO_REAL_PARENT_DPATH}/boot" -cd ./cmudb/extension/bytejack/ +cd ./cmudb/extension/boot/ make clean make install -j cd "${REPO_REAL_PARENT_DPATH}/boot" -# download and make hypopg +# Download and make hypopg. git clone git@github.com:HypoPG/hypopg.git cd ./hypopg PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install cd "${REPO_REAL_PARENT_DPATH}/boot" -# download and make pg_hint_plan -# we need -L to follow links +# Download and make pg_hint_plan. +# We need -L to follow links. curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz tar -xzf REL15_1_5_1.tar.gz rm REL15_1_5_1.tar.gz diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 75b03650..f81a877f 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -1,5 +1,5 @@ """ -At a high level, this file's goal is to (1) install+build postgres and (2) create pgdata. +At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata). 
On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage a Postgres instance during agent tuning. util.pg provides helpers used by *both* of the above files (as well as other files). @@ -10,11 +10,10 @@ import subprocess from pathlib import Path import click -import ssd_checker from benchmark.tpch.load_info import TpchLoadInfo from dbms.load_info_base_class import LoadInfoBaseClass -from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath +from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd from util.shell import subprocess_run from sqlalchemy import Connection from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME @@ -32,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig): @postgres_group.command( name="build", - help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.", + help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.", ) @click.pass_obj @click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.") @@ -41,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool): @postgres_group.command( - name="pgdata", - help="Build a .tgz file of pgdata with various specifications for its contents.", + name="dbdata", + help="Build a .tgz file of dbdata with various specifications for its contents.", ) @click.pass_obj @click.argument("benchmark_name", type=str) @click.option("--scale-factor", type=float, default=1) @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.") @click.option( - "--intended-pgdata-hardware", + "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), default="hdd", - help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.", + help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.", ) @click.option( - "--pgdata-parent-dpath", + "--dbdata-parent-dpath", default=None, type=Path, - help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. 
The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", ) -def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path): +def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path): # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) - if pgdata_parent_dpath == None: - pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + if dbdata_parent_dpath == None: + dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) # Convert all input paths to absolute paths pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) - pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath) + dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) # Check assertions on args - if intended_pgdata_hardware == "hdd": - assert not ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD" - elif intended_pgdata_hardware == "ssd": - assert ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD" + if intended_dbdata_hardware == "hdd": + assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + elif intended_dbdata_hardware == "ssd": + assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False - # Create pgdata - _create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath) + # Create dbdata + _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath) def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path: @@ -109,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild): dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}") -def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None: +def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None: """ - I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This + I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This is because, while the generated data is deterministic given benchmark_name and scale_factor, any - change in the _create_pgdata() function would result in a different pgdata. Since _create_pgdata() + change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata() may change somewhat frequently, I decided to get rid of the footgun of having changes to - _create_pgdata() not propagate to [pgdata].tgz by default. + _create_dbdata() not propagate to [dbdata].tgz by default. """ - # It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. 
- pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created" - # We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists - if pgdata_dpath.exists(): - shutil.rmtree(pgdata_dpath) + # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. + dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created" + # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists + if dbdata_dpath.exists(): + shutil.rmtree(dbdata_dpath) # Call initdb. # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run. save_file(dbgym_cfg, pgbin_path / "initdb") - subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path) + subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path) - # Start Postgres (all other pgdata setup requires postgres to be started). + # Start Postgres (all other dbdata setup requires postgres to be started). # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead. - start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath) + start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) # Set up Postgres. - _generic_pgdata_setup(dbgym_cfg) - _load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor) + _generic_dbdata_setup(dbgym_cfg) + _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor) # Stop Postgres so that we don't "leak" processes. - stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath) + stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) # Create .tgz file. - # Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir. - pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path( + # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir. + dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path( mkdir=True - ) / get_pgdata_tgz_name(benchmark_name, scale_factor) - # We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath. - subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath) + ) / get_dbdata_tgz_name(benchmark_name, scale_factor) + # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath. + subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath) # Create symlink. - # Only link at the end so that the link only ever points to a complete pgdata. - pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath) - dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}") + # Only link at the end so that the link only ever points to a complete dbdata. + dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath) + dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}") -def _generic_pgdata_setup(dbgym_cfg: DBGymConfig): +def _generic_dbdata_setup(dbgym_cfg: DBGymConfig): # get necessary vars pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve() assert pgbin_real_dpath.exists() @@ -182,15 +181,15 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig): cwd=pgbin_real_dpath, ) - # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database - # as opposed to using databases named after the benchmark + # Create the dbgym database. 
Since one dbdata dir maps to one benchmark, all benchmarks will use the same database + # as opposed to using databases named after the benchmark. subprocess_run( f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", cwd=pgbin_real_dpath, ) -def _load_benchmark_into_pgdata( +def _load_benchmark_into_dbdata( dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float ): with create_conn(use_psycopg=False) as conn: @@ -198,13 +197,13 @@ def _load_benchmark_into_pgdata( load_info = TpchLoadInfo(dbgym_cfg, scale_factor) else: raise AssertionError( - f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented" + f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented" ) - _load_into_pgdata(dbgym_cfg, conn, load_info) + _load_into_dbdata(dbgym_cfg, conn, load_info) -def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass): +def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass): sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath()) # truncate all tables first before even loading a single one @@ -223,21 +222,21 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI sql_file_execute(dbgym_cfg, conn, constraints_fpath) -def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True) +def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: + _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True) -def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None: - _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False) +def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: + _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False) -def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None: +def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None: # They should be absolute paths and should exist assert pgbin_path.is_absolute() and pgbin_path.exists() - assert pgdata_dpath.is_absolute() and pgdata_dpath.exists() + assert dbdata_dpath.is_absolute() and dbdata_dpath.exists() # The inputs may be symlinks so we need to resolve them first pgbin_real_dpath = pgbin_path.resolve() - pgdata_dpath = pgdata_dpath.resolve() + dbdata_dpath = dbdata_dpath.resolve() pgport = DEFAULT_POSTGRES_PORT save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl") @@ -245,7 +244,7 @@ def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpa # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None. # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do. 
- result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True) + result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True) result.check_returncode() else: - subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath) \ No newline at end of file + subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath) \ No newline at end of file diff --git a/dependency/install_dependencies.sh b/dependency/install_dependencies.sh new file mode 100755 index 00000000..8a516880 --- /dev/null +++ b/dependency/install_dependencies.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# You may want to create a conda environment before doing this +pip install -r dependency/requirements.txt +cat dependency/apt_requirements.txt | xargs sudo apt-get install -y +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh \ No newline at end of file diff --git a/dependency/requirements.txt b/dependency/requirements.txt index 1b58b1c9..ba32594c 100644 --- a/dependency/requirements.txt +++ b/dependency/requirements.txt @@ -122,5 +122,4 @@ virtualenv==20.25.0 Werkzeug==3.0.1 wrapt==1.14.1 zipp==3.17.0 -ssd_checker==1.0.3 redis==5.0.3 diff --git a/dependency/rust.sh b/dependency/rust.sh deleted file mode 100755 index 9af316fc..00000000 --- a/dependency/rust.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh \ No newline at end of file diff --git a/experiments/load_per_machine_envvars.sh b/experiments/load_per_machine_envvars.sh index 905c6c01..b9772d3c 100644 --- a/experiments/load_per_machine_envvars.sh +++ b/experiments/load_per_machine_envvars.sh @@ -2,9 +2,9 @@ host=$(hostname) if [ "$host" == "dev4" ]; then - export PGDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/ + export DBDATA_PARENT_DPATH=/mnt/nvme1n1/phw2/dbgym_tmp/ elif [ "$host" == "dev6" ]; then - export PGDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/ + export DBDATA_PARENT_DPATH=/mnt/nvme0n1/phw2/dbgym_tmp/ else echo "Did not recognize host \"$host\"" exit 1 diff --git a/experiments/protox_tpch_sf0point1/main.sh b/experiments/protox_tpch_sf0point1/main.sh index 5a111a4f..480f28ca 100755 --- a/experiments/protox_tpch_sf0point1/main.sh +++ b/experiments/protox_tpch_sf0point1/main.sh @@ -3,31 +3,31 @@ set -euxo pipefail SCALE_FACTOR=0.1 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd . ./experiments/load_per_machine_envvars.sh -echo $PGDATA_PARENT_DPATH +echo $DBDATA_PARENT_DPATH # space for testing. 
uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -# python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH -python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.2 -python3 task.py --no-startup-check tune protox agent replay tpch --scale-factor $SCALE_FACTOR +# python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 0.1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH +python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.2 +python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR exit 0 # benchmark -python3 task.py --no-startup-check benchmark tpch data $SCALE_FACTOR -python3 task.py --no-startup-check benchmark tpch workload --scale-factor $SCALE_FACTOR +python3 task.py benchmark tpch data $SCALE_FACTOR +python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR # postgres -python3 task.py --no-startup-check dbms postgres build -python3 task.py --no-startup-check dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms postgres build +python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH exit 0 # embedding -python3 task.py --no-startup-check tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash -python3 task.py --no-startup-check tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 +python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash +python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 # agent -python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 --query-timeout 15 --tune-duration-during-hpo 1 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot -python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR -python3 task.py --no-startup-check tune protox agent replay tpch --scale-factor $SCALE_FACTOR +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 4 --max-concurrent 4 --workload-timeout 100 
--query-timeout 15 --tune-duration-during-hpo 1 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR +python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR diff --git a/experiments/protox_tpch_sf10/main.sh b/experiments/protox_tpch_sf10/main.sh index 2627c942..62814340 100755 --- a/experiments/protox_tpch_sf10/main.sh +++ b/experiments/protox_tpch_sf10/main.sh @@ -3,29 +3,29 @@ set -euxo pipefail SCALE_FACTOR=10 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd . ./experiments/load_per_machine_envvars.sh # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot -# python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 4 -# python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR --enable-boot-during-tune --tune-duration-during-tune 4 -# python3 task.py --no-startup-check tune protox agent replay tpch --scale-factor $SCALE_FACTOR -# python3 task.py --no-startup-check tune protox agent replay tpch --scale-factor $SCALE_FACTOR --boot-enabled-during-tune +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot +# python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 4 +# python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --enable-boot-during-tune --tune-duration-during-tune 4 +# python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR +# python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR --boot-enabled-during-tune exit 0 # benchmark -python3 task.py --no-startup-check benchmark tpch data $SCALE_FACTOR -python3 task.py --no-startup-check benchmark tpch workload --scale-factor $SCALE_FACTOR +python3 task.py benchmark tpch data $SCALE_FACTOR +python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR # postgres -python3 task.py --no-startup-check dbms postgres build -python3 task.py --no-startup-check dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms postgres build +python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # embedding -python3 task.py --no-startup-check tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH -python3 task.py --no-startup-check tune protox embedding train tpch --scale-factor $SCALE_FACTOR --train-max-concurrent 10 +python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE 
--dbdata-parent-dpath $DBDATA_PARENT_DPATH +python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --train-max-concurrent 10 # agent -python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot -python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --max-concurrent 4 --tune-duration-during-hpo 4 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR diff --git a/misc/utils.py b/misc/utils.py index bec81d97..fb1dbde4 100644 --- a/misc/utils.py +++ b/misc/utils.py @@ -48,8 +48,8 @@ def get_scale_factor_string(scale_factor: float | str) -> str: else: return str(scale_factor).replace(".", "point") -def get_pgdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: - return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_pgdata.tgz" +def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: + return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" # Other parameters @@ -134,15 +134,15 @@ def get_pgdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: / "data" / (workload_name + ".link") ) -default_pristine_pgdata_snapshot_path = ( +default_pristine_dbdata_snapshot_path = ( lambda workspace_path, benchmark_name, scale_factor: get_symlinks_path_from_workspace_path( workspace_path ) / "dbgym_dbms_postgres" / "data" - / (get_pgdata_tgz_name(benchmark_name, scale_factor) + ".link") + / (get_dbdata_tgz_name(benchmark_name, scale_factor) + ".link") ) -default_pgdata_parent_dpath = ( +default_dbdata_parent_dpath = ( lambda workspace_path: get_tmp_path_from_workspace_path( workspace_path ) @@ -166,13 +166,11 @@ class DBGymConfig: Global configurations that apply to all parts of DB-Gym """ - def __init__(self, config_path, startup_check=False): + def __init__(self, config_path): """ Parameters ---------- config_path : Path - startup_check : bool - True if startup_check shoul """ assert is_base_git_dir( os.getcwd() @@ -188,18 +186,6 @@ def __init__(self, config_path, startup_check=False): Path(yaml_config["dbgym_workspace_path"]).resolve().absolute() ) - # Quickly display options. - if startup_check: - msg = ( - "💩💩💩 CMU-DB Database Gym: github.com/cmu-db/dbgym 💩💩💩\n" - f"\tdbgym_workspace_path: {dbgym_workspace_path}\n" - "\n" - "Proceed?" 
- ) - if not click.confirm(msg): - print("Goodbye.") - sys.exit(0) - self.path: Path = config_path self.cur_path_list: list[str] = ["dbgym"] self.root_yaml: dict = yaml_config @@ -216,8 +202,8 @@ def __init__(self, config_path, startup_check=False): ) self.dbgym_symlinks_path.mkdir(parents=True, exist_ok=True) # tmp is a workspace for this run only - # one use for it is to place the unzipped pgdata - # there's no need to save the actual pgdata dir in run_*/ because we just save a symlink to + # one use for it is to place the unzipped dbdata + # there's no need to save the actual dbdata dir in run_*/ because we just save a symlink to # the .tgz file we unzipped self.dbgym_tmp_path = get_tmp_path_from_workspace_path(self.dbgym_workspace_path) if self.dbgym_tmp_path.exists(): @@ -589,3 +575,19 @@ def make_redis_started(port: int): # When you start Redis in daemon mode, it won't let you know if it's started, so we ping again to check r = redis.Redis(port=port) r.ping() + + +def is_ssd(path: Path) -> bool: + try: + device = subprocess.check_output(['df', path]).decode().split('\n')[1].split()[0] + device_basename = os.path.basename(device) + lsblk_output = subprocess.check_output(['lsblk', '-d', '-o', 'name,rota']).decode() + for line in lsblk_output.split('\n')[1:]: + parts = line.split() + if parts and parts[0] == device_basename: + is_ssd = int(parts[1]) == 0 + return is_ssd + return False + except Exception as e: + print(f"An error occurred: {e}") + return False \ No newline at end of file diff --git a/scripts/pat_test.sh b/scripts/pat_test.sh index afab9108..4353a5dc 100755 --- a/scripts/pat_test.sh +++ b/scripts/pat_test.sh @@ -3,31 +3,31 @@ set -euxo pipefail SCALE_FACTOR=0.01 -INTENDED_PGDATA_HARDWARE=ssd +INTENDED_DBDATA_HARDWARE=ssd . ./experiments/load_per_machine_envvars.sh # space for testing. 
uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) -python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot -python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.02 -python3 task.py --no-startup-check tune protox agent replay tpch --scale-factor $SCALE_FACTOR +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR --tune-duration-during-tune 0.02 +python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR exit 0 # benchmark -python3 task.py --no-startup-check benchmark tpch data $SCALE_FACTOR -python3 task.py --no-startup-check benchmark tpch workload --scale-factor $SCALE_FACTOR +python3 task.py benchmark tpch data $SCALE_FACTOR +python3 task.py benchmark tpch workload --scale-factor $SCALE_FACTOR # postgres -python3 task.py --no-startup-check dbms postgres build -python3 task.py --no-startup-check dbms postgres pgdata tpch --scale-factor $SCALE_FACTOR --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH +python3 task.py dbms postgres build +python3 task.py dbms postgres dbdata tpch --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH exit 0 # embedding -# python3 task.py --no-startup-check tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --default-sample-limit 64 --file-limit 64 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # short datagen for testing -python3 task.py --no-startup-check tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH # long datagen so that train doesn't crash -python3 task.py --no-startup-check tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 +# python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --default-sample-limit 64 --file-limit 64 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # short datagen for testing +python3 task.py tune protox embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH # long datagen so that train doesn't crash +python3 task.py tune protox embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 # agent -python3 task.py --no-startup-check tune protox agent hpo tpch --scale-factor 
$SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-pgdata-hardware $INTENDED_PGDATA_HARDWARE --pgdata-parent-dpath $PGDATA_PARENT_DPATH --build-space-good-for-boot -python3 task.py --no-startup-check tune protox agent tune tpch --scale-factor $SCALE_FACTOR -python3 task.py --no-startup-check tune protox agent replay tpch --scale-factor $SCALE_FACTOR +python3 task.py tune protox agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-dpath $DBDATA_PARENT_DPATH --build-space-good-for-boot +python3 task.py tune protox agent tune tpch --scale-factor $SCALE_FACTOR +python3 task.py tune protox agent replay tpch --scale-factor $SCALE_FACTOR diff --git a/scripts/quickstart.sh b/scripts/quickstart.sh new file mode 100755 index 00000000..7d082726 --- /dev/null +++ b/scripts/quickstart.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -euxo pipefail + +DBMS=$1 +BENCHMARK=$2 +SCALE_FACTOR=$3 +AGENT=$4 + +# Benchmark +python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR +python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR + +# DBMS +python3 task.py dbms $DBMS build +python3 task.py dbms $DBMS dbdata tpch --scale-factor $SCALE_FACTOR + +# Tune +python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" # long datagen so that train doesn't crash +python3 task.py tune $AGENT embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 +python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 --build-space-good-for-boot +python3 task.py tune $AGENT agent tune tpch --scale-factor $SCALE_FACTOR +python3 task.py tune $AGENT agent replay tpch --scale-factor $SCALE_FACTOR diff --git a/scripts/wan_test.sh b/scripts/wan_test.sh index fb39184f..a700dd31 100755 --- a/scripts/wan_test.sh +++ b/scripts/wan_test.sh @@ -3,17 +3,17 @@ set -euxo pipefail # Build Postgres -python3 task.py --no-startup-check dbms postgres repo +python3 task.py dbms postgres repo # Generate TPC-H -python3 task.py --no-startup-check benchmark tpch generate-data 1 -python3 task.py --no-startup-check benchmark tpch generate-workload queries_15721_15723 15721 15723 +python3 task.py benchmark tpch generate-data 1 +python3 task.py benchmark tpch generate-workload queries_15721_15723 15721 15723 # Create tpch_sf1.tgz -python3 task.py --no-startup-check dbms postgres pgdata tpch --scale-factor 1 +python3 task.py dbms postgres dbdata tpch --scale-factor 1 # Run Proto-X -python3 task.py --no-startup-check dbms postgres start -python3 task.py --no-startup-check tune protox embedding datagen tpch queries_15721_15723 --connection-str "host=localhost port=15721 dbname=tpch_sf1 user=noisepage_user password=noisepage_pass" --override-sample-limits "lineitem,32768" -python3 task.py --no-startup-check tune protox embedding train tpch queries_15721_15723 --iterations-per-epoch 1 --num-samples 4 --train-max-concurrent 4 --num-points-to-sample 32 --max-segments 3 -python3 task.py --no-startup-check dbms postgres stop +python3 task.py dbms postgres start +python3 task.py tune protox 
embedding datagen tpch queries_15721_15723 --connection-str "host=localhost port=15721 dbname=tpch_sf1 user=noisepage_user password=noisepage_pass" --override-sample-limits "lineitem,32768" +python3 task.py tune protox embedding train tpch queries_15721_15723 --iterations-per-epoch 1 --num-samples 4 --train-max-concurrent 4 --num-points-to-sample 32 --max-segments 3 +python3 task.py dbms postgres stop diff --git a/task.py b/task.py index 6f952656..c20cdf62 100644 --- a/task.py +++ b/task.py @@ -19,11 +19,10 @@ @click.group() @click.option("--config-path", default="config.yaml") -@click.option("--no-startup-check", is_flag=True) @click.pass_context -def task(ctx, config_path, no_startup_check): +def task(ctx, config_path): """💩💩💩 CMU-DB Database Gym: github.com/cmu-db/dbgym 💩💩💩""" - ctx.obj = DBGymConfig(config_path, startup_check=not no_startup_check) + ctx.obj = DBGymConfig(config_path) @click.group(name="config") diff --git a/tune/protox/agent/build_trial.py b/tune/protox/agent/build_trial.py index 58e1aeb7..53e782a5 100644 --- a/tune/protox/agent/build_trial.py +++ b/tune/protox/agent/build_trial.py @@ -158,8 +158,8 @@ def _build_utilities( pg_conn = PostgresConn( dbgym_cfg=dbgym_cfg, pgport=pgport, - pristine_pgdata_snapshot_fpath=Path(hpo_params["pgconn_info"]["pristine_pgdata_snapshot_path"]), - pgdata_parent_dpath=Path(hpo_params["pgconn_info"]["pgdata_parent_dpath"]), + pristine_dbdata_snapshot_fpath=Path(hpo_params["pgconn_info"]["pristine_dbdata_snapshot_path"]), + dbdata_parent_dpath=Path(hpo_params["pgconn_info"]["dbdata_parent_dpath"]), pgbin_path=Path(hpo_params["pgconn_info"]["pgbin_path"]), enable_boot=enable_boot, boot_config_fpath=hpo_params["boot_config_fpath"][str(tuning_mode)], diff --git a/tune/protox/agent/coerce_config.py b/tune/protox/agent/coerce_config.py index 3c19900c..db8f06eb 100644 --- a/tune/protox/agent/coerce_config.py +++ b/tune/protox/agent/coerce_config.py @@ -35,8 +35,8 @@ def coerce_config(dbgym_cfg: DBGymConfig, space: dict[str, Any], hpo_params: dic "pgport": 5432, "pguser": "admin", "pgpass": "", - "pristine_pgdata_snapshot_path": "/mnt/nvme0n1/wz2/noisepage/pgdata", - "pgdata_parent_dpath": "/mnt/nvme0n1/wz2/noisepage/", + "pristine_dbdata_snapshot_path": "/mnt/nvme0n1/wz2/noisepage/pgdata", + "dbdata_parent_dpath": "/mnt/nvme0n1/wz2/noisepage/", "pgbin_path": "/mnt/nvme0n1/wz2/noisepage/", }, "benchmark_config": benchmark_config, diff --git a/tune/protox/agent/hpo.py b/tune/protox/agent/hpo.py index 60498514..bc3d8432 100644 --- a/tune/protox/agent/hpo.py +++ b/tune/protox/agent/hpo.py @@ -13,7 +13,6 @@ from typing import Any, Optional, Union import random import click -import ssd_checker import ray from ray.tune import Trainable from ray.tune.schedulers import FIFOScheduler @@ -23,22 +22,22 @@ from ray.train import SyncConfig from tune.protox.agent.build_trial import build_trial -from misc.utils import DEFAULT_BOOT_CONFIG_FPATH, DEFAULT_WORKLOAD_TIMEOUT, DBGymConfig, TuningMode, link_result, open_and_save, restart_ray, conv_inputpath_to_realabspath, default_pristine_pgdata_snapshot_path, default_workload_path, default_embedder_path, default_benchmark_config_path, default_benchbase_config_path, WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER, DEFAULT_SYSKNOBS_PATH, default_pgbin_path, workload_name_fn, default_pgdata_parent_dpath, default_hpoed_agent_params_fname +from misc.utils import DEFAULT_BOOT_CONFIG_FPATH, DEFAULT_WORKLOAD_TIMEOUT, DBGymConfig, TuningMode, link_result, 
open_and_save, restart_ray, conv_inputpath_to_realabspath, default_pristine_dbdata_snapshot_path, default_workload_path, default_embedder_path, default_benchmark_config_path, default_benchbase_config_path, WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER, DEFAULT_SYSKNOBS_PATH, default_pgbin_path, workload_name_fn, default_dbdata_parent_dpath, default_hpoed_agent_params_fname, is_ssd METRIC_NAME = "Best Metric" class AgentHPOArgs: - def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot): + def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot): self.benchmark_name = benchmark_name self.workload_name = workload_name self.embedder_path = embedder_path self.benchmark_config_path = benchmark_config_path self.benchbase_config_path = benchbase_config_path self.sysknobs_path = sysknobs_path - self.pristine_pgdata_snapshot_path = pristine_pgdata_snapshot_path - self.pgdata_parent_dpath = pgdata_parent_dpath + self.pristine_dbdata_snapshot_path = pristine_dbdata_snapshot_path + self.dbdata_parent_dpath = dbdata_parent_dpath self.pgbin_path = pgbin_path self.workload_path = workload_path self.seed = seed @@ -91,28 +90,22 @@ def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_confi help=f"The path to the file configuring the space of system knobs the tuner can tune.", ) @click.option( - "--pristine-pgdata-snapshot-path", + "--pristine-dbdata-snapshot-path", default=None, type=Path, - help=f"The path to the .tgz snapshot of the pgdata directory to use as a starting point for tuning. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", + help=f"The path to the .tgz snapshot of the dbdata directory to use as a starting point for tuning. The default is {default_pristine_dbdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", ) @click.option( - "--pristine-pgdata-snapshot-path", - default=None, - type=Path, - help=f"The path to the .tgz snapshot of the pgdata directory to use as a starting point for tuning. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", -) -@click.option( - "--intended-pgdata-hardware", + "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), default="hdd", - help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.", + help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.", ) @click.option( - "--pgdata-parent-dpath", + "--dbdata-parent-dpath", default=None, type=Path, - help=f"The path to the parent directory of the pgdata which will be actively tuned. 
The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", ) @click.option( "--pgbin-path", @@ -199,9 +192,9 @@ def hpo( benchmark_config_path, benchbase_config_path, sysknobs_path, - pristine_pgdata_snapshot_path, - intended_pgdata_hardware, - pgdata_parent_dpath, + pristine_dbdata_snapshot_path, + intended_dbdata_hardware, + dbdata_parent_dpath, pgbin_path, workload_path, seed, @@ -223,10 +216,10 @@ def hpo( benchmark_config_path = default_benchmark_config_path(benchmark_name) if benchbase_config_path == None: benchbase_config_path = default_benchbase_config_path(benchmark_name) - if pristine_pgdata_snapshot_path == None: - pristine_pgdata_snapshot_path = default_pristine_pgdata_snapshot_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor) - if pgdata_parent_dpath == None: - pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + if pristine_dbdata_snapshot_path == None: + pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor) + if dbdata_parent_dpath == None: + dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) if workload_path == None: @@ -239,22 +232,22 @@ def hpo( benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) benchbase_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchbase_config_path) sysknobs_path = conv_inputpath_to_realabspath(dbgym_cfg, sysknobs_path) - pristine_pgdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_pgdata_snapshot_path) - pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath) + pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_dbdata_snapshot_path) + dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path) boot_config_fpath_during_hpo = conv_inputpath_to_realabspath(dbgym_cfg, boot_config_fpath_during_hpo) # Check assertions on args - if intended_pgdata_hardware == "hdd": - assert not ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD" - elif intended_pgdata_hardware == "ssd": - assert ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD" + if intended_dbdata_hardware == "hdd": + assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + elif intended_dbdata_hardware == "ssd": + assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False # Create args object - hpo_args = AgentHPOArgs(benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot) + hpo_args = 
AgentHPOArgs(benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot) _tune_hpo(dbgym_cfg, hpo_args) @@ -607,8 +600,8 @@ def _tune_hpo(dbgym_cfg: DBGymConfig, hpo_args: AgentHPOArgs) -> None: hpo_args.workload_path, embedder_path, pgconn_info={ - "pristine_pgdata_snapshot_path": hpo_args.pristine_pgdata_snapshot_path, - "pgdata_parent_dpath": hpo_args.pgdata_parent_dpath, + "pristine_dbdata_snapshot_path": hpo_args.pristine_dbdata_snapshot_path, + "dbdata_parent_dpath": hpo_args.dbdata_parent_dpath, "pgbin_path": hpo_args.pgbin_path, }, benchbase_config=benchbase_config, diff --git a/tune/protox/embedding/datagen.py b/tune/protox/embedding/datagen.py index 940a3dfd..3e4889c8 100644 --- a/tune/protox/embedding/datagen.py +++ b/tune/protox/embedding/datagen.py @@ -13,7 +13,6 @@ import yaml from sklearn.preprocessing import quantile_transform import shutil -import ssd_checker from misc.utils import ( BENCHMARK_NAME_PLACEHOLDER, @@ -24,14 +23,15 @@ conv_inputpath_to_realabspath, default_benchmark_config_path, default_workload_path, - default_pristine_pgdata_snapshot_path, + default_pristine_dbdata_snapshot_path, default_pgbin_path, traindata_fname, link_result, open_and_save, save_file, workload_name_fn, - default_pgdata_parent_dpath, + default_dbdata_parent_dpath, + is_ssd, ) from tune.protox.embedding.loss import COST_COLUMNS from tune.protox.env.space.primitive_space.index_space import IndexSpace @@ -69,22 +69,22 @@ @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.") # TODO(phw2): need to run pgtune before gathering data @click.option( - "--pristine-pgdata-snapshot-path", + "--pristine-dbdata-snapshot-path", default=None, type=Path, - help=f"The path to the .tgz snapshot of the pgdata directory to build an embedding space over. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", + help=f"The path to the .tgz snapshot of the dbdata directory to build an embedding space over. The default is {default_pristine_dbdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", ) @click.option( - "--intended-pgdata-hardware", + "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), default="hdd", - help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.", + help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.", ) @click.option( - "--pgdata-parent-dpath", + "--dbdata-parent-dpath", default=None, type=Path, - help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pristine_pgdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", + help=f"The path to the parent directory of the dbdata which will be actively tuned. 
The default is {default_pristine_dbdata_snapshot_path(WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER)}.", ) @click.option( "--benchmark-config-path", @@ -154,9 +154,9 @@ def datagen( query_subset, scale_factor, pgbin_path, - pristine_pgdata_snapshot_path, - intended_pgdata_hardware, - pgdata_parent_dpath, + pristine_dbdata_snapshot_path, + intended_dbdata_hardware, + dbdata_parent_dpath, benchmark_config_path, workload_path, seed, @@ -191,12 +191,12 @@ def datagen( ) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) - if pristine_pgdata_snapshot_path == None: - pristine_pgdata_snapshot_path = default_pristine_pgdata_snapshot_path( + if pristine_dbdata_snapshot_path == None: + pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path( dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor ) - if pgdata_parent_dpath == None: - pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + if dbdata_parent_dpath == None: + dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) if max_concurrent == None: max_concurrent = os.cpu_count() if seed == None: @@ -206,14 +206,14 @@ def datagen( workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path) benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) - pristine_pgdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_pgdata_snapshot_path) - pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath) + pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_dbdata_snapshot_path) + dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) # Check assertions on args - if intended_pgdata_hardware == "hdd": - assert not ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD" - elif intended_pgdata_hardware == "ssd": - assert ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD" + if intended_dbdata_hardware == "hdd": + assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + elif intended_dbdata_hardware == "ssd": + assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False @@ -238,7 +238,7 @@ def datagen( # Group args together to reduce the # of parameters we pass into functions # I chose to group them into separate objects instead because it felt hacky to pass a giant args object into every function generic_args = EmbeddingDatagenGenericArgs( - benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath + benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath ) dir_gen_args = EmbeddingDirGenArgs( leading_col_tbls, @@ -252,31 +252,31 @@ def datagen( # run all steps start_time = time.time() - pgdata_dpath = untar_snapshot(dbgym_cfg, generic_args.pristine_pgdata_snapshot_path, generic_args.pgdata_parent_dpath) + dbdata_dpath = untar_snapshot(dbgym_cfg, generic_args.pristine_dbdata_snapshot_path, generic_args.dbdata_parent_dpath) pgbin_path = 
default_pgbin_path(dbgym_cfg.dbgym_workspace_path) - start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath) + start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) _gen_traindata_dir(dbgym_cfg, generic_args, dir_gen_args) _combine_traindata_dir_into_parquet(dbgym_cfg, generic_args, file_gen_args) datagen_duration = time.time() - start_time with open(f"{dbgym_cfg.dbgym_this_run_path}/datagen_time.txt", "w") as f: f.write(f"{datagen_duration}") - stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath) + stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) -def untar_snapshot(dbgym_cfg: DBGymConfig, pgdata_snapshot_fpath: Path, pgdata_parent_dpath: Path) -> Path: +def untar_snapshot(dbgym_cfg: DBGymConfig, dbdata_snapshot_fpath: Path, dbdata_parent_dpath: Path) -> Path: # It should be an absolute path and it should exist - assert pgdata_snapshot_fpath.is_absolute() and pgdata_snapshot_fpath.exists(), f"untar_snapshot(): pgdata_snapshot_fpath ({pgdata_snapshot_fpath}) either doesn't exist or is not absolute" + assert dbdata_snapshot_fpath.is_absolute() and dbdata_snapshot_fpath.exists(), f"untar_snapshot(): dbdata_snapshot_fpath ({dbdata_snapshot_fpath}) either doesn't exist or is not absolute" # It may be a symlink, so we need to resolve it first - pgdata_snapshot_real_fpath = pgdata_snapshot_fpath.resolve() - save_file(dbgym_cfg, pgdata_snapshot_real_fpath) - pgdata_dpath = pgdata_parent_dpath / "pgdata" - # Make the parent dir and the pgdata dir. Note how we require that pgdata_dpath does not exist while it's ok if the parent does. - pgdata_parent_dpath.mkdir(parents=True, exist_ok=True) - if pgdata_dpath.exists(): - shutil.rmtree(pgdata_dpath) - pgdata_dpath.mkdir(parents=False, exist_ok=False) - subprocess_run(f"tar -xzf {pgdata_snapshot_real_fpath} -C {pgdata_dpath}") - return pgdata_dpath + dbdata_snapshot_real_fpath = dbdata_snapshot_fpath.resolve() + save_file(dbgym_cfg, dbdata_snapshot_real_fpath) + dbdata_dpath = dbdata_parent_dpath / "dbdata" + # Make the parent dir and the dbdata dir. Note how we require that dbdata_dpath does not exist while it's ok if the parent does.
+ dbdata_parent_dpath.mkdir(parents=True, exist_ok=True) + if dbdata_dpath.exists(): + shutil.rmtree(dbdata_dpath) + dbdata_dpath.mkdir(parents=False, exist_ok=False) + subprocess_run(f"tar -xzf {dbdata_snapshot_real_fpath} -C {dbdata_dpath}") + return dbdata_dpath class EmbeddingDatagenGenericArgs: @@ -286,15 +286,15 @@ class EmbeddingDatagenGenericArgs: I wanted to make multiple classes instead of just one to conceptually separate the different args """ - def __init__(self, benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_pgdata_snapshot_path, pgdata_parent_dpath): + def __init__(self, benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath): self.benchmark_name = benchmark_name self.workload_name = workload_name self.scale_factor = scale_factor self.benchmark_config_path = benchmark_config_path self.seed = seed self.workload_path = workload_path - self.pristine_pgdata_snapshot_path = pristine_pgdata_snapshot_path - self.pgdata_parent_dpath = pgdata_parent_dpath + self.pristine_dbdata_snapshot_path = pristine_dbdata_snapshot_path + self.dbdata_parent_dpath = dbdata_parent_dpath class EmbeddingDirGenArgs: diff --git a/tune/protox/env/pg_env.py b/tune/protox/env/pg_env.py index 62fa92b8..92236519 100644 --- a/tune/protox/env/pg_env.py +++ b/tune/protox/env/pg_env.py @@ -220,8 +220,8 @@ def step_before_execution(self, action: HolonAction) -> Tuple[bool, EnvInfoDict] # Get the prior state. prior_state = copy.deepcopy(self.state_container) # Save the old configuration file. - old_conf_path = f"{self.pg_conn.pgdata_dpath}/postgresql.auto.conf" - conf_path = f"{self.pg_conn.pgdata_dpath}/postgresql.auto.old" + old_conf_path = f"{self.pg_conn.dbdata_dpath}/postgresql.auto.conf" + conf_path = f"{self.pg_conn.dbdata_dpath}/postgresql.auto.old" local["cp"][old_conf_path, conf_path].run() # Figure out what we have to change to get to the new configuration. @@ -421,8 +421,8 @@ def attempt_checkpoint(conn_str: str) -> None: def close(self) -> None: self.pg_conn.shutdown_postgres() # This file may not be in in [workspace]/tmp/, so it's important to delete it - local["rm"]["-rf", self.pg_conn.pgdata_dpath].run() + local["rm"]["-rf", self.pg_conn.dbdata_dpath].run() # Even though these files get deleted because [workspace]/tmp/ gets deleted, # we'll just delete them here anyways because why not - local["rm"]["-f", self.pg_conn.checkpoint_pgdata_snapshot_fpath].run() - local["rm"]["-f", f"{self.pg_conn.checkpoint_pgdata_snapshot_fpath}.tmp"].run() + local["rm"]["-f", self.pg_conn.checkpoint_dbdata_snapshot_fpath].run() + local["rm"]["-f", f"{self.pg_conn.checkpoint_dbdata_snapshot_fpath}.tmp"].run() diff --git a/tune/protox/env/util/pg_conn.py b/tune/protox/env/util/pg_conn.py index 3a4f0207..69b2c701 100644 --- a/tune/protox/env/util/pg_conn.py +++ b/tune/protox/env/util/pg_conn.py @@ -2,7 +2,7 @@ At a high level, this file's goal is to provide helpers to manage a Postgres instance during agent tuning. On the other hand, the goal of dbms.postgres.cli is to (1) install+build postgres and (2) - create pgdata. + create dbdata. util.pg provides helpers used by *both* of the above files (as well as other files). 
""" import os @@ -28,8 +28,8 @@ def __init__( self, dbgym_cfg: DBGymConfig, pgport: int, - pristine_pgdata_snapshot_fpath: Path, - pgdata_parent_dpath: Path, + pristine_dbdata_snapshot_fpath: Path, + dbdata_parent_dpath: Path, pgbin_path: Union[str, Path], connect_timeout: int, enable_boot: bool, @@ -46,20 +46,20 @@ def __init__( self.log_step = 0 self.logger = logger - # All the paths related to pgdata - # pristine_pgdata_snapshot_fpath is the .tgz snapshot that represents the starting state + # All the paths related to dbdata + # pristine_dbdata_snapshot_fpath is the .tgz snapshot that represents the starting state # of the database (with the default configuration). It is generated by a call to # `python tune.py dbms postgres ...` and should not be overwritten. - self.pristine_pgdata_snapshot_fpath = pristine_pgdata_snapshot_fpath - # checkpoint_pgdata_snapshot_fpath is the .tgz snapshot that represents the current + self.pristine_dbdata_snapshot_fpath = pristine_dbdata_snapshot_fpath + # checkpoint_dbdata_snapshot_fpath is the .tgz snapshot that represents the current # state of the database as it is being tuned. It is generated while tuning and is # discarded once tuning is completed. - self.checkpoint_pgdata_snapshot_fpath = dbgym_cfg.dbgym_tmp_path / "checkpoint_pgdata.tgz" - # pgdata_parent_dpath is the parent directory of the pgdata that is *actively being tuned*. - # Setting this lets us control the hardware device pgdata is built on (e.g. HDD vs. SSD). - self.pgdata_parent_dpath = pgdata_parent_dpath - # pgdata_dpath is the pgdata that is *actively being tuned* - self.pgdata_dpath = self.pgdata_parent_dpath / f"pgdata{self.pgport}" + self.checkpoint_dbdata_snapshot_fpath = dbgym_cfg.dbgym_tmp_path / "checkpoint_dbdata.tgz" + # dbdata_parent_dpath is the parent directory of the dbdata that is *actively being tuned*. + # Setting this lets us control the hardware device dbdata is built on (e.g. HDD vs. SSD). + self.dbdata_parent_dpath = dbdata_parent_dpath + # dbdata_dpath is the dbdata that is *actively being tuned* + self.dbdata_dpath = self.dbdata_parent_dpath / f"dbdata{self.pgport}" self._conn: Optional[psycopg.Connection[Any]] = None @@ -92,13 +92,13 @@ def move_log(self) -> None: def shutdown_postgres(self) -> None: """Shuts down postgres.""" self.disconnect() - if not Path(self.pgdata_dpath).exists(): + if not Path(self.dbdata_dpath).exists(): return while True: self.logger.get_logger(__name__).debug("Shutting down postgres...") _, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ - "stop", "--wait", "-t", "180", "-D", self.pgdata_dpath + "stop", "--wait", "-t", "180", "-D", self.dbdata_dpath ].run(retcode=None) time.sleep(1) self.logger.get_logger(__name__).debug( @@ -115,7 +115,7 @@ def shutdown_postgres(self) -> None: DBGYM_POSTGRES_DBNAME, ].run(retcode=None) - exists = (Path(self.pgdata_dpath) / "postmaster.pid").exists() + exists = (Path(self.dbdata_dpath) / "postmaster.pid").exists() if not exists and retcode != 0: break @@ -127,7 +127,7 @@ def start_with_changes( save_checkpoint: bool = False, ) -> bool: """ - This function assumes that some snapshot has already been untarred into self.pgdata_dpath + This function assumes that some snapshot has already been untarred into self.dbdata_dpath """ # Install the new configuration changes. if conf_changes is not None: @@ -135,11 +135,11 @@ def start_with_changes( # This way of doing it works for both single or multiple libraries. 
An example of a way # that *doesn't* work is `f"shared_preload_libraries='"{SHARED_PRELOAD_LIBRARIES}"'"` conf_changes.append(f"shared_preload_libraries='{SHARED_PRELOAD_LIBRARIES}'") - pgdata_auto_conf_path = self.pgdata_dpath / "postgresql.auto.conf" - with open(pgdata_auto_conf_path, "w") as f: + dbdata_auto_conf_path = self.dbdata_dpath / "postgresql.auto.conf" + with open(dbdata_auto_conf_path, "w") as f: f.write("\n".join(conf_changes)) save_auto_conf_path = self.dbgym_cfg.cur_task_runs_data_path(".", mkdir=True) / "postgresql.auto.conf" - local["cp"][pgdata_auto_conf_path, save_auto_conf_path].run() + local["cp"][dbdata_auto_conf_path, save_auto_conf_path].run() link_result(self.dbgym_cfg, save_auto_conf_path) # Start postgres instance. @@ -151,14 +151,14 @@ def start_with_changes( "cf", # We append .tmp so that if we fail in the *middle* of running tar, we # still have the previous checkpoint available to us - f"{self.checkpoint_pgdata_snapshot_fpath}.tmp", + f"{self.checkpoint_dbdata_snapshot_fpath}.tmp", "-C", - parent_dir(self.pgdata_dpath), - self.pgdata_dpath, + parent_dir(self.dbdata_dpath), + self.dbdata_dpath, ].run() # Make sure the PID lock file doesn't exist. - pid_lock = Path(f"{self.pgdata_dpath}/postmaster.pid") + pid_lock = Path(f"{self.dbdata_dpath}/postmaster.pid") assert not pid_lock.exists() if dump_page_cache: @@ -170,7 +170,7 @@ def start_with_changes( # Try starting up. retcode, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ "-D", - self.pgdata_dpath, + self.dbdata_dpath, "--wait", "-t", "180", @@ -241,7 +241,7 @@ def start_with_changes( # Move the temporary over since we now know the temporary can load. if save_checkpoint: - shutil.move(f"{self.pgdata_dpath}.tgz.tmp", f"{self.pgdata_dpath}.tgz") + shutil.move(f"{self.dbdata_dpath}.tgz.tmp", f"{self.dbdata_dpath}.tgz") return True @@ -258,20 +258,20 @@ def _set_up_boot(self, intelligent_cache: bool, early_stop: bool, seq_sample: bo # If any of these commands fail, they'll throw a Python exception # Thus, if none of them throw an exception, we know they passed self.logger.get_logger(__name__).debug("Setting up boot") - self.conn().execute("DROP EXTENSION IF EXISTS bytejack") - self.conn().execute("CREATE EXTENSION IF NOT EXISTS bytejack") - self.conn().execute("SELECT bytejack_connect()") - self.conn().execute("SELECT bytejack_cache_clear()") - self.conn().execute("SET bytejack.enable=true") - self.conn().execute("SET bytejack.intercept_explain_analyze=true") - self.conn().execute(f"SET bytejack.intelligent_cache={intelligent_cache}") - self.conn().execute(f"SET bytejack.early_stop={early_stop}") - self.conn().execute(f"SET bytejack.seq_sample={seq_sample}") - self.conn().execute(f"SET bytejack.seq_sample_pct={seq_sample_pct}") - self.conn().execute(f"SET bytejack.seq_sample_seed={seq_sample_seed}") - self.conn().execute(f"SET bytejack.mu_hyp_opt={mu_hyp_opt}") - self.conn().execute(f"SET bytejack.mu_hyp_time={mu_hyp_time}") - self.conn().execute(f"SET bytejack.mu_hyp_stdev={mu_hyp_stdev}") + self.conn().execute("DROP EXTENSION IF EXISTS boot") + self.conn().execute("CREATE EXTENSION IF NOT EXISTS boot") + self.conn().execute("SELECT boot_connect()") + self.conn().execute("SELECT boot_cache_clear()") + self.conn().execute("SET boot.enable=true") + self.conn().execute("SET boot.intercept_explain_analyze=true") + self.conn().execute(f"SET boot.intelligent_cache={intelligent_cache}") + self.conn().execute(f"SET boot.early_stop={early_stop}") + self.conn().execute(f"SET boot.seq_sample={seq_sample}") + 
self.conn().execute(f"SET boot.seq_sample_pct={seq_sample_pct}") + self.conn().execute(f"SET boot.seq_sample_seed={seq_sample_seed}") + self.conn().execute(f"SET boot.mu_hyp_opt={mu_hyp_opt}") + self.conn().execute(f"SET boot.mu_hyp_time={mu_hyp_time}") + self.conn().execute(f"SET boot.mu_hyp_stdev={mu_hyp_stdev}") self.logger.get_logger(__name__).debug("Set up boot") @time_record("psql") @@ -332,29 +332,29 @@ def cancel_fn(conn_str: str) -> None: return 0, None def restore_pristine_snapshot(self): - self._restore_snapshot(self.pristine_pgdata_snapshot_fpath) + self._restore_snapshot(self.pristine_dbdata_snapshot_fpath) def restore_checkpointed_snapshot(self): - self._restore_snapshot(self.checkpoint_pgdata_snapshot_fpath) + self._restore_snapshot(self.checkpoint_dbdata_snapshot_fpath) @time_record("restore") def _restore_snapshot( - self, pgdata_snapshot_path: Path, + self, dbdata_snapshot_path: Path, ) -> bool: self.shutdown_postgres() - local["rm"]["-rf", self.pgdata_dpath].run() - local["mkdir"]["-m", "0700", "-p", self.pgdata_dpath].run() + local["rm"]["-rf", self.dbdata_dpath].run() + local["mkdir"]["-m", "0700", "-p", self.dbdata_dpath].run() - # Strip the "pgdata" so we can implant directly into the target pgdata_dpath. - assert pgdata_snapshot_path.exists() + # Strip the "dbdata" so we can implant directly into the target dbdata_dpath. + assert dbdata_snapshot_path.exists() local["tar"][ - "xf", pgdata_snapshot_path, "-C", self.pgdata_dpath, "--strip-components", "1" + "xf", dbdata_snapshot_path, "-C", self.dbdata_dpath, "--strip-components", "1" ].run() # Imprint the required port. ( (local["echo"][f"port={self.pgport}"]) - >> f"{self.pgdata_dpath}/postgresql.conf" + >> f"{self.dbdata_dpath}/postgresql.conf" )() return self.start_with_changes(conf_changes=None) diff --git a/util/pg.py b/util/pg.py index 469f5660..ee45772d 100644 --- a/util/pg.py +++ b/util/pg.py @@ -12,7 +12,7 @@ DBGYM_POSTGRES_DBNAME = "dbgym" DEFAULT_POSTGRES_DBNAME = "postgres" DEFAULT_POSTGRES_PORT = 5432 -SHARED_PRELOAD_LIBRARIES = "bytejack,pg_hint_plan,pg_prewarm" +SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" def conn_execute(conn: Connection, sql: str) -> CursorResult: