diff --git a/.github/workflows/tests_ci.yml b/.github/workflows/tests_ci.yml index f2f151f9..948b2a31 100644 --- a/.github/workflows/tests_ci.yml +++ b/.github/workflows/tests_ci.yml @@ -6,7 +6,7 @@ on: branches: [main] jobs: - build: + ci: runs-on: self-hosted steps: @@ -32,12 +32,16 @@ jobs: run: | ./dependencies/install_dependencies.sh + - name: Check formatting + run: | + ./scripts/check_format.sh + - name: Run unit tests run: | . "$HOME/.cargo/env" python scripts/run_unit_tests.py - - name: Run integration test + - name: Run integration tests # Delete the workspace. Run once with a clean workspace. Run again from the existing workspace. # Need to run with a non-root user in order to start Postgres. run: | diff --git a/benchmark/tpch/cli.py b/benchmark/tpch/cli.py index ac9a11a4..975fd769 100644 --- a/benchmark/tpch/cli.py +++ b/benchmark/tpch/cli.py @@ -4,9 +4,14 @@ import click -from misc.utils import DBGymConfig, get_scale_factor_string, link_result, workload_name_fn -from util.shell import subprocess_run +from misc.utils import ( + DBGymConfig, + get_scale_factor_string, + link_result, + workload_name_fn, +) from util.pg import * +from util.shell import subprocess_run benchmark_tpch_logger = logging.getLogger("benchmark/tpch") benchmark_tpch_logger.setLevel(logging.INFO) @@ -29,8 +34,18 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float): @tpch_group.command(name="workload") -@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).") -@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).") +@click.option( + "--seed-start", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", +) +@click.option( + "--seed-end", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", +) @click.option( "--query-subset", type=click.Choice(["all", "even", "odd"]), @@ -45,7 +60,9 @@ def tpch_workload( query_subset: str, scale_factor: float, ): - assert seed_start <= seed_end, f'seed_start ({seed_start}) must be <= seed_end ({seed_end})' + assert ( + seed_start <= seed_end + ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" _clone(dbgym_cfg) _generate_queries(dbgym_cfg, seed_start, seed_end, scale_factor) _generate_workload(dbgym_cfg, seed_start, seed_end, query_subset, scale_factor) @@ -56,7 +73,9 @@ def _get_queries_dname(seed: int, scale_factor: float) -> str: def _clone(dbgym_cfg: DBGymConfig): - expected_symlink_dpath = dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" + expected_symlink_dpath = ( + dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link" + ) if expected_symlink_dpath.exists(): benchmark_tpch_logger.info(f"Skipping clone: {expected_symlink_dpath}") return @@ -73,22 +92,32 @@ def _clone(dbgym_cfg: DBGymConfig): def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path: tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve() - assert tpch_kit_dpath.exists() and tpch_kit_dpath.is_absolute() and not tpch_kit_dpath.is_symlink() + assert ( + tpch_kit_dpath.exists() + and tpch_kit_dpath.is_absolute() + and not tpch_kit_dpath.is_symlink() + ) return tpch_kit_dpath -def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float): +def _generate_queries( + dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float +): tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg) data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True) benchmark_tpch_logger.info( f"Generating queries: {data_path} [{seed_start}, {seed_end}]" ) for seed in range(seed_start, seed_end + 1): - expected_queries_symlink_dpath = data_path / (_get_queries_dname(seed, scale_factor) + ".link") + expected_queries_symlink_dpath = data_path / ( + _get_queries_dname(seed, scale_factor) + ".link" + ) if expected_queries_symlink_dpath.exists(): continue - real_dir = dbgym_cfg.cur_task_runs_data_path(_get_queries_dname(seed, scale_factor), mkdir=True) + real_dir = dbgym_cfg.cur_task_runs_data_path( + _get_queries_dname(seed, scale_factor), mkdir=True + ) for i in range(1, 22 + 1): target_sql = (real_dir / f"{i}.sql").resolve() subprocess_run( @@ -106,16 +135,20 @@ def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, sc def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float): tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg) data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True) - expected_tables_symlink_dpath = data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link" + expected_tables_symlink_dpath = ( + data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link" + ) if expected_tables_symlink_dpath.exists(): - benchmark_tpch_logger.info(f"Skipping generation: {expected_tables_symlink_dpath}") + benchmark_tpch_logger.info( + f"Skipping generation: {expected_tables_symlink_dpath}" + ) return benchmark_tpch_logger.info(f"Generating: {expected_tables_symlink_dpath}") - subprocess_run( - f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen" + subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen") + real_dir = dbgym_cfg.cur_task_runs_data_path( + f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True ) - real_dir = dbgym_cfg.cur_task_runs_data_path(f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True) subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen") tables_symlink_dpath = link_result(dbgym_cfg, real_dir) @@ -135,9 +168,7 @@ def _generate_workload( expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link") benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}") - real_dpath = dbgym_cfg.cur_task_runs_data_path( - workload_name, mkdir=True - ) + real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True) queries = None if query_subset == "all": @@ -150,12 +181,19 @@ def _generate_workload( with open(real_dpath / "order.txt", "w") as f: for seed in range(seed_start, seed_end + 1): for qnum in queries: - sql_fpath = (symlink_data_dpath / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql" - assert sql_fpath.exists() and not sql_fpath.is_symlink() and sql_fpath.is_absolute(), "We should only write existent real absolute paths to a file" + sql_fpath = ( + symlink_data_dpath + / (_get_queries_dname(seed, scale_factor) + ".link") + ).resolve() / f"{qnum}.sql" + assert ( + sql_fpath.exists() + and not sql_fpath.is_symlink() + and sql_fpath.is_absolute() + ), "We should only write existent real absolute paths to a file" output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)]) print(output, file=f) # TODO(WAN): add option to deep-copy the workload. - + workload_symlink_dpath = link_result(dbgym_cfg, real_dpath) assert workload_symlink_dpath == expected_workload_symlink_dpath benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}") diff --git a/benchmark/tpch/load_info.py b/benchmark/tpch/load_info.py index 8db2f0b4..2c84ac2b 100644 --- a/benchmark/tpch/load_info.py +++ b/benchmark/tpch/load_info.py @@ -1,7 +1,6 @@ from dbms.load_info_base_class import LoadInfoBaseClass from misc.utils import DBGymConfig, get_scale_factor_string - TPCH_SCHEMA_FNAME = "tpch_schema.sql" TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" @@ -39,11 +38,17 @@ def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float): ), f"self._constraints_fpath ({self._constraints_fpath}) does not exist" # tables - data_root_dpath = dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" - tables_symlink_dpath = data_root_dpath / f"tables_sf{get_scale_factor_string(scale_factor)}.link" + data_root_dpath = ( + dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data" + ) + tables_symlink_dpath = ( + data_root_dpath / f"tables_sf{get_scale_factor_string(scale_factor)}.link" + ) tables_dpath = tables_symlink_dpath.resolve() assert ( - tables_dpath.exists() and tables_dpath.is_absolute() and not tables_dpath.is_symlink() + tables_dpath.exists() + and tables_dpath.is_absolute() + and not tables_dpath.is_symlink() ), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data" self._tables_and_fpaths = [] for table in TpchLoadInfo.TABLES: diff --git a/dbms/postgres/cli.py b/dbms/postgres/cli.py index 22789140..140f7e7c 100644 --- a/dbms/postgres/cli.py +++ b/dbms/postgres/cli.py @@ -4,20 +4,42 @@ a Postgres instance during agent tuning. util.pg provides helpers used by *both* of the above files (as well as other files). """ + import logging import os import shutil import subprocess from pathlib import Path + import click +from sqlalchemy import Connection from benchmark.tpch.load_info import TpchLoadInfo from dbms.load_info_base_class import LoadInfoBaseClass -from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd +from misc.utils import ( + WORKSPACE_PATH_PLACEHOLDER, + DBGymConfig, + conv_inputpath_to_realabspath, + default_dbdata_parent_dpath, + default_pgbin_path, + get_dbdata_tgz_name, + is_ssd, + link_result, + open_and_save, + save_file, +) +from util.pg import ( + DBGYM_POSTGRES_DBNAME, + DBGYM_POSTGRES_PASS, + DBGYM_POSTGRES_USER, + DEFAULT_POSTGRES_DBNAME, + DEFAULT_POSTGRES_PORT, + SHARED_PRELOAD_LIBRARIES, + conn_execute, + create_conn, + sql_file_execute, +) from util.shell import subprocess_run -from sqlalchemy import Connection -from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME - dbms_postgres_logger = logging.getLogger("dbms/postgres") dbms_postgres_logger.setLevel(logging.INFO) @@ -34,7 +56,11 @@ def postgres_group(dbgym_cfg: DBGymConfig): help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.", ) @click.pass_obj -@click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.") +@click.option( + "--rebuild", + is_flag=True, + help="Include this flag to rebuild Postgres even if it already exists.", +) def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool): _build_repo(dbgym_cfg, rebuild) @@ -46,7 +72,12 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool): @click.pass_obj @click.argument("benchmark_name", type=str) @click.option("--scale-factor", type=float, default=1) -@click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.") +@click.option( + "--pgbin-path", + type=Path, + default=None, + help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.", +) @click.option( "--intended-dbdata-hardware", type=click.Choice(["hdd", "ssd"]), @@ -59,12 +90,21 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool): type=Path, help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.", ) -def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path): +def postgres_dbdata( + dbgym_cfg: DBGymConfig, + benchmark_name: str, + scale_factor: float, + pgbin_path: Path, + intended_dbdata_hardware: str, + dbdata_parent_dpath: Path, +): # Set args to defaults programmatically (do this before doing anything else in the function) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) if dbdata_parent_dpath == None: - dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + dbdata_parent_dpath = default_dbdata_parent_dpath( + dbgym_cfg.dbgym_workspace_path + ) # Convert all input paths to absolute paths pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) @@ -72,18 +112,26 @@ def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: f # Check assertions on args if intended_dbdata_hardware == "hdd": - assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + assert not is_ssd( + dbdata_parent_dpath + ), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" elif intended_dbdata_hardware == "ssd": - assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" + assert is_ssd( + dbdata_parent_dpath + ), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False # Create dbdata - _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath) + _create_dbdata( + dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath + ) def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path: - return dbgym_cfg.cur_symlinks_build_path("repo.link", "boot", "build", "postgres", "bin") + return dbgym_cfg.cur_symlinks_build_path( + "repo.link", "boot", "build", "postgres", "bin" + ) def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path: @@ -93,7 +141,9 @@ def _get_repo_symlink_path(dbgym_cfg: DBGymConfig) -> Path: def _build_repo(dbgym_cfg: DBGymConfig, rebuild): expected_repo_symlink_dpath = _get_repo_symlink_path(dbgym_cfg) if not rebuild and expected_repo_symlink_dpath.exists(): - dbms_postgres_logger.info(f"Skipping _build_repo: {expected_repo_symlink_dpath}") + dbms_postgres_logger.info( + f"Skipping _build_repo: {expected_repo_symlink_dpath}" + ) return dbms_postgres_logger.info(f"Setting up repo in {expected_repo_symlink_dpath}") @@ -108,7 +158,13 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild): dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}") -def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None: +def _create_dbdata( + dbgym_cfg: DBGymConfig, + benchmark_name: str, + scale_factor: float, + pgbin_path: Path, + dbdata_parent_dpath: Path, +) -> None: """ I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This is because, while the generated data is deterministic given benchmark_name and scale_factor, any @@ -177,7 +233,7 @@ def _generic_dbdata_setup(dbgym_cfg: DBGymConfig): subprocess_run( # You have to use TO and you can't put single quotes around the libraries (https://postgrespro.com/list/thread-id/2580120) # The method I wrote here works for both one library and multiple libraries - f"./psql -c \"ALTER SYSTEM SET shared_preload_libraries TO {SHARED_PRELOAD_LIBRARIES};\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", + f'./psql -c "ALTER SYSTEM SET shared_preload_libraries TO {SHARED_PRELOAD_LIBRARIES};" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', cwd=pgbin_real_dpath, ) @@ -203,7 +259,9 @@ def _load_benchmark_into_dbdata( _load_into_dbdata(dbgym_cfg, conn, load_info) -def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass): +def _load_into_dbdata( + dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass +): sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath()) # truncate all tables first before even loading a single one @@ -222,7 +280,9 @@ def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI sql_file_execute(dbgym_cfg, conn, constraints_fpath) -def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None: +def start_postgres( + dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path +) -> None: _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True) @@ -230,7 +290,9 @@ def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False) -def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None: +def _start_or_stop_postgres( + dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool +) -> None: # They should be absolute paths and should exist assert pgbin_path.is_absolute() and pgbin_path.exists() assert dbdata_dpath.is_absolute() and dbdata_dpath.exists() @@ -244,7 +306,14 @@ def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpa # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None. # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do. - result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True) + result = subprocess.run( + f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", + cwd=pgbin_real_dpath, + shell=True, + ) result.check_returncode() else: - subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath) \ No newline at end of file + subprocess_run( + f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", + cwd=pgbin_real_dpath, + ) diff --git a/manage/cli.py b/manage/cli.py index 3ce8c65a..3f3cba2e 100644 --- a/manage/cli.py +++ b/manage/cli.py @@ -1,13 +1,14 @@ +import logging +import os import shutil +from itertools import chain +from pathlib import Path from typing import List, Set + import click import yaml -import logging -from pathlib import Path -from misc.utils import DBGymConfig, is_child_path, parent_dpath_of_path -from itertools import chain -import os +from misc.utils import DBGymConfig, is_child_path, parent_dpath_of_path task_logger = logging.getLogger("task") task_logger.setLevel(logging.INFO) @@ -85,7 +86,7 @@ def manage_standardize(dbgym_cfg): "--mode", type=click.Choice(["safe", "aggressive"]), default="safe", - help="The mode to clean the workspace (default=\"safe\"). \"aggressive\" means \"only keep run_*/ folders referenced by a file in symlinks/\". \"safe\" means \"in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping.\"" + help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."', ) def manage_clean(dbgym_cfg: DBGymConfig, mode: str): clean_workspace(dbgym_cfg, mode=mode, verbose=True) @@ -95,10 +96,14 @@ def manage_clean(dbgym_cfg: DBGymConfig, mode: str): @click.pass_obj def manage_count(dbgym_cfg: DBGymConfig): num_files = _count_files_in_workspace(dbgym_cfg) - print(f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks.") + print( + f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks." + ) -def add_symlinks_in_dpath(symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path]) -> None: +def add_symlinks_in_dpath( + symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path] +) -> None: """ Will modify symlinks_stack and processed_symlinks. """ @@ -117,17 +122,21 @@ def _count_files_in_workspace(dbgym_cfg: DBGymConfig) -> int: Counts the number of files (regular file or dir or symlink) in the workspace. """ total_count = 0 - for dirpath, dirnames, filenames in os.walk(dbgym_cfg.dbgym_workspace_path, followlinks=False): + for dirpath, dirnames, filenames in os.walk( + dbgym_cfg.dbgym_workspace_path, followlinks=False + ): # Check if any of the directories are symbolic links and remove them from dirnames - dirnames[:] = [d for d in dirnames if not os.path.islink(os.path.join(dirpath, d))] - + dirnames[:] = [ + d for d in dirnames if not os.path.islink(os.path.join(dirpath, d)) + ] + # Count files and directories (non-symlink directories already filtered) total_count += len(filenames) + len(dirnames) return total_count -def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> None: +def clean_workspace(dbgym_cfg: DBGymConfig, mode: str = "safe", verbose=False) -> None: """ Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks". If mode is "aggressive", "active symlinks" means *only* the symlinks directly in [workspace]/symlinks/. @@ -141,7 +150,9 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> # 1. Initialize paths to process if dbgym_cfg.dbgym_symlinks_path.exists(): - add_symlinks_in_dpath(symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks) + add_symlinks_in_dpath( + symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks + ) # 2. Go through symlinks, figuring out which "children of task runs" to keep # Based on the rules of the framework, "children of task runs" should be run_*/ directories. @@ -159,7 +170,9 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> # processing on the result of os.readlink() to convert it to an absolute path real_fordpath = symlink_fpath.resolve() one_layer_resolved_fordpath = os.readlink(symlink_fpath) - assert str(real_fordpath) == str(os.readlink(symlink_fpath)), f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." + assert str(real_fordpath) == str( + os.readlink(symlink_fpath) + ), f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." # If the file doesn't exist, we'll just ignore it. if not real_fordpath.exists(): @@ -183,15 +196,25 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> # However, as with above, we won't just nuke files if the workspace doesn't follow this rule for # some reason. task_run_child_fordpath = real_fordpath - while not parent_dpath_of_path(task_run_child_fordpath).samefile(dbgym_cfg.dbgym_runs_path): - task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath) + while not parent_dpath_of_path(task_run_child_fordpath).samefile( + dbgym_cfg.dbgym_runs_path + ): + task_run_child_fordpath = parent_dpath_of_path( + task_run_child_fordpath + ) assert task_run_child_fordpath != None - assert parent_dpath_of_path(task_run_child_fordpath).samefile(dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path" + assert parent_dpath_of_path(task_run_child_fordpath).samefile( + dbgym_cfg.dbgym_runs_path + ), f"task_run_child_fordpath ({task_run_child_fordpath}) is not a direct child of dbgym_cfg.dbgym_runs_path" task_run_child_fordpaths_to_keep.add(task_run_child_fordpath) - + # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed if mode == "safe": - add_symlinks_in_dpath(symlink_fpaths_to_process, task_run_child_fordpath, processed_symlinks) + add_symlinks_in_dpath( + symlink_fpaths_to_process, + task_run_child_fordpath, + processed_symlinks, + ) # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep # It's true that symlinks might link outside of task_runs/*. We'll just not care about those @@ -206,12 +229,16 @@ def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> ending_num_files = _count_files_in_workspace(dbgym_cfg) if verbose: - task_logger.info(f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files") - task_logger.info(f"Workspace went from {starting_num_files - ending_num_files} to {starting_num_files}") + task_logger.info( + f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files" + ) + task_logger.info( + f"Workspace went from {starting_num_files - ending_num_files} to {starting_num_files}" + ) manage_group.add_command(manage_show) manage_group.add_command(manage_write) manage_group.add_command(manage_standardize) manage_group.add_command(manage_clean) -manage_group.add_command(manage_count) \ No newline at end of file +manage_group.add_command(manage_count) diff --git a/manage/tests/test_clean.py b/manage/tests/test_clean.py index 2df33d2d..2ba24249 100644 --- a/manage/tests/test_clean.py +++ b/manage/tests/test_clean.py @@ -1,13 +1,16 @@ +import copy import logging -from pathlib import Path -import unittest import os import shutil -import copy +import unittest +from pathlib import Path -from misc.utils import get_symlinks_path_from_workspace_path, get_runs_path_from_workspace_path, path_exists_dont_follow_symlinks from manage.cli import clean_workspace - +from misc.utils import ( + get_runs_path_from_workspace_path, + get_symlinks_path_from_workspace_path, + path_exists_dont_follow_symlinks, +) # This is here instead of on `if __name__ == "__main__"` because we often run individual tests, which # does not go through the `if __name__ == "__main__"` codepath. @@ -18,7 +21,9 @@ class MockDBGymConfig: def __init__(self, scratchspace_path: Path): self.dbgym_workspace_path = scratchspace_path - self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path(scratchspace_path) + self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path( + scratchspace_path + ) self.dbgym_runs_path = get_runs_path_from_workspace_path(scratchspace_path) @@ -27,12 +32,15 @@ class CleanTests(unittest.TestCase): I deemed "clean" important enough to write extensive unit tests for because a bug could lead to losing important files. """ + @staticmethod def create_structure(root_path: Path, structure: dict) -> None: - def create_structure_internal(root_path: Path, cur_path: Path, structure: dict) -> None: + def create_structure_internal( + root_path: Path, cur_path: Path, structure: dict + ) -> None: for path, content in structure.items(): full_path: Path = cur_path / path - + if isinstance(content, dict): # Directory full_path.mkdir(parents=True, exist_ok=True) create_structure_internal(root_path, full_path, content) @@ -45,13 +53,15 @@ def create_structure_internal(root_path: Path, cur_path: Path, structure: dict) os.symlink(target_path, full_path) else: raise ValueError(f"Unsupported type for path ({path}): {content}") - + root_path.mkdir(parents=True, exist_ok=True) create_structure_internal(root_path, root_path, structure) - + @staticmethod def verify_structure(root_path: Path, structure: dict) -> bool: - def verify_structure_internal(root_path: Path, cur_path: Path, structure: dict) -> bool: + def verify_structure_internal( + root_path: Path, cur_path: Path, structure: dict + ) -> bool: # Check for the presence of each item specified in the structure for name, item in structure.items(): new_cur_path = cur_path / name @@ -76,16 +86,20 @@ def verify_structure_internal(root_path: Path, cur_path: Path, structure: dict) if item[1] != None: expected_target = root_path / item[1] if not new_cur_path.resolve().samefile(expected_target): - logging.debug(f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}") + logging.debug( + f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" + ) return False else: assert False, "structure misconfigured" - + # Check for any extra files or directories not described by the structure expected_names = set(structure.keys()) actual_names = {entry.name for entry in cur_path.iterdir()} if not expected_names.issuperset(actual_names): - logging.debug(f"expected_names={expected_names}, actual_names={actual_names}") + logging.debug( + f"expected_names={expected_names}, actual_names={actual_names}" + ) return False return True @@ -96,7 +110,9 @@ def verify_structure_internal(root_path: Path, cur_path: Path, structure: dict) return verify_structure_internal(root_path, root_path, structure) @staticmethod - def make_workspace_structure(symlinks_structure: dict, task_runs_structure: dict) -> dict: + def make_workspace_structure( + symlinks_structure: dict, task_runs_structure: dict + ) -> dict: """ This function exists so that it's easier to refactor the tests in case we ever change how the workspace is organized. @@ -105,7 +121,7 @@ def make_workspace_structure(symlinks_structure: dict, task_runs_structure: dict "symlinks": symlinks_structure, "task_runs": task_runs_structure, } - + @classmethod def setUpClass(cls): cls.scratchspace_path = Path.cwd() / "manage/tests/test_clean_scratchspace/" @@ -120,271 +136,275 @@ def tearDown(self): def test_structure_helpers(self): structure = { - "dir1": { - "file1.txt": ("file",), - "dir2": { - "file2.txt": ("file",) - } - }, - "dir3": { - "nested_link_to_dir1": ("symlink", "dir1") - }, + "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, + "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, "link_to_dir1": ("symlink", "dir1"), - "link_to_file2": ("symlink", "dir1/dir2/file2.txt") + "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), } CleanTests.create_structure(self.scratchspace_path, structure) self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, structure)) extra_dir_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure) + ) extra_dir_structure["dir4"] = {} - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, extra_dir_structure) + ) missing_dir_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure) + ) del missing_dir_structure["dir1"] - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, missing_dir_structure) + ) extra_file_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, extra_file_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, extra_file_structure) + ) extra_file_structure["file3.txt"] = ("file",) - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, extra_file_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, extra_file_structure) + ) missing_file_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, missing_file_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, missing_file_structure) + ) del missing_file_structure["dir1"]["file1.txt"] - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, missing_file_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, missing_file_structure) + ) extra_link_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, extra_link_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, extra_link_structure) + ) extra_link_structure["link_to_dir3"] = ("symlink", "dir3") - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, extra_link_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, extra_link_structure) + ) missing_link_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, missing_link_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, missing_link_structure) + ) del missing_link_structure["link_to_dir1"] - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, missing_link_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, missing_link_structure) + ) wrong_link_structure = copy.deepcopy(structure) # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure) + ) wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") - self.assertFalse(CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure)) + self.assertFalse( + CleanTests.verify_structure(self.scratchspace_path, wrong_link_structure) + ) def test_nonexistent_workspace(self): clean_workspace(MockDBGymConfig(self.scratchspace_path)) - + def test_no_symlinks_dir_and_no_task_runs_dir(self): starting_structure = {} ending_structure = {} CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) - + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) + def test_no_symlinks_dir_and_yes_task_runs_dir(self): - starting_structure = { - "task_runs": { - "file1.txt": ("file",) - } - } - ending_structure = { - "task_runs": {} - } + starting_structure = {"task_runs": {"file1.txt": ("file",)}} + ending_structure = {"task_runs": {}} CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) - + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) + def test_yes_symlinks_dir_and_no_task_runs_dir(self): - starting_structure = { - "symlinks": {} - } - ending_structure = { - "symlinks": {} - } + starting_structure = {"symlinks": {}} + ending_structure = {"symlinks": {}} CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_no_symlinks_in_dir_and_no_task_runs_in_dir(self): starting_symlinks_structure = {} starting_task_runs_structure = {} - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) ending_symlinks_structure = {} ending_task_runs_structure = {} - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_no_links_in_symlinks(self): starting_symlinks_structure = {} - starting_task_runs_structure = { - "run_0": {} - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_task_runs_structure = {"run_0": {}} + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) ending_symlinks_structure = {} ending_task_runs_structure = {} - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_link_to_file_directly_in_task_runs(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/file1.txt") - } - starting_task_runs_structure = { - "file1.txt": ("file",), - "file2.txt": ("file",) - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/file1.txt") - } - ending_task_runs_structure = { - "file1.txt": ("file",) - } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/file1.txt")} + starting_task_runs_structure = {"file1.txt": ("file",), "file2.txt": ("file",)} + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/file1.txt")} + ending_task_runs_structure = {"file1.txt": ("file",)} + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_link_to_dir_directly_in_task_runs(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { - "dir1": { - "file1.txt": ("file",) - }, - "dir2": { - "file2.txt": ("file",) - } - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } - ending_task_runs_structure = { - "dir1": { - "file1.txt": ("file",) - } - } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + "dir1": {"file1.txt": ("file",)}, + "dir2": {"file2.txt": ("file",)}, + } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} + ending_task_runs_structure = {"dir1": {"file1.txt": ("file",)}} + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_link_to_file_in_dir_in_task_runs(self): starting_symlinks_structure = { "symlink1": ("symlink", "task_runs/dir1/file1.txt") } starting_task_runs_structure = { - "dir1": { - "file1.txt": ("file",) - }, - "dir2": { - "file2.txt": ("file",) - } + "dir1": {"file1.txt": ("file",)}, + "dir2": {"file2.txt": ("file",)}, } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) ending_symlinks_structure = { "symlink1": ("symlink", "task_runs/dir1/file1.txt") } - ending_task_runs_structure = { - "dir1": { - "file1.txt": ("file",) - } - } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_task_runs_structure = {"dir1": {"file1.txt": ("file",)}} + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_link_to_dir_in_dir_in_task_runs(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1/dir2") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1/dir2")} starting_task_runs_structure = { - "dir1": { - "dir2": { - "file1.txt": ("file",) - }, - "file2.txt": ("file",) - }, - "dir3": { - "file3.txt": ("file",) - } - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1/dir2") + "dir1": {"dir2": {"file1.txt": ("file",)}, "file2.txt": ("file",)}, + "dir3": {"file3.txt": ("file",)}, } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1/dir2")} ending_task_runs_structure = { - "dir1": { - "dir2": { - "file1.txt": ("file",) - }, - "file2.txt": ("file",) - }, + "dir1": {"dir2": {"file1.txt": ("file",)}, "file2.txt": ("file",)}, } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path)) - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_link_to_link_crashes(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/symlink2") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/symlink2")} starting_task_runs_structure = { "symlink2": ("symlink", "task_runs/file1.txt"), - "file1.txt": ("file",) + "file1.txt": ("file",), } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) with self.assertRaises(AssertionError): clean_workspace(MockDBGymConfig(self.scratchspace_path)) def test_safe_mode_link_to_dir_with_link(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "task_runs/file1.txt") - }, + "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, "file1.txt": ("file",), - "file2.txt": ("file",) - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") + "file2.txt": ("file",), } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "task_runs/file1.txt") - }, + "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, "file1.txt": ("file",), } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_safe_mode_link_to_file_in_dir_with_link(self): starting_symlinks_structure = { @@ -393,96 +413,98 @@ def test_safe_mode_link_to_file_in_dir_with_link(self): starting_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/file2.txt") + "symlink2": ("symlink", "task_runs/file2.txt"), }, "file2.txt": ("file",), - "file3.txt": ("file",) + "file3.txt": ("file",), } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) ending_symlinks_structure = { "symlink1": ("symlink", "task_runs/dir1/file1.txt") } ending_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/file2.txt") + "symlink2": ("symlink", "task_runs/file2.txt"), }, "file2.txt": ("file",), } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_safe_mode_link_to_dir_with_link_to_file_in_dir_in_task_runs(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "task_runs/dir2/file2.txt") - }, + "dir1": {"symlink2": ("symlink", "task_runs/dir2/file2.txt")}, "dir2": { "file2.txt": ("file",), }, - "file3.txt": ("file",) - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") + "file3.txt": ("file",), } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "task_runs/dir2/file2.txt") - }, + "dir1": {"symlink2": ("symlink", "task_runs/dir2/file2.txt")}, "dir2": { "file2.txt": ("file",), }, } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_aggressive_mode_link_to_dir_with_link(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "task_runs/file1.txt") - }, + "dir1": {"symlink2": ("symlink", "task_runs/file1.txt")}, "file1.txt": ("file",), - "file2.txt": ("file",) - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") + "file2.txt": ("file",), } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", None) - }, + "dir1": {"symlink2": ("symlink", None)}, } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="aggressive") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_link_to_link_to_file_gives_error(self): starting_symlinks_structure = { "symlink1": ("symlink", "task_runs/dir1/symlink2") } starting_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "task_runs/file2.txt") - }, + "dir1": {"symlink2": ("symlink", "task_runs/file2.txt")}, "file2.txt": ("file",), } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) @@ -495,11 +517,11 @@ def test_multi_link_loop_gives_error(self): "symlink1": ("symlink", "task_runs/dir1/symlink2") } starting_task_runs_structure = { - "dir1": { - "symlink2": ("symlink", "symlinks/symlink1") - }, + "dir1": {"symlink2": ("symlink", "symlinks/symlink1")}, } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) @@ -508,11 +530,11 @@ def test_multi_link_loop_gives_error(self): clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") def test_link_self_loop_gives_error(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "symlinks/symlink1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "symlinks/symlink1")} starting_task_runs_structure = dict() - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) @@ -520,140 +542,149 @@ def test_link_self_loop_gives_error(self): with self.assertRaises(RuntimeError): clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + def test_dont_loop_infinitely_if_there_are_cycles_between_different_dirs_in_runs( + self, + ): + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir2/file2.txt") + "symlink2": ("symlink", "task_runs/dir2/file2.txt"), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt") + "symlink2": ("symlink", "task_runs/dir1/file1.txt"), }, } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir2/file2.txt") + "symlink2": ("symlink", "task_runs/dir2/file2.txt"), }, "dir2": { "file2.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt") + "symlink2": ("symlink", "task_runs/dir1/file1.txt"), }, } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) - - def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_itself(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) + + def test_dont_loop_infinitely_if_there_is_a_dir_in_runs_that_links_to_a_file_in_itself( + self, + ): + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt") + "symlink2": ("symlink", "task_runs/dir1/file1.txt"), }, } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt") + "symlink2": ("symlink", "task_runs/dir1/file1.txt"), }, } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_dont_loop_infinitely_if_there_is_loop_amongst_symlinks(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt") + "symlink2": ("symlink", "task_runs/dir1/file1.txt"), }, } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/file1.txt") + "symlink2": ("symlink", "task_runs/dir1/file1.txt"), }, } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_broken_symlink_has_no_effect(self): - starting_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") - } + starting_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} starting_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "task_runs/dir1/non_existent_file.txt") + "symlink2": ("symlink", "task_runs/dir1/non_existent_file.txt"), }, - "dir2": { - "file2.txt": ("file",) - } - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - ending_symlinks_structure = { - "symlink1": ("symlink", "task_runs/dir1") + "dir2": {"file2.txt": ("file",)}, } + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + ending_symlinks_structure = {"symlink1": ("symlink", "task_runs/dir1")} ending_task_runs_structure = { - "dir1": { - "file1.txt": ("file",), - "symlink2": ("symlink", None) - } + "dir1": {"file1.txt": ("file",), "symlink2": ("symlink", None)} } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) # The idea behind this test is that we shouldn't be following links outside of task_runs, even on safe mode - def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save_other_run(self): + def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save_other_run( + self, + ): starting_symlinks_structure = { "symlink1": ("symlink", "task_runs/dir1/file1.txt") } starting_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "external/dir3/file3.txt") + "symlink2": ("symlink", "external/dir3/file3.txt"), }, - "dir2": { - "file2.txt": ("file",) - } + "dir2": {"file2.txt": ("file",)}, } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) starting_structure["external"] = { "dir3": { "file3.txt": ("file",), - "symlink3": ("symlink", "task_runs/dir2/file2.txt") + "symlink3": ("symlink", "task_runs/dir2/file2.txt"), } } ending_symlinks_structure = { @@ -662,41 +693,42 @@ def test_link_to_folder_outside_runs_that_contains_link_to_other_run_doesnt_save ending_task_runs_structure = { "dir1": { "file1.txt": ("file",), - "symlink2": ("symlink", "external/dir3/file3.txt") + "symlink2": ("symlink", "external/dir3/file3.txt"), } } - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) ending_structure["external"] = { - "dir3": { - "file3.txt": ("file",), - "symlink3": ("symlink", None) - } + "dir3": {"file3.txt": ("file",), "symlink3": ("symlink", None)} } CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) def test_outside_task_runs_doesnt_get_deleted(self): starting_symlinks_structure = {} - starting_task_runs_structure = { - "dir1": {} - } - starting_structure = CleanTests.make_workspace_structure(starting_symlinks_structure, starting_task_runs_structure) - starting_structure["external"] = { - "file1.txt": ("file",) - } + starting_task_runs_structure = {"dir1": {}} + starting_structure = CleanTests.make_workspace_structure( + starting_symlinks_structure, starting_task_runs_structure + ) + starting_structure["external"] = {"file1.txt": ("file",)} ending_symlinks_structure = {} ending_task_runs_structure = {} - ending_structure = CleanTests.make_workspace_structure(ending_symlinks_structure, ending_task_runs_structure) - ending_structure["external"] = { - "file1.txt": ("file",) - } + ending_structure = CleanTests.make_workspace_structure( + ending_symlinks_structure, ending_task_runs_structure + ) + ending_structure["external"] = {"file1.txt": ("file",)} CleanTests.create_structure(self.scratchspace_path, starting_structure) clean_workspace(MockDBGymConfig(self.scratchspace_path), mode="safe") - self.assertTrue(CleanTests.verify_structure(self.scratchspace_path, ending_structure)) + self.assertTrue( + CleanTests.verify_structure(self.scratchspace_path, ending_structure) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/misc/utils.py b/misc/utils.py index 24562494..4a78c352 100644 --- a/misc/utils.py +++ b/misc/utils.py @@ -1,19 +1,20 @@ -from enum import Enum import os import shutil import subprocess import sys from datetime import datetime +from enum import Enum from pathlib import Path from typing import Tuple + import click -import yaml import redis +import yaml from util.shell import subprocess_run # Enums -TuningMode = Enum('TuningMode', ['HPO', 'TUNE', 'REPLAY']) +TuningMode = Enum("TuningMode", ["HPO", "TUNE", "REPLAY"]) # Default values DEFAULT_WORKLOAD_TIMEOUT = 600 @@ -35,12 +36,16 @@ # Helper functions that both this file and other files use def get_symlinks_path_from_workspace_path(workspace_path): return workspace_path / "symlinks" + + def get_tmp_path_from_workspace_path(workspace_path): return workspace_path / "tmp" + def get_runs_path_from_workspace_path(workspace_path): return workspace_path / "task_runs" + def get_scale_factor_string(scale_factor: float | str) -> str: assert type(scale_factor) is float or type(scale_factor) is str if scale_factor == SCALE_FACTOR_PLACEHOLDER: @@ -50,7 +55,8 @@ def get_scale_factor_string(scale_factor: float | str) -> str: return str(int(scale_factor)) else: return str(scale_factor).replace(".", "point") - + + def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" @@ -76,7 +82,7 @@ def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: # Generally useful functions workload_name_fn = ( - lambda scale_factor, seed_start, seed_end, query_subset : f"workload_sf{get_scale_factor_string(scale_factor)}_{seed_start}_{seed_end}_{query_subset}" + lambda scale_factor, seed_start, seed_end, query_subset: f"workload_sf{get_scale_factor_string(scale_factor)}_{seed_start}_{seed_end}_{query_subset}" ) # Standard names of files/directories. These can refer to either the actual file/directory or a link to the file/directory. @@ -124,7 +130,9 @@ def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: / (default_embedder_dname(benchmark_name, workload_name) + ".link") ) default_hpoed_agent_params_path = ( - lambda workspace_path, benchmark_name, workload_name: get_symlinks_path_from_workspace_path(workspace_path) + lambda workspace_path, benchmark_name, workload_name: get_symlinks_path_from_workspace_path( + workspace_path + ) / "dbgym_tune_protox_agent" / "data" / (default_hpoed_agent_params_fname(benchmark_name, workload_name) + ".link") @@ -145,22 +153,31 @@ def get_dbdata_tgz_name(benchmark_name: str, scale_factor: float) -> str: / "data" / (get_dbdata_tgz_name(benchmark_name, scale_factor) + ".link") ) -default_dbdata_parent_dpath = ( - lambda workspace_path: get_tmp_path_from_workspace_path( - workspace_path - ) +default_dbdata_parent_dpath = lambda workspace_path: get_tmp_path_from_workspace_path( + workspace_path ) default_pgbin_path = ( - lambda workspace_path: get_symlinks_path_from_workspace_path( - workspace_path - ) - / "dbgym_dbms_postgres" / "build" / "repo.link" / "boot"/ "build" / "postgres" / "bin" + lambda workspace_path: get_symlinks_path_from_workspace_path(workspace_path) + / "dbgym_dbms_postgres" + / "build" + / "repo.link" + / "boot" + / "build" + / "postgres" + / "bin" ) default_tuning_steps_dpath = ( lambda workspace_path, benchmark_name, workload_name, boot_enabled_during_tune: get_symlinks_path_from_workspace_path( workspace_path ) - / "dbgym_tune_protox_agent" / "artifacts" / (default_tuning_steps_dname(benchmark_name, workload_name, boot_enabled_during_tune) + ".link") + / "dbgym_tune_protox_agent" + / "artifacts" + / ( + default_tuning_steps_dname( + benchmark_name, workload_name, boot_enabled_during_tune + ) + + ".link" + ) ) @@ -168,12 +185,15 @@ class DBGymConfig: """ Global configurations that apply to all parts of DB-Gym """ + num_times_created_this_run: int = 0 def __init__(self, dbgym_config_path: Path): # The logic around dbgym_tmp_path assumes that DBGymConfig is only constructed once. DBGymConfig.num_times_created_this_run += 1 - assert DBGymConfig.num_times_created_this_run == 1, f"DBGymConfig has been created {DBGymConfig.num_times_created_this_run} times. It should only be created once per run." + assert ( + DBGymConfig.num_times_created_this_run == 1 + ), f"DBGymConfig has been created {DBGymConfig.num_times_created_this_run} times. It should only be created once per run." assert is_base_git_dir( os.getcwd() @@ -208,7 +228,9 @@ def __init__(self, dbgym_config_path: Path): # One use for it is to place the unzipped dbdata. # There's no need to save the actual dbdata dir in run_*/ because we just save a symlink to # the .tgz file we unzipped. - self.dbgym_tmp_path = get_tmp_path_from_workspace_path(self.dbgym_workspace_path) + self.dbgym_tmp_path = get_tmp_path_from_workspace_path( + self.dbgym_workspace_path + ) # The best place to delete the old dbgym_tmp_path is in DBGymConfig.__init__(). # This is better than deleting the dbgym_tmp_path is in DBGymConfig.__del__() because DBGymConfig may get deleted before execution has completed. # Also, by keeping the tmp directory around, you can look at it to debug issues. @@ -275,7 +297,9 @@ def cur_task_runs_artifacts_path(self, *dirs, mkdir=False) -> Path: return self.cur_task_runs_path("artifacts", *dirs, mkdir=mkdir) -def conv_inputpath_to_realabspath(dbgym_cfg: DBGymConfig, inputpath: os.PathLike) -> Path: +def conv_inputpath_to_realabspath( + dbgym_cfg: DBGymConfig, inputpath: os.PathLike +) -> Path: """ Convert any user inputted path to a real, absolute path For flexibility, we take in any os.PathLike. However, for consistency, we always output a Path object @@ -296,8 +320,12 @@ def conv_inputpath_to_realabspath(dbgym_cfg: DBGymConfig, inputpath: os.PathLike # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does it this # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs) realabspath = realabspath.resolve() - assert realabspath.is_absolute(), f"after being processed, realabspath ({realabspath}) is still not absolute" - assert realabspath.exists(), f"after being processed, realabspath ({realabspath}) is still a non-existent path" + assert ( + realabspath.is_absolute() + ), f"after being processed, realabspath ({realabspath}) is still not absolute" + assert ( + realabspath.exists() + ), f"after being processed, realabspath ({realabspath}) is still a non-existent path" return realabspath @@ -341,7 +369,9 @@ def parent_dpath_of_path(dpath: Path) -> Path: This function only calls Path.parent, but in a safer way. """ assert isinstance(dpath, Path) - assert is_fully_resolved(dpath), f"dpath must be fully resolved because Path.parent has weird behavior on non-resolved paths (see https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parent)" + assert is_fully_resolved( + dpath + ), f"dpath must be fully resolved because Path.parent has weird behavior on non-resolved paths (see https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parent)" parent_dpath = dpath.parent assert isinstance(parent_dpath, Path) return parent_dpath @@ -352,7 +382,9 @@ def basename_of_path(dpath: Path) -> str: This function only calls Path.name, but in a safer way. """ assert isinstance(dpath, Path) - assert is_fully_resolved(dpath), f"dpath must be fully resolved because Path.name has weird behavior on non-resolved paths (like giving \"..\" if the path ends with a \"..\")" + assert is_fully_resolved( + dpath + ), f'dpath must be fully resolved because Path.name has weird behavior on non-resolved paths (like giving ".." if the path ends with a "..")' dpath_dirname, dpath_basename = os.path.split(dpath) # this means the path ended with a '/' so all os.path.split() does is get rid of the slash if dpath_basename == "": @@ -399,7 +431,9 @@ def open_and_save(dbgym_cfg: DBGymConfig, open_fpath: Path, mode="r"): assert is_fully_resolved( open_fpath ), f"open_and_save(): open_fpath ({open_fpath}) should be a fully resolved path" - assert not os.path.islink(open_fpath), f"open_fpath ({open_fpath}) should not be a symlink" + assert not os.path.islink( + open_fpath + ), f"open_fpath ({open_fpath}) should not be a symlink" assert os.path.exists(open_fpath), f"open_fpath ({open_fpath}) does not exist" # open_and_save *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save # a file though. we sometimes save a directory (see save_file() for details) @@ -412,7 +446,9 @@ def open_and_save(dbgym_cfg: DBGymConfig, open_fpath: Path, mode="r"): return open(open_fpath, mode=mode) -def extract_from_task_run_fordpath(dbgym_cfg: DBGymConfig, task_run_fordpath: Path) -> Tuple[Path, str, Path, str]: +def extract_from_task_run_fordpath( + dbgym_cfg: DBGymConfig, task_run_fordpath: Path +) -> Tuple[Path, str, Path, str]: """ The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want. This function extracts the [codebase] and [org] components @@ -432,9 +468,9 @@ def extract_from_task_run_fordpath(dbgym_cfg: DBGymConfig, task_run_fordpath: Pa ), f"task_run_fordpath ({task_run_fordpath}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_cfg.dbgym_runs_path})" # org_dpath is the run_*/[codebase]/[organization]/ dir that task_run_fordpath is in org_dpath = parent_dpath - while not parent_dpath_of_path(parent_dpath_of_path(parent_dpath_of_path(org_dpath))).samefile( - dbgym_cfg.dbgym_runs_path - ): + while not parent_dpath_of_path( + parent_dpath_of_path(parent_dpath_of_path(org_dpath)) + ).samefile(dbgym_cfg.dbgym_runs_path): org_dpath = parent_dpath_of_path(org_dpath) org_dname = basename_of_path(org_dpath) codebase_dpath = parent_dpath_of_path(org_dpath) @@ -470,7 +506,9 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> Path: # 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them if is_child_path(fpath, dbgym_cfg.dbgym_runs_path): # get paths we'll need later. - _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath(dbgym_cfg, fpath) + _, codebase_dname, org_dpath, org_dname = extract_from_task_run_fordpath( + dbgym_cfg, fpath + ) this_run_save_dpath = dbgym_cfg.dbgym_this_run_path / codebase_dname / org_dname os.makedirs(this_run_save_dpath, exist_ok=True) @@ -505,7 +543,9 @@ def save_file(dbgym_cfg: DBGymConfig, fpath: Path) -> Path: # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead -def link_result(dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_name: str | None=None) -> Path: +def link_result( + dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_name: str | None = None +) -> Path: """ result_fordpath must be a "result", meaning it was generated inside dbgym_cfg.dbgym_this_run_path. Further, result_fordpath must have been generated by this invocation to task.py. This also means that @@ -517,7 +557,9 @@ def link_result(dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_nam This function will return the path to the symlink that was created. """ assert isinstance(result_fordpath, Path) - assert is_fully_resolved(result_fordpath), f"result_fordpath ({result_fordpath}) should be a fully resolved path" + assert is_fully_resolved( + result_fordpath + ), f"result_fordpath ({result_fordpath}) should be a fully resolved path" result_fordpath = conv_inputpath_to_realabspath(dbgym_cfg, result_fordpath) assert is_child_path(result_fordpath, dbgym_cfg.dbgym_this_run_path) assert not os.path.islink(result_fordpath) @@ -533,9 +575,13 @@ def link_result(dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_nam raise AssertionError("result_fordpath must be either a file or dir") # Figure out the parent directory path of the symlink - codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath(dbgym_cfg, result_fordpath) + codebase_dpath, codebase_dname, _, org_dname = extract_from_task_run_fordpath( + dbgym_cfg, result_fordpath + ) # We're only supposed to save files generated by us, which means they should be in cur_task_runs_path() - assert codebase_dpath.samefile(dbgym_cfg.cur_task_runs_path()), f"link_result should only be called on files generated by this invocation to task.py" + assert codebase_dpath.samefile( + dbgym_cfg.cur_task_runs_path() + ), f"link_result should only be called on files generated by this invocation to task.py" symlink_parent_dpath = dbgym_cfg.dbgym_symlinks_path / codebase_dname / org_dname symlink_parent_dpath.mkdir(parents=True, exist_ok=True) @@ -543,7 +589,9 @@ def link_result(dbgym_cfg: DBGymConfig, result_fordpath: Path, custom_result_nam # Note that in a multi-threaded setting, this might remove one created by a process in the same run, # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink # file of the current run regardless of the order of threads. - assert result_name.endswith(".link") and not result_name.endswith(".link.link"), f"result_name ({result_name}) should end with \".link\"" + assert result_name.endswith(".link") and not result_name.endswith( + ".link.link" + ), f'result_name ({result_name}) should end with ".link"' symlink_path = symlink_parent_dpath / result_name try_remove_file(symlink_path) try_create_symlink(result_fordpath, symlink_path) @@ -602,7 +650,7 @@ def make_redis_started(port: int) -> None: except (redis.ConnectionError, redis.TimeoutError): # This means Redis is not running, so we start it do_start_redis = True - + # I'm starting Redis outside of except so that errors in r.ping get propagated correctly if do_start_redis: subprocess_run(f"redis-server --port {port} --daemonize yes") @@ -613,10 +661,14 @@ def make_redis_started(port: int) -> None: def is_ssd(path: Path) -> bool: try: - device = subprocess.check_output(['df', path]).decode().split('\n')[1].split()[0] + device = ( + subprocess.check_output(["df", path]).decode().split("\n")[1].split()[0] + ) device_basename = os.path.basename(device) - lsblk_output = subprocess.check_output(['lsblk', '-d', '-o', 'name,rota']).decode() - for line in lsblk_output.split('\n')[1:]: + lsblk_output = subprocess.check_output( + ["lsblk", "-d", "-o", "name,rota"] + ).decode() + for line in lsblk_output.split("\n")[1:]: parts = line.split() if parts and parts[0] == device_basename: is_ssd = int(parts[1]) == 0 @@ -624,4 +676,4 @@ def is_ssd(path: Path) -> bool: return False except Exception as e: print(f"An error occurred: {e}") - return False \ No newline at end of file + return False diff --git a/scripts/check_format.sh b/scripts/check_format.sh new file mode 100755 index 00000000..74b4188b --- /dev/null +++ b/scripts/check_format.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -euxo pipefail + +black . --check +isort . --profile black -c diff --git a/scripts/format.sh b/scripts/format.sh index ea085aa7..db5cc125 100755 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -1,5 +1,5 @@ #!/bin/bash set -euxo pipefail -find . \( ! -regex '.*/\..*' \) -name '*.py' -exec black {} + -find . \( ! -regex '.*/\..*' \) -name '*.py' -exec isort --profile black {} + \ No newline at end of file +black . +isort . --profile black diff --git a/scripts/run_unit_tests.py b/scripts/run_unit_tests.py index 56aadd28..e0e4821a 100644 --- a/scripts/run_unit_tests.py +++ b/scripts/run_unit_tests.py @@ -1,5 +1,5 @@ -import unittest import sys +import unittest if __name__ == "__main__": loader = unittest.TestLoader() diff --git a/task.py b/task.py index 93b59b1e..7871fdc4 100644 --- a/task.py +++ b/task.py @@ -1,15 +1,14 @@ import logging import os from pathlib import Path + import click -from misc.utils import DBGymConfig from benchmark.cli import benchmark_group from dbms.cli import dbms_group +from manage.cli import manage_group from misc.utils import DBGymConfig from tune.cli import tune_group -from manage.cli import manage_group - # TODO(phw2): save commit, git diff, and run command # TODO(phw2): remove write permissions on old run_*/ dirs to enforce that they are immutable diff --git a/tune/protox/agent/build_trial.py b/tune/protox/agent/build_trial.py index 53e782a5..4bd23ae5 100644 --- a/tune/protox/agent/build_trial.py +++ b/tune/protox/agent/build_trial.py @@ -10,14 +10,17 @@ import gymnasium as gym import numpy as np import torch -from gymnasium.wrappers import ( # type: ignore - FlattenObservation, - NormalizeObservation, - NormalizeReward, -) +from gymnasium.wrappers import FlattenObservation # type: ignore +from gymnasium.wrappers import NormalizeObservation, NormalizeReward from torch import nn -from misc.utils import DBGymConfig, TuningMode, open_and_save, make_redis_started, save_file +from misc.utils import ( + DBGymConfig, + TuningMode, + make_redis_started, + open_and_save, + save_file, +) from tune.protox.agent.agent_env import AgentEnv from tune.protox.agent.buffers import ReplayBuffer from tune.protox.agent.noise import ClampNoise @@ -93,9 +96,13 @@ def _get_signal(signal_folder: Union[str, Path]) -> Tuple[int, str]: raise IOError("No free ports to bind postgres to.") -def _modify_benchbase_config(dbgym_cfg: DBGymConfig, port: int, hpo_params: dict[str, Any]) -> None: +def _modify_benchbase_config( + dbgym_cfg: DBGymConfig, port: int, hpo_params: dict[str, Any] +) -> None: if hpo_params["benchmark_config"]["query_spec"]["oltp_workload"]: - conf_etree = ET.parse(dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / "benchmark.xml") + conf_etree = ET.parse( + dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / "benchmark.xml" + ) jdbc = f"jdbc:postgresql://localhost:{port}/benchbase?preferQueryMode=extended" conf_etree.getroot().find("url").text = jdbc # type: ignore @@ -110,7 +117,9 @@ def _modify_benchbase_config(dbgym_cfg: DBGymConfig, port: int, hpo_params: dict conf_etree.getroot().find("works").find("work").find("time").text = str(oltp_config["oltp_duration"]) # type: ignore if works.find("warmup") is not None: # type: ignore conf_etree.getroot().find("works").find("work").find("warmup").text = str(oltp_config["oltp_warmup"]) # type: ignore - conf_etree.write(dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / "benchmark.xml") + conf_etree.write( + dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / "benchmark.xml" + ) def _gen_noise_scale( @@ -130,7 +139,10 @@ def f(p: ProtoAction, n: torch.Tensor) -> ProtoAction: def _build_utilities( - dbgym_cfg: DBGymConfig, tuning_mode: TuningMode, pgport: int, hpo_params: dict[str, Any] + dbgym_cfg: DBGymConfig, + tuning_mode: TuningMode, + pgport: int, + hpo_params: dict[str, Any], ) -> Tuple[Logger, RewardUtility, PostgresConn, Workload]: logger = Logger( dbgym_cfg, @@ -158,7 +170,9 @@ def _build_utilities( pg_conn = PostgresConn( dbgym_cfg=dbgym_cfg, pgport=pgport, - pristine_dbdata_snapshot_fpath=Path(hpo_params["pgconn_info"]["pristine_dbdata_snapshot_path"]), + pristine_dbdata_snapshot_fpath=Path( + hpo_params["pgconn_info"]["pristine_dbdata_snapshot_path"] + ), dbdata_parent_dpath=Path(hpo_params["pgconn_info"]["dbdata_parent_dpath"]), pgbin_path=Path(hpo_params["pgconn_info"]["pgbin_path"]), enable_boot=enable_boot, @@ -183,7 +197,11 @@ def _build_utilities( def _build_actions( - dbgym_cfg: DBGymConfig, seed: int, hpo_params: dict[str, Any], workload: Workload, logger: Logger + dbgym_cfg: DBGymConfig, + seed: int, + hpo_params: dict[str, Any], + workload: Workload, + logger: Logger, ) -> Tuple[HolonSpace, LSC]: sysknobs = LatentKnobSpace( logger=logger, @@ -274,7 +292,11 @@ def _build_actions( def _build_observation_space( - dbgym_cfg: DBGymConfig, action_space: HolonSpace, lsc: LSC, hpo_params: dict[str, Any], seed: int + dbgym_cfg: DBGymConfig, + action_space: HolonSpace, + lsc: LSC, + hpo_params: dict[str, Any], + seed: int, ) -> StateSpace: if hpo_params["metric_state"] == "metric": return LSCMetricStateSpace( @@ -512,16 +534,24 @@ def _build_agent( def build_trial( - dbgym_cfg: DBGymConfig, tuning_mode: TuningMode, seed: int, hpo_params: dict[str, Any], ray_trial_id: Optional[str]=None + dbgym_cfg: DBGymConfig, + tuning_mode: TuningMode, + seed: int, + hpo_params: dict[str, Any], + ray_trial_id: Optional[str] = None, ) -> Tuple[Logger, TargetResetWrapper, AgentEnv, Wolp, str]: # The massive trial builder. port, signal = _get_signal(hpo_params["pgconn_info"]["pgbin_path"]) _modify_benchbase_config(dbgym_cfg, port, hpo_params) - logger, reward_utility, pg_conn, workload = _build_utilities(dbgym_cfg, tuning_mode, port, hpo_params) + logger, reward_utility, pg_conn, workload = _build_utilities( + dbgym_cfg, tuning_mode, port, hpo_params + ) holon_space, lsc = _build_actions(dbgym_cfg, seed, hpo_params, workload, logger) - observation_space = _build_observation_space(dbgym_cfg, holon_space, lsc, hpo_params, seed) + observation_space = _build_observation_space( + dbgym_cfg, holon_space, lsc, hpo_params, seed + ) target_reset, env = _build_env( dbgym_cfg, tuning_mode, @@ -535,5 +565,7 @@ def build_trial( logger, ) - agent = _build_agent(seed, hpo_params, observation_space, holon_space, logger, ray_trial_id) + agent = _build_agent( + seed, hpo_params, observation_space, holon_space, logger, ray_trial_id + ) return logger, target_reset, env, agent, signal diff --git a/tune/protox/agent/cli.py b/tune/protox/agent/cli.py index a78814a0..98f7bb22 100644 --- a/tune/protox/agent/cli.py +++ b/tune/protox/agent/cli.py @@ -2,8 +2,8 @@ from misc.utils import DBGymConfig from tune.protox.agent.hpo import hpo -from tune.protox.agent.tune import tune from tune.protox.agent.replay import replay +from tune.protox.agent.tune import tune @click.group("agent") diff --git a/tune/protox/agent/coerce_config.py b/tune/protox/agent/coerce_config.py index db8f06eb..70ec54ce 100644 --- a/tune/protox/agent/coerce_config.py +++ b/tune/protox/agent/coerce_config.py @@ -1,10 +1,13 @@ from typing import Any + import yaml from misc.utils import DBGymConfig, TuningMode, open_and_save -def coerce_config(dbgym_cfg: DBGymConfig, space: dict[str, Any], hpo_params: dict[str, Any]) -> dict[str, Any]: +def coerce_config( + dbgym_cfg: DBGymConfig, space: dict[str, Any], hpo_params: dict[str, Any] +) -> dict[str, Any]: if "space_version" not in hpo_params: # This is an old version. Coerce the params file. new_config = {} diff --git a/tune/protox/agent/hpo.py b/tune/protox/agent/hpo.py index bc3d8432..05ca46ef 100644 --- a/tune/protox/agent/hpo.py +++ b/tune/protox/agent/hpo.py @@ -1,35 +1,80 @@ +import json +import os +import random import shutil import sys import time -import json -import yaml -from pathlib import Path -from ray import tune -import numpy as np -import torch -import os -import pandas as pd from datetime import datetime +from pathlib import Path from typing import Any, Optional, Union -import random + import click +import numpy as np +import pandas as pd import ray -from ray.tune import Trainable +import torch +import yaml +from ray import tune +from ray.air import FailureConfig, RunConfig +from ray.train import SyncConfig +from ray.tune import Trainable, TuneConfig from ray.tune.schedulers import FIFOScheduler from ray.tune.search.basic_variant import BasicVariantGenerator -from ray.tune import TuneConfig -from ray.air import RunConfig, FailureConfig -from ray.train import SyncConfig +from misc.utils import ( + BENCHMARK_NAME_PLACEHOLDER, + DEFAULT_BOOT_CONFIG_FPATH, + DEFAULT_SYSKNOBS_PATH, + DEFAULT_WORKLOAD_TIMEOUT, + SCALE_FACTOR_PLACEHOLDER, + WORKLOAD_NAME_PLACEHOLDER, + WORKSPACE_PATH_PLACEHOLDER, + DBGymConfig, + TuningMode, + conv_inputpath_to_realabspath, + default_benchbase_config_path, + default_benchmark_config_path, + default_dbdata_parent_dpath, + default_embedder_path, + default_hpoed_agent_params_fname, + default_pgbin_path, + default_pristine_dbdata_snapshot_path, + default_workload_path, + is_ssd, + link_result, + open_and_save, + restart_ray, + workload_name_fn, +) from tune.protox.agent.build_trial import build_trial -from misc.utils import DEFAULT_BOOT_CONFIG_FPATH, DEFAULT_WORKLOAD_TIMEOUT, DBGymConfig, TuningMode, link_result, open_and_save, restart_ray, conv_inputpath_to_realabspath, default_pristine_dbdata_snapshot_path, default_workload_path, default_embedder_path, default_benchmark_config_path, default_benchbase_config_path, WORKSPACE_PATH_PLACEHOLDER, BENCHMARK_NAME_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, SCALE_FACTOR_PLACEHOLDER, DEFAULT_SYSKNOBS_PATH, default_pgbin_path, workload_name_fn, default_dbdata_parent_dpath, default_hpoed_agent_params_fname, is_ssd - METRIC_NAME = "Best Metric" class AgentHPOArgs: - def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot): + def __init__( + self, + benchmark_name, + workload_name, + embedder_path, + benchmark_config_path, + benchbase_config_path, + sysknobs_path, + pristine_dbdata_snapshot_path, + dbdata_parent_dpath, + pgbin_path, + workload_path, + seed, + agent, + max_concurrent, + num_samples, + tune_duration_during_hpo, + workload_timeout, + query_timeout, + enable_boot_during_hpo, + boot_config_fpath_during_hpo, + build_space_good_for_boot, + ): self.benchmark_name = benchmark_name self.workload_name = workload_name self.embedder_path = embedder_path @@ -55,8 +100,18 @@ def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_confi @click.command() @click.pass_obj @click.argument("benchmark-name") -@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).") -@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).") +@click.option( + "--seed-start", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", +) +@click.option( + "--seed-end", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", +) @click.option( "--query-subset", type=click.Choice(["all", "even", "odd"]), @@ -139,7 +194,10 @@ def __init__(self, benchmark_name, workload_name, embedder_path, benchmark_confi help=f"The # of times to specific hyperparameter configs to sample from the hyperparameter search space and train agent models with.", ) @click.option( - "--tune-duration-during-hpo", default=4, type=float, help="The number of hours to run each hyperparamer config tuning trial for." + "--tune-duration-during-hpo", + default=4, + type=float, + help="The number of hours to run each hyperparamer config tuning trial for.", ) @click.option( "--workload-timeout", @@ -211,43 +269,84 @@ def hpo( # Set args to defaults programmatically (do this before doing anything else in the function) workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset) if embedder_path == None: - embedder_path = default_embedder_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name) + embedder_path = default_embedder_path( + dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name + ) if benchmark_config_path == None: benchmark_config_path = default_benchmark_config_path(benchmark_name) if benchbase_config_path == None: benchbase_config_path = default_benchbase_config_path(benchmark_name) if pristine_dbdata_snapshot_path == None: - pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor) + pristine_dbdata_snapshot_path = default_pristine_dbdata_snapshot_path( + dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor + ) if dbdata_parent_dpath == None: - dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + dbdata_parent_dpath = default_dbdata_parent_dpath( + dbgym_cfg.dbgym_workspace_path + ) if pgbin_path == None: pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) if workload_path == None: - workload_path = default_workload_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name) + workload_path = default_workload_path( + dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name + ) if seed == None: seed = random.randint(0, 1e8) # Convert all input paths to absolute paths embedder_path = conv_inputpath_to_realabspath(dbgym_cfg, embedder_path) - benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) - benchbase_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchbase_config_path) + benchmark_config_path = conv_inputpath_to_realabspath( + dbgym_cfg, benchmark_config_path + ) + benchbase_config_path = conv_inputpath_to_realabspath( + dbgym_cfg, benchbase_config_path + ) sysknobs_path = conv_inputpath_to_realabspath(dbgym_cfg, sysknobs_path) - pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_dbdata_snapshot_path) + pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath( + dbgym_cfg, pristine_dbdata_snapshot_path + ) dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path) - boot_config_fpath_during_hpo = conv_inputpath_to_realabspath(dbgym_cfg, boot_config_fpath_during_hpo) + boot_config_fpath_during_hpo = conv_inputpath_to_realabspath( + dbgym_cfg, boot_config_fpath_during_hpo + ) # Check assertions on args if intended_dbdata_hardware == "hdd": - assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + assert not is_ssd( + dbdata_parent_dpath + ), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" elif intended_dbdata_hardware == "ssd": - assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" + assert is_ssd( + dbdata_parent_dpath + ), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False # Create args object - hpo_args = AgentHPOArgs(benchmark_name, workload_name, embedder_path, benchmark_config_path, benchbase_config_path, sysknobs_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath, pgbin_path, workload_path, seed, agent, max_concurrent, num_samples, tune_duration_during_hpo, workload_timeout, query_timeout, enable_boot_during_hpo, boot_config_fpath_during_hpo, build_space_good_for_boot) + hpo_args = AgentHPOArgs( + benchmark_name, + workload_name, + embedder_path, + benchmark_config_path, + benchbase_config_path, + sysknobs_path, + pristine_dbdata_snapshot_path, + dbdata_parent_dpath, + pgbin_path, + workload_path, + seed, + agent, + max_concurrent, + num_samples, + tune_duration_during_hpo, + workload_timeout, + query_timeout, + enable_boot_during_hpo, + boot_config_fpath_during_hpo, + build_space_good_for_boot, + ) _tune_hpo(dbgym_cfg, hpo_args) @@ -260,14 +359,14 @@ def build_space( workload_path: Path, embedder_path: list[Path], pgconn_info: dict[str, str], - benchbase_config: dict[str, Any]={}, - tune_duration_during_hpo: int=30, - seed: int=0, - enable_boot_during_hpo: bool=False, - boot_config_fpath_during_hpo: Path=None, + benchbase_config: dict[str, Any] = {}, + tune_duration_during_hpo: int = 30, + seed: int = 0, + enable_boot_during_hpo: bool = False, + boot_config_fpath_during_hpo: Path = None, build_space_good_for_boot: bool = False, - workload_timeouts: list[int]=[600], - query_timeouts: list[int]=[30], + workload_timeouts: list[int] = [600], + query_timeouts: list[int] = [30], ) -> dict[str, Any]: return { @@ -286,7 +385,6 @@ def build_space( "boot_config_fpath": { str(TuningMode.HPO): boot_config_fpath_during_hpo, }, - # Timeouts. "tune_duration": { str(TuningMode.HPO): tune_duration_during_hpo, @@ -295,7 +393,6 @@ def build_space( str(TuningMode.HPO): tune.choice(workload_timeouts), }, "query_timeout": tune.choice(query_timeouts), - # Paths. "workload_path": str(workload_path), "pgconn_info": pgconn_info, @@ -303,31 +400,34 @@ def build_space( "benchbase_config": benchbase_config, # Embeddings. "embedder_path": tune.choice(map(str, embedder_path)), - # Default quantization factor to use. "default_quantization_factor": 100, "system_knobs": sysknobs, - # Horizon before resetting. "horizon": 5, - # Workload Eval. "workload_eval_mode": tune.choice(["all", "all_enum"]), "workload_eval_inverse": tune.choice([False, True]), "workload_eval_reset": True, - # Reward. "reward": tune.choice(["multiplier", "relative"]), "reward_scaler": tune.choice([1, 2, 10]), "workload_timeout_penalty": 1, "normalize_reward": tune.choice([False, True]), - # State. - "metric_state": tune.choice(([] if build_space_good_for_boot else ["metric"]) + ["structure", "structure_normalize"]), + "metric_state": tune.choice( + ([] if build_space_good_for_boot else ["metric"]) + + ["structure", "structure_normalize"] + ), "maximize_state": not benchmark_config.get("oltp_workload", False), # Whether to normalize state or not. - "normalize_state": tune.sample_from(lambda spc: False if spc["config"]["metric_state"] == "structure_normalize" else True), - + "normalize_state": tune.sample_from( + lambda spc: ( + False + if spc["config"]["metric_state"] == "structure_normalize" + else True + ) + ), # LSC Parameters. The units for these are based on the embedding itself. # TODO(): Set these parameters based on the workload/embedding structure itself. "lsc": { @@ -343,7 +443,6 @@ def build_space( # How many episodes to start. "shift_after": 3, }, - # RL Agent Parameters. # Number of warmup steps. "learning_starts": 0, @@ -363,11 +462,9 @@ def build_space( "grad_clip": tune.choice([1.0, 5.0, 10.0]), # Gradient steps per sample. "gradient_steps": tune.choice([1, 2, 4]), - # Training steps. "train_freq_unit": tune.choice(["step", "episode"]), "train_freq_frequency": 1, - # Target noise. "target_noise": { "target_noise_clip": tune.choice([0.05, 0.1, 0.15]), @@ -379,7 +476,6 @@ def build_space( "noise_sigma": tune.choice([0.05, 0.1, 0.15]), }, "scale_noise_perturb": True, - # Neighbor parameters. "neighbor_parameters": { "knob_num_nearest": tune.choice([10, 100]), @@ -403,7 +499,7 @@ def __init__(self, tune_duration: float) -> None: self.limit = (tune_duration * 3600) > 0 self.remain = int(tune_duration * 3600) self.running = False - self.start = 0. + self.start = 0.0 def resume(self) -> None: self.start = time.time() @@ -428,7 +524,12 @@ def __call__(self) -> bool: class TuneTrial: - def __init__(self, dbgym_cfg: DBGymConfig, tuning_mode: TuningMode, ray_trial_id: Optional[str]=None) -> None: + def __init__( + self, + dbgym_cfg: DBGymConfig, + tuning_mode: TuningMode, + ray_trial_id: Optional[str] = None, + ) -> None: """ We use this object for HPO, tune, and replay. It behaves *slightly* differently depending on what it's used for, which is why we have the tuning_mode param. @@ -437,16 +538,20 @@ def __init__(self, dbgym_cfg: DBGymConfig, tuning_mode: TuningMode, ray_trial_id self.tuning_mode = tuning_mode if self.tuning_mode == TuningMode.HPO: - assert ray_trial_id != None, "If we're doing HPO, we will create multiple TuneTrial() objects. We thus need to differentiate them somehow." + assert ( + ray_trial_id != None + ), "If we're doing HPO, we will create multiple TuneTrial() objects. We thus need to differentiate them somehow." else: - assert ray_trial_id == None, "If we're not doing HPO, we (currently) will create only one TuneTrial() object. For clarity, we set ray_trial_id to None since ray_trial_id should not be used in this case." + assert ( + ray_trial_id == None + ), "If we're not doing HPO, we (currently) will create only one TuneTrial() object. For clarity, we set ray_trial_id to None since ray_trial_id should not be used in this case." self.ray_trial_id = ray_trial_id def setup(self, hpo_params: dict[str, Any]) -> None: # Attach mythril directory to the search path. sys.path.append(os.path.expanduser(self.dbgym_cfg.dbgym_repo_path)) - torch.set_default_dtype(torch.float32) # type: ignore + torch.set_default_dtype(torch.float32) # type: ignore seed = ( hpo_params["seed"] if hpo_params["seed"] != -1 @@ -498,8 +603,14 @@ def step(self) -> dict[Any, Any]: ) self.env_init = True - assert self.ray_trial_id != None if self.tuning_mode == TuningMode.HPO else True, "If we're doing HPO, we need to ensure that we're passing a non-None ray_trial_id to stash_results() to avoid conflicting folder names." - self.logger.stash_results(infos, name_override="baseline", ray_trial_id=self.ray_trial_id) + assert ( + self.ray_trial_id != None + if self.tuning_mode == TuningMode.HPO + else True + ), "If we're doing HPO, we need to ensure that we're passing a non-None ray_trial_id to stash_results() to avoid conflicting folder names." + self.logger.stash_results( + infos, name_override="baseline", ray_trial_id=self.ray_trial_id + ) else: self.agent.learn(self.env, total_timesteps=1, tuning_mode=self.tuning_mode) @@ -511,13 +622,13 @@ def step(self) -> dict[Any, Any]: "AgentEpisode": episode, "AgentTimesteps": it, "TrialStep": self.step_count, - "Best Metric": self.target_reset.real_best_metric - if self.target_reset - else -1, - "Best Seen Metric": self.target_reset.best_metric - if self.target_reset - else -1, - "HoursElapsed": (time.time() - self.start_time) / 3600., + "Best Metric": ( + self.target_reset.real_best_metric if self.target_reset else -1 + ), + "Best Seen Metric": ( + self.target_reset.best_metric if self.target_reset else -1 + ), + "HoursElapsed": (time.time() - self.start_time) / 3600.0, } # If we've timed out. Note that we've timed out. @@ -529,10 +640,11 @@ def step(self) -> dict[Any, Any]: def cleanup(self) -> None: self.logger.flush() - self.env.close() # type: ignore + self.env.close() # type: ignore if Path(self.signal).exists(): os.remove(self.signal) + # I want to pass dbgym_cfg into TuneOpt without putting it inside `hpo_params`. This is because it's a pain to turn DBGymConfig # into a nice dictionary of strings, and nothing in DBGymConfig would be relevant to someone checking the configs later # Using a function to create a class is Ray's recommended way of doing this (see @@ -546,7 +658,9 @@ class TuneOpt(Trainable): dbgym_cfg = global_dbgym_cfg def setup(self, hpo_params: dict[str, Any]) -> None: - self.trial = TuneTrial(TuneOpt.dbgym_cfg, TuningMode.HPO, ray_trial_id=self.trial_id) + self.trial = TuneTrial( + TuneOpt.dbgym_cfg, TuningMode.HPO, ray_trial_id=self.trial_id + ) self.trial.setup(hpo_params) def step(self) -> dict[Any, Any]: @@ -562,7 +676,7 @@ def save_checkpoint(self, checkpoint_dir: str) -> None: def load_checkpoint(self, checkpoint_dir: Union[dict[Any, Any], None]) -> None: # We can't actually do anything about this right now. pass - + return TuneOpt @@ -583,16 +697,20 @@ def _tune_hpo(dbgym_cfg: DBGymConfig, hpo_args: AgentHPOArgs) -> None: workload_timeouts = [hpo_args.workload_timeout] query_timeouts = [hpo_args.query_timeout] - benchbase_config = { - "oltp_config": { - "oltp_num_terminals": hpo_args.oltp_num_terminals, - "oltp_duration": hpo_args.oltp_duration, - "oltp_sf": hpo_args.oltp_sf, - "oltp_warmup": hpo_args.oltp_warmup, - }, - "benchbase_path": hpo_args.benchbase_path, - "benchbase_config_path": hpo_args.benchbase_config_path, - } if is_oltp else {} + benchbase_config = ( + { + "oltp_config": { + "oltp_num_terminals": hpo_args.oltp_num_terminals, + "oltp_duration": hpo_args.oltp_duration, + "oltp_sf": hpo_args.oltp_sf, + "oltp_warmup": hpo_args.oltp_warmup, + }, + "benchbase_path": hpo_args.benchbase_path, + "benchbase_config_path": hpo_args.benchbase_config_path, + } + if is_oltp + else {} + ) space = build_space( sysknobs, @@ -615,15 +733,15 @@ def _tune_hpo(dbgym_cfg: DBGymConfig, hpo_args: AgentHPOArgs) -> None: ) restart_ray(dbgym_cfg.root_yaml["ray_gcs_port"]) - ray.init(address=f"localhost:{dbgym_cfg.root_yaml['ray_gcs_port']}", log_to_driver=False) + ray.init( + address=f"localhost:{dbgym_cfg.root_yaml['ray_gcs_port']}", log_to_driver=False + ) # Scheduler. - scheduler = FIFOScheduler() # type: ignore + scheduler = FIFOScheduler() # type: ignore # Search. - search = BasicVariantGenerator( - max_concurrent=hpo_args.max_concurrent - ) + search = BasicVariantGenerator(max_concurrent=hpo_args.max_concurrent) mode = "max" if is_oltp else "min" tune_config = TuneConfig( @@ -659,18 +777,29 @@ def _tune_hpo(dbgym_cfg: DBGymConfig, hpo_args: AgentHPOArgs) -> None: if results[i].error: print(f"Trial {results[i]} FAILED") assert False, print("Encountered exceptions!") - + # Save the best params.json. best_result = results.get_best_result(metric=METRIC_NAME, mode=mode) best_params_generated_fpath = Path(best_result.path) / "params.json" # Before saving, copy it into run_*/[codebase]/data/. This way, save_file() called on # params.json will link directly to run_*/[codebase]/data/params.json instead of to # run_*/[codebase]/hpo_ray_results/TuneOpt*/. - best_params_copy_fpath = dbgym_cfg.cur_task_runs_data_path(mkdir=True) / "params.json" + best_params_copy_fpath = ( + dbgym_cfg.cur_task_runs_data_path(mkdir=True) / "params.json" + ) shutil.copy(best_params_generated_fpath, best_params_copy_fpath) - link_result(dbgym_cfg, best_params_copy_fpath, custom_result_name=default_hpoed_agent_params_fname(hpo_args.benchmark_name, hpo_args.workload_name) + ".link") + link_result( + dbgym_cfg, + best_params_copy_fpath, + custom_result_name=default_hpoed_agent_params_fname( + hpo_args.benchmark_name, hpo_args.workload_name + ) + + ".link", + ) # We also link from run_*/[codebase]/data/params.json to run_*/[codebase]/hpo_ray_results/TuneOpt*/**/params.json. # This way, when _manually_ looking through run_*/, we can see which HPO trial was # responsible for creating params.json. - best_params_link_fpath = dbgym_cfg.cur_task_runs_data_path(mkdir=True) / "params.json.link" + best_params_link_fpath = ( + dbgym_cfg.cur_task_runs_data_path(mkdir=True) / "params.json.link" + ) os.symlink(best_params_generated_fpath, best_params_link_fpath) diff --git a/tune/protox/agent/off_policy_algorithm.py b/tune/protox/agent/off_policy_algorithm.py index dd39d7ba..fd33004e 100644 --- a/tune/protox/agent/off_policy_algorithm.py +++ b/tune/protox/agent/off_policy_algorithm.py @@ -189,7 +189,9 @@ def collect_rollouts( # We only stash the results if we're not doing HPO, or else the results from concurrent HPO would get # stashed in the same directory and potentially cause a race condition. if self.logger: - assert self.ray_trial_id != None if tuning_mode == TuningMode.HPO else True, "If we're doing HPO, we need to ensure that we're passing a non-None ray_trial_id to stash_results() to avoid conflicting folder names." + assert ( + self.ray_trial_id != None if tuning_mode == TuningMode.HPO else True + ), "If we're doing HPO, we need to ensure that we're passing a non-None ray_trial_id to stash_results() to avoid conflicting folder names." self.logger.stash_results(infos, ray_trial_id=self.ray_trial_id) self.num_timesteps += 1 @@ -217,7 +219,9 @@ def collect_rollouts( num_collected_steps, num_collected_episodes, continue_training ) - def learn(self, env: AgentEnv, total_timesteps: int, tuning_mode: TuningMode) -> None: + def learn( + self, env: AgentEnv, total_timesteps: int, tuning_mode: TuningMode + ) -> None: assert isinstance(env, AgentEnv) total_timesteps = self._setup_learn(env, total_timesteps) diff --git a/tune/protox/agent/replay.py b/tune/protox/agent/replay.py index 9bf346bb..6c59ba5c 100644 --- a/tune/protox/agent/replay.py +++ b/tune/protox/agent/replay.py @@ -5,16 +5,26 @@ Additionally, the original tuning run may have been accelerated by Boot, whereas the replayed tuning run is not. """ + import json import logging import pickle +from pathlib import Path + import click import pandas as pd import tqdm -from pathlib import Path from dateutil.parser import parse -from misc.utils import DBGymConfig, TuningMode, conv_inputpath_to_realabspath, open_and_save, save_file, workload_name_fn, default_tuning_steps_dpath +from misc.utils import ( + DBGymConfig, + TuningMode, + conv_inputpath_to_realabspath, + default_tuning_steps_dpath, + open_and_save, + save_file, + workload_name_fn, +) from tune.protox.agent.build_trial import build_trial from tune.protox.env.pg_env import PostgresEnv from tune.protox.env.space.holon_space import HolonSpace @@ -22,13 +32,17 @@ from tune.protox.env.types import HolonAction from tune.protox.env.workload import Workload - REPLAY_DATA_FNAME = "replay_data.csv" class ReplayArgs: def __init__( - self, workload_timeout_during_replay: bool, replay_all_variations: bool, simulated: bool, cutoff: float, blocklist: list + self, + workload_timeout_during_replay: bool, + replay_all_variations: bool, + simulated: bool, + cutoff: float, + blocklist: list, ): self.workload_timeout_during_replay = workload_timeout_during_replay self.replay_all_variations = replay_all_variations @@ -40,8 +54,18 @@ def __init__( @click.command() @click.pass_obj @click.argument("benchmark-name") -@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).") -@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).") +@click.option( + "--seed-start", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", +) +@click.option( + "--seed-end", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", +) @click.option( "--query-subset", type=click.Choice(["all", "even", "odd"]), @@ -61,7 +85,7 @@ def __init__( "--tuning-steps-dpath", default=None, type=Path, - help="The path to the `tuning_steps` directory to be replayed." + help="The path to the `tuning_steps` directory to be replayed.", ) @click.option( "--workload-timeout-during-replay", @@ -69,51 +93,79 @@ def __init__( type=int, # You can make it use the workload timeout used during tuning if you want. # I just made it use the workload timeout from HPO because I don't currently persist the tuning HPO params. - help="The timeout (in seconds) of a workload when replaying. By default, it will be equal to the workload timeout used during HPO." + help="The timeout (in seconds) of a workload when replaying. By default, it will be equal to the workload timeout used during HPO.", ) @click.option( "--replay-all-variations", is_flag=True, - help="If true, replay all the variations of each query. If false, only replay the variation we found was best in the tuning run. Replaying all variations has two possible use cases: (1) it makes the cache warm to better replicate behavior during tuning, (2) if the best variation during tuning was determined with Boot, it might not still be the best variation." + help="If true, replay all the variations of each query. If false, only replay the variation we found was best in the tuning run. Replaying all variations has two possible use cases: (1) it makes the cache warm to better replicate behavior during tuning, (2) if the best variation during tuning was determined with Boot, it might not still be the best variation.", ) @click.option( "--simulated", is_flag=True, - help="Set to true to use the runtimes from the original tuning run instead of replaying the workload." + help="Set to true to use the runtimes from the original tuning run instead of replaying the workload.", ) @click.option( "--cutoff", default=None, type=float, - help="Only evaluate configs up to cutoff hours. None means \"evaluate all configs\"." + help='Only evaluate configs up to cutoff hours. None means "evaluate all configs".', ) @click.option( "--blocklist", default=[], type=list, - help="Ignore running queries in the blocklist." + help="Ignore running queries in the blocklist.", ) -def replay(dbgym_cfg: DBGymConfig, benchmark_name: str, seed_start: int, seed_end: int, query_subset: str, scale_factor: float, boot_enabled_during_tune: bool, tuning_steps_dpath: Path, workload_timeout_during_replay: bool, replay_all_variations: bool, simulated: bool, cutoff: float, blocklist: list) -> None: +def replay( + dbgym_cfg: DBGymConfig, + benchmark_name: str, + seed_start: int, + seed_end: int, + query_subset: str, + scale_factor: float, + boot_enabled_during_tune: bool, + tuning_steps_dpath: Path, + workload_timeout_during_replay: bool, + replay_all_variations: bool, + simulated: bool, + cutoff: float, + blocklist: list, +) -> None: # Set args to defaults programmatically (do this before doing anything else in the function) workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset) if tuning_steps_dpath == None: - tuning_steps_dpath = default_tuning_steps_dpath(dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name, boot_enabled_during_tune) + tuning_steps_dpath = default_tuning_steps_dpath( + dbgym_cfg.dbgym_workspace_path, + benchmark_name, + workload_name, + boot_enabled_during_tune, + ) # Convert all input paths to absolute paths tuning_steps_dpath = conv_inputpath_to_realabspath(dbgym_cfg, tuning_steps_dpath) # Group args together to reduce the # of parameters we pass into functions - replay_args = ReplayArgs(workload_timeout_during_replay, replay_all_variations, simulated, cutoff, blocklist) + replay_args = ReplayArgs( + workload_timeout_during_replay, + replay_all_variations, + simulated, + cutoff, + blocklist, + ) # Replay replay_tuning_run(dbgym_cfg, tuning_steps_dpath, replay_args) -def replay_tuning_run(dbgym_cfg: DBGymConfig, tuning_steps_dpath: Path, replay_args: ReplayArgs): +def replay_tuning_run( + dbgym_cfg: DBGymConfig, tuning_steps_dpath: Path, replay_args: ReplayArgs +): """ Replay a single tuning run (as in one tuning_steps/ folder). """ + def _is_tuning_step_line(line: str) -> bool: return "mv" in line and "tuning_steps" in line and "baseline" not in line @@ -123,12 +175,16 @@ def _is_tuning_step_line(line: str) -> bool: # Set defaults that depend on hpo_params if replay_args.workload_timeout_during_replay == None: - replay_args.workload_timeout_during_replay = hpo_params["workload_timeout"][str(TuningMode.HPO)] + replay_args.workload_timeout_during_replay = hpo_params["workload_timeout"][ + str(TuningMode.HPO) + ] # Set the hpo_params that are allowed to differ between HPO, tuning, and replay. hpo_params["enable_boot"][str(TuningMode.REPLAY)] = False hpo_params["boot_config_fpath"][str(TuningMode.REPLAY)] = None - hpo_params["workload_timeout"][str(TuningMode.REPLAY)] = replay_args.workload_timeout_during_replay + hpo_params["workload_timeout"][ + str(TuningMode.REPLAY) + ] = replay_args.workload_timeout_during_replay # Go through output.log and find the tuning_steps/[time]/ folders # This finds all the [time] folders in tuning_steps/ (except "baseline" since we ignore that in `_is_tuning_step_line()`), @@ -140,21 +196,35 @@ def _is_tuning_step_line(line: str) -> bool: for line in f: if not start_found: if "Baseline Metric" in line: - start_time = parse(line.split("INFO:")[-1].split(" Baseline Metric")[0].split("[")[0]) + start_time = parse( + line.split("INFO:")[-1] + .split(" Baseline Metric")[0] + .split("[")[0] + ) start_found = True else: if _is_tuning_step_line(line): repo = eval(line.split("Running ")[-1])[-1] last_folder = repo.split("/")[-1] - time_since_start = parse(line.split("DEBUG:")[-1].split(" Running")[0].split("[")[0]) - if replay_args.cutoff == None or (time_since_start - start_time).total_seconds() < replay_args.cutoff * 3600: + time_since_start = parse( + line.split("DEBUG:")[-1].split(" Running")[0].split("[")[0] + ) + if ( + replay_args.cutoff == None + or (time_since_start - start_time).total_seconds() + < replay_args.cutoff * 3600 + ): folders.append(last_folder) - + # Set tune_duration to be high so that it doesn't cut the replay off early - hpo_params["tune_duration"][str(TuningMode.REPLAY)] = replay_args.workload_timeout_during_replay * len(folders) + hpo_params["tune_duration"][str(TuningMode.REPLAY)] = ( + replay_args.workload_timeout_during_replay * len(folders) + ) # Build PostgresEnv. - _, _, agent_env, _, _ = build_trial(dbgym_cfg, TuningMode.REPLAY, hpo_params["seed"], hpo_params) + _, _, agent_env, _, _ = build_trial( + dbgym_cfg, TuningMode.REPLAY, hpo_params["seed"], hpo_params + ) pg_env: PostgresEnv = agent_env.unwrapped action_space: HolonSpace = pg_env.action_space @@ -172,14 +242,26 @@ def _is_tuning_step_line(line: str) -> bool: # A convenience wrapper around execute_workload() which fills in the arguments properly and processes the return values. def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: - logging.info(f"\n\nfetch_server_knobs(): {fetch_server_knobs(pg_env.pg_conn.conn(), action_space.get_knob_space().tables, action_space.get_knob_space().knobs, pg_env.workload.queries)}\n\n") - logging.info(f"\n\nfetch_server_indexes(): {fetch_server_indexes(pg_env.pg_conn.conn(), action_space.get_knob_space().tables)}\n\n") - assert replay_args.workload_timeout_during_replay == hpo_params["workload_timeout"][str(TuningMode.REPLAY)] == pg_env.workload.workload_timeout, "All these different sources of workload_timeout during replay should show the same value" + logging.info( + f"\n\nfetch_server_knobs(): {fetch_server_knobs(pg_env.pg_conn.conn(), action_space.get_knob_space().tables, action_space.get_knob_space().knobs, pg_env.workload.queries)}\n\n" + ) + logging.info( + f"\n\nfetch_server_indexes(): {fetch_server_indexes(pg_env.pg_conn.conn(), action_space.get_knob_space().tables)}\n\n" + ) + assert ( + replay_args.workload_timeout_during_replay + == hpo_params["workload_timeout"][str(TuningMode.REPLAY)] + == pg_env.workload.workload_timeout + ), "All these different sources of workload_timeout during replay should show the same value" if replay_args.replay_all_variations: all_holon_action_variations = actions_info["all_holon_action_variations"] - actions = [holon_action for (_, holon_action) in all_holon_action_variations] - variation_names = [variation_name for (variation_name, _) in all_holon_action_variations] + actions = [ + holon_action for (_, holon_action) in all_holon_action_variations + ] + variation_names = [ + variation_name for (variation_name, _) in all_holon_action_variations + ] else: # Note that "best observed" is not an entirely accurate name. Specifically, if the workload times out, some queries # will not have had a chance to run at all. Based on the behavior of `_mutilate_action_with_metrics()`, we select @@ -188,21 +270,28 @@ def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: actions = [best_observed_holon_action] variation_names = ["BestObserved"] - num_timed_out_queries, did_workload_time_out, qid_runtime_data = pg_env.workload.execute_workload( - pg_conn=pg_env.pg_conn, - actions=actions, - variation_names=variation_names, - observation_space=None, - action_space=action_space, - reset_metrics=None, - query_timeout=None, - workload_qdir=None, - blocklist=replay_args.blocklist, - first=False, + num_timed_out_queries, did_workload_time_out, qid_runtime_data = ( + pg_env.workload.execute_workload( + pg_conn=pg_env.pg_conn, + actions=actions, + variation_names=variation_names, + observation_space=None, + action_space=action_space, + reset_metrics=None, + query_timeout=None, + workload_qdir=None, + blocklist=replay_args.blocklist, + first=False, + ) ) workload_runtime = Workload.compute_total_workload_runtime(qid_runtime_data) num_executed_queries = len(qid_runtime_data) - return num_executed_queries, num_timed_out_queries, did_workload_time_out, workload_runtime + return ( + num_executed_queries, + num_timed_out_queries, + did_workload_time_out, + workload_runtime, + ) run_data = [] progess_bar = tqdm.tqdm(total=num_lines) @@ -218,17 +307,27 @@ def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: if not start_found: if "Baseline Metric" in line: start_found = True - start_time = parse(line.split("INFO:")[-1].split(" Baseline Metric")[0].split("[")[0]) + start_time = parse( + line.split("INFO:")[-1] + .split(" Baseline Metric")[0] + .split("[")[0] + ) progess_bar.update(1) continue elif _is_tuning_step_line(line): if _is_tuning_step_line(line): repo = eval(line.split("Running ")[-1])[-1] - time_since_start = parse(line.split("DEBUG:")[-1].split(" Running")[0].split("[")[0]) + time_since_start = parse( + line.split("DEBUG:")[-1].split(" Running")[0].split("[")[0] + ) elif "Found new maximal state with" in line: repo = eval(maximal_repo.split("Running ")[-1])[-1] - time_since_start = parse(maximal_repo.split("DEBUG:")[-1].split(" Running")[0].split("[")[0]) + time_since_start = parse( + maximal_repo.split("DEBUG:")[-1] + .split(" Running")[0] + .split("[")[0] + ) maximal_repo = None # Get the original runtime as well as whether any individual queries and/or the full workload timed out. @@ -240,40 +339,67 @@ def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: # because Proto-X decreases `workload_timeout` over the course of the tuning run. Specifically, at the end of a tuning step, Proto-X # sets `workload_timeout` to be equal to the runtime of the workload that just ran. # We separate the penalty rows from the non-penalty rows to process them separately. - run_raw_csv_penalty_rows = run_raw_csv[run_raw_csv["Transaction Name"] == "P"] - run_raw_csv_non_penalty_rows = run_raw_csv[run_raw_csv["Transaction Name"] != "P"] + run_raw_csv_penalty_rows = run_raw_csv[ + run_raw_csv["Transaction Name"] == "P" + ] + run_raw_csv_non_penalty_rows = run_raw_csv[ + run_raw_csv["Transaction Name"] != "P" + ] # Get the number of executed queries. A query timing out is not the same as a query not being executed. We do this instead of getting the # number of skipped queries since we don't have the total # of queries with the current codebase. num_executed_queries_in_original = len(run_raw_csv_non_penalty_rows) # `num_timed_out_queries_in_original` counts the number of queries where *all variations* timed out. Note that the query_timeout of # a query may be set extremely low because the workload is about to time out, so it could be viewed as "unfair" to count those queries as # having timed out. Regardless, that's how we currently do things. - num_timed_out_queries_in_original = run_raw_csv_non_penalty_rows["Timed Out"].sum() + num_timed_out_queries_in_original = run_raw_csv_non_penalty_rows[ + "Timed Out" + ].sum() # Penalties are added when the workload times out so this is a reliable indicator of whether the workload timed out. did_workload_time_out_in_original = len(run_raw_csv_penalty_rows) > 0 # Penalties are meant to affect the reward of the tuning agent but they are unrelated to the actual runtime, so we ignore them when # computing the original runtime. - original_workload_runtime = run_raw_csv_non_penalty_rows["Latency (microseconds)"].sum() / 1e6 + original_workload_runtime = ( + run_raw_csv_non_penalty_rows["Latency (microseconds)"].sum() / 1e6 + ) assert original_workload_runtime > 0 # Extract the necessary values from action.pkl - with open_and_save(dbgym_cfg, tuning_steps_dpath / repo / "action.pkl", "rb") as f: + with open_and_save( + dbgym_cfg, tuning_steps_dpath / repo / "action.pkl", "rb" + ) as f: actions_info = pickle.load(f) - all_holon_action_variations = actions_info["all_holon_action_variations"] + all_holon_action_variations = actions_info[ + "all_holon_action_variations" + ] # Extract the KnobSpaceAction and IndexAction from all_holon_action_variations. # These two should be identical across all HolonActions, which we will assert. _, first_holon_action = all_holon_action_variations[0] knob_space_action = first_holon_action[0] index_space_raw_sample = first_holon_action[1] - index_action = action_space.get_index_space().to_action(index_space_raw_sample) - assert all([knob_space_action == holon_action[0] for (_, holon_action) in all_holon_action_variations]) - assert all([index_action == action_space.get_index_space().to_action(holon_action[1]) for (_, holon_action) in all_holon_action_variations]) + index_action = action_space.get_index_space().to_action( + index_space_raw_sample + ) + assert all( + [ + knob_space_action == holon_action[0] + for (_, holon_action) in all_holon_action_variations + ] + ) + assert all( + [ + index_action + == action_space.get_index_space().to_action(holon_action[1]) + for (_, holon_action) in all_holon_action_variations + ] + ) # Get the indexes from this action and the prior state index_acts = set() index_acts.add(index_action) assert len(index_acts) > 0 - with open_and_save(dbgym_cfg, tuning_steps_dpath / repo / "prior_state.pkl", "rb") as f: + with open_and_save( + dbgym_cfg, tuning_steps_dpath / repo / "prior_state.pkl", "rb" + ) as f: prior_states = pickle.load(f) all_sc = set(prior_states[1]) for index_act in index_acts: @@ -293,17 +419,36 @@ def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: # Modify Postgres to have the right indexes and system-wide knobs. `index_modification_sqls` holds the indexes # while `cc` holds the system-wide knobs. if not replay_args.simulated: - cc, _ = action_space.get_knob_space().generate_action_plan(knob_space_action, prior_states[0]) + cc, _ = action_space.get_knob_space().generate_action_plan( + knob_space_action, prior_states[0] + ) # Like in tuning, we don't dump the page cache when calling shift_state() to see how the workload # performs in a warm cache scenario. - pg_env.shift_state(cc, index_modification_sqls, dump_page_cache=False) + pg_env.shift_state( + cc, index_modification_sqls, dump_page_cache=False + ) existing_index_acts = index_acts # Execute the workload to get the runtime. if not replay_args.simulated: - num_executed_queries_in_replay, num_timed_out_queries_in_replay, did_workload_time_out_in_replay, replayed_workload_runtime = _execute_workload_wrapper(actions_info) + ( + num_executed_queries_in_replay, + num_timed_out_queries_in_replay, + did_workload_time_out_in_replay, + replayed_workload_runtime, + ) = _execute_workload_wrapper(actions_info) else: - num_executed_queries_in_replay, num_timed_out_queries_in_replay, did_workload_time_out_in_replay, replayed_workload_runtime = num_executed_queries_in_original, num_timed_out_queries_in_original, did_workload_time_out_in_original, original_workload_runtime + ( + num_executed_queries_in_replay, + num_timed_out_queries_in_replay, + did_workload_time_out_in_replay, + replayed_workload_runtime, + ) = ( + num_executed_queries_in_original, + num_timed_out_queries_in_original, + did_workload_time_out_in_original, + original_workload_runtime, + ) # Perform some validity checks and then add this tuning step's data to `run_data``. this_step_run_data = { @@ -320,7 +465,10 @@ def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: } # Log before performing checks to help with debugging. logging.info(f"this_step_run_data={this_step_run_data}") - assert not (num_timed_out_queries_in_replay > 0 and not did_workload_time_out_in_replay), "During replay, individual queries should not time out unless they timed out because the whole workload timed out." + assert not ( + num_timed_out_queries_in_replay > 0 + and not did_workload_time_out_in_replay + ), "During replay, individual queries should not time out unless they timed out because the whole workload timed out." run_data.append(this_step_run_data) current_step += 1 @@ -331,7 +479,9 @@ def _execute_workload_wrapper(actions_info: list["HolonAction"]) -> list[float]: # Output. run_data_df = pd.DataFrame(run_data) - pd.set_option('display.max_columns', 10) - print(f"Finished replaying with run_data_df=\n{run_data_df}\n. Data stored in {dbgym_cfg.cur_task_runs_path()}.") + pd.set_option("display.max_columns", 10) + print( + f"Finished replaying with run_data_df=\n{run_data_df}\n. Data stored in {dbgym_cfg.cur_task_runs_path()}." + ) run_data_df.to_csv(dbgym_cfg.cur_task_runs_data_path("run_data.csv"), index=False) - pg_env.close() \ No newline at end of file + pg_env.close() diff --git a/tune/protox/agent/tune.py b/tune/protox/agent/tune.py index c25eaf62..2ec6045b 100644 --- a/tune/protox/agent/tune.py +++ b/tune/protox/agent/tune.py @@ -1,12 +1,26 @@ import json import os -from pathlib import Path import shutil import time +from pathlib import Path + import click import pandas as pd -from misc.utils import DEFAULT_BOOT_CONFIG_FPATH, WORKSPACE_PATH_PLACEHOLDER, DBGymConfig, TuningMode, conv_inputpath_to_realabspath, link_result, open_and_save, default_hpoed_agent_params_path, BENCHMARK_NAME_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, workload_name_fn, default_tuning_steps_dname +from misc.utils import ( + BENCHMARK_NAME_PLACEHOLDER, + DEFAULT_BOOT_CONFIG_FPATH, + WORKLOAD_NAME_PLACEHOLDER, + WORKSPACE_PATH_PLACEHOLDER, + DBGymConfig, + TuningMode, + conv_inputpath_to_realabspath, + default_hpoed_agent_params_path, + default_tuning_steps_dname, + link_result, + open_and_save, + workload_name_fn, +) from tune.protox.agent.coerce_config import coerce_config from tune.protox.agent.hpo import TuneTrial, build_space @@ -15,8 +29,18 @@ @click.command() @click.pass_obj @click.argument("benchmark-name") -@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).") -@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).") +@click.option( + "--seed-start", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", +) +@click.option( + "--seed-end", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", +) @click.option( "--query-subset", type=click.Choice(["all", "even", "odd"]), @@ -48,31 +72,52 @@ "--tune-duration-during-tune", default=None, type=float, - help="The number of hours to run the tuning agent for. If you do not specify this argument, it will be the same as --tune-duration-during-hpo." + help="The number of hours to run the tuning agent for. If you do not specify this argument, it will be the same as --tune-duration-during-hpo.", ) -def tune(dbgym_cfg: DBGymConfig, benchmark_name: str, seed_start: int, seed_end: int, query_subset: str, scale_factor: float, hpoed_agent_params_path: Path, enable_boot_during_tune: bool, boot_config_fpath_during_tune: Path, tune_duration_during_tune: float) -> None: - """IMPORTANT: The "tune" here is the one in "tune a DBMS". This is *different* from the "tune" in ray.tune.TuneConfig, which means to "tune hyperparameters".""" +def tune( + dbgym_cfg: DBGymConfig, + benchmark_name: str, + seed_start: int, + seed_end: int, + query_subset: str, + scale_factor: float, + hpoed_agent_params_path: Path, + enable_boot_during_tune: bool, + boot_config_fpath_during_tune: Path, + tune_duration_during_tune: float, +) -> None: + """IMPORTANT: The "tune" here is the one in "tune a DBMS". This is *different* from the "tune" in ray.tune.TuneConfig, which means to "tune hyperparameters".""" # Set args to defaults programmatically (do this before doing anything else in the function) workload_name = workload_name_fn(scale_factor, seed_start, seed_end, query_subset) if hpoed_agent_params_path == None: - hpoed_agent_params_path = default_hpoed_agent_params_path(dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name) + hpoed_agent_params_path = default_hpoed_agent_params_path( + dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name + ) # Convert all input paths to absolute paths - hpoed_agent_params_path = conv_inputpath_to_realabspath(dbgym_cfg, hpoed_agent_params_path) - boot_config_fpath_during_tune = conv_inputpath_to_realabspath(dbgym_cfg, boot_config_fpath_during_tune) + hpoed_agent_params_path = conv_inputpath_to_realabspath( + dbgym_cfg, hpoed_agent_params_path + ) + boot_config_fpath_during_tune = conv_inputpath_to_realabspath( + dbgym_cfg, boot_config_fpath_during_tune + ) # Tune with open_and_save(dbgym_cfg, hpoed_agent_params_path, "r") as f: hpo_params = json.load(f) # Coerce using a dummy space. - hpo_params = coerce_config(dbgym_cfg, build_space( - sysknobs={}, - benchmark_config={}, - workload_path=Path(), - embedder_path=[], - pgconn_info={} - ), hpo_params) + hpo_params = coerce_config( + dbgym_cfg, + build_space( + sysknobs={}, + benchmark_config={}, + workload_path=Path(), + embedder_path=[], + pgconn_info={}, + ), + hpo_params, + ) # Set defaults that depend on hpo_params if tune_duration_during_tune == None: @@ -86,9 +131,13 @@ def tune(dbgym_cfg: DBGymConfig, benchmark_name: str, seed_start: int, seed_end: # Note that while we currently do not persist the hpo_params used during *tuning* back to # a file, this is entirely possible to do in the future if needed. hpo_params["enable_boot"][str(TuningMode.TUNE)] = enable_boot_during_tune - hpo_params["boot_config_fpath"][str(TuningMode.TUNE)] = boot_config_fpath_during_tune + hpo_params["boot_config_fpath"][ + str(TuningMode.TUNE) + ] = boot_config_fpath_during_tune hpo_params["tune_duration"][str(TuningMode.TUNE)] = tune_duration_during_tune - hpo_params["workload_timeout"][str(TuningMode.TUNE)] = hpo_params["workload_timeout"][str(TuningMode.HPO)] + hpo_params["workload_timeout"][str(TuningMode.TUNE)] = hpo_params[ + "workload_timeout" + ][str(TuningMode.HPO)] # Piggyback off the HPO magic. tune_trial = TuneTrial(dbgym_cfg, TuningMode.TUNE) @@ -115,8 +164,14 @@ def tune(dbgym_cfg: DBGymConfig, benchmark_name: str, seed_start: int, seed_end: # We copy instead of just symlinking so that tuning_steps/ is a fully self-contained directory. hpoed_agent_params_copy_fpath = tuning_steps_dpath / "params.json" shutil.copy(hpoed_agent_params_path, hpoed_agent_params_copy_fpath) - tuning_steps_link_dname = default_tuning_steps_dname(benchmark_name, workload_name, enable_boot_during_tune) - link_result(dbgym_cfg, tuning_steps_dpath, custom_result_name=tuning_steps_link_dname + ".link") + tuning_steps_link_dname = default_tuning_steps_dname( + benchmark_name, workload_name, enable_boot_during_tune + ) + link_result( + dbgym_cfg, + tuning_steps_dpath, + custom_result_name=tuning_steps_link_dname + ".link", + ) # We also create a link to hpoed_agent_params_path. This is useful when we are _manually_ looking through # run_*/ and want to see which other run_*/ was responsible for creating params.json hpoed_agent_params_link_fpath = tuning_steps_dpath / "params.json.link" diff --git a/tune/protox/embedding/analyze.py b/tune/protox/embedding/analyze.py index cdf6666c..d895e9ca 100644 --- a/tune/protox/embedding/analyze.py +++ b/tune/protox/embedding/analyze.py @@ -18,17 +18,17 @@ from tune.protox.embedding.train_all import ( create_vae_model, fetch_index_parameters, + fetch_vae_parameters_from_workload, load_input_data, - fetch_vae_parameters_from_workload ) -from tune.protox.embedding.trainer import StratifiedRandomSampler -from tune.protox.embedding.vae import VAELoss, gen_vae_collate -from tune.protox.env.space.latent_space.latent_index_space import LatentIndexSpace -from tune.protox.env.workload import Workload from tune.protox.embedding.train_args import ( EmbeddingAnalyzeArgs, EmbeddingTrainGenericArgs, ) +from tune.protox.embedding.trainer import StratifiedRandomSampler +from tune.protox.embedding.vae import VAELoss, gen_vae_collate +from tune.protox.env.space.latent_space.latent_index_space import LatentIndexSpace +from tune.protox.env.workload import Workload STATS_FNAME = "stats.txt" RANGES_FNAME = "ranges.txt" @@ -46,7 +46,9 @@ def redist_trained_models(dbgym_cfg: DBGymConfig, num_parts: int): Redistribute all embeddings_*/ folders inside the run_*/ folder into num_parts subfolders """ inputs = [ - f for f in dbgym_cfg.cur_task_runs_data_path(mkdir=True).glob("embeddings*") if os.path.isdir(f) + f + for f in dbgym_cfg.cur_task_runs_data_path(mkdir=True).glob("embeddings*") + if os.path.isdir(f) ] for part_i in range(num_parts): @@ -57,7 +59,12 @@ def redist_trained_models(dbgym_cfg: DBGymConfig, num_parts: int): shutil.move(emb, _get_part_i_dpath(dbgym_cfg, part_i)) -def analyze_all_embeddings_parts(dbgym_cfg: DBGymConfig, num_parts: int, generic_args: EmbeddingTrainGenericArgs, analyze_args: EmbeddingAnalyzeArgs): +def analyze_all_embeddings_parts( + dbgym_cfg: DBGymConfig, + num_parts: int, + generic_args: EmbeddingTrainGenericArgs, + analyze_args: EmbeddingAnalyzeArgs, +): """ Analyze all part*/ dirs _in parallel_ """ @@ -71,7 +78,12 @@ def analyze_all_embeddings_parts(dbgym_cfg: DBGymConfig, num_parts: int, generic f.write(f"{analyze_all_parts_duration}") -def _analyze_embeddings_part(dbgym_cfg: DBGymConfig, part_i: int, generic_args: EmbeddingTrainGenericArgs, analyze_args: EmbeddingAnalyzeArgs): +def _analyze_embeddings_part( + dbgym_cfg: DBGymConfig, + part_i: int, + generic_args: EmbeddingTrainGenericArgs, + analyze_args: EmbeddingAnalyzeArgs, +): """ Analyze (meaning create both stats.txt and ranges.txt) all the embedding models in the part[part_i]/ dir """ @@ -90,7 +102,12 @@ def _analyze_embeddings_part(dbgym_cfg: DBGymConfig, part_i: int, generic_args: f.write(f"{create_range_duration}") -def _create_stats_for_part(dbgym_cfg: DBGymConfig, part_dpath: Path, generic_args: EmbeddingTrainGenericArgs, analyze_args: EmbeddingAnalyzeArgs): +def _create_stats_for_part( + dbgym_cfg: DBGymConfig, + part_dpath: Path, + generic_args: EmbeddingTrainGenericArgs, + analyze_args: EmbeddingAnalyzeArgs, +): """ Creates a stats.txt file inside each embeddings_*/models/epoch*/ dir inside this part*/ dir TODO(wz2): what does stats.txt contain? @@ -298,7 +315,12 @@ def _create_stats_for_part(dbgym_cfg: DBGymConfig, part_dpath: Path, generic_arg gc.collect() -def _create_ranges_for_part(dbgym_cfg: DBGymConfig, part_dpath: Path, generic_args: EmbeddingTrainGenericArgs, analyze_args: EmbeddingAnalyzeArgs): +def _create_ranges_for_part( + dbgym_cfg: DBGymConfig, + part_dpath: Path, + generic_args: EmbeddingTrainGenericArgs, + analyze_args: EmbeddingAnalyzeArgs, +): """ Create the ranges.txt for all models in part_dpath TODO(wz2): what does ranges.txt contain? @@ -306,11 +328,7 @@ def _create_ranges_for_part(dbgym_cfg: DBGymConfig, part_dpath: Path, generic_ar # Unlike for training, we're safe to use all threads for creating ranges os.environ["OMP_NUM_THREADS"] = str(os.cpu_count()) paths = sorted( - [ - f - for f in part_dpath.rglob("embedder_*.pth") - if "optimizer" not in str(f) - ] + [f for f in part_dpath.rglob("embedder_*.pth") if "optimizer" not in str(f)] ) for embedder_fpath in tqdm.tqdm(paths): _create_ranges_for_embedder( @@ -318,7 +336,12 @@ def _create_ranges_for_part(dbgym_cfg: DBGymConfig, part_dpath: Path, generic_ar ) -def _create_ranges_for_embedder(dbgym_cfg: DBGymConfig, embedder_fpath: Path, generic_args: EmbeddingTrainGenericArgs, analyze_args: EmbeddingAnalyzeArgs): +def _create_ranges_for_embedder( + dbgym_cfg: DBGymConfig, + embedder_fpath: Path, + generic_args: EmbeddingTrainGenericArgs, + analyze_args: EmbeddingAnalyzeArgs, +): """ Create the ranges.txt file corresponding to a specific part*/embeddings_*/models/epoch*/embedder_*.pth file """ @@ -337,21 +360,29 @@ def _create_ranges_for_embedder(dbgym_cfg: DBGymConfig, embedder_fpath: Path, ge attributes = benchmark_config["attributes"] query_spec = benchmark_config["query_spec"] - workload = Workload(dbgym_cfg, tables, attributes, query_spec, generic_args.workload_path, pid=None) + workload = Workload( + dbgym_cfg, tables, attributes, query_spec, generic_args.workload_path, pid=None + ) modified_attrs = workload.column_usages() # Load VAE. embeddings_dpath = embedder_fpath.parent.parent.parent # part*/embeddings_*/ - embeddings_config_fpath = embeddings_dpath / "config" # part*/embeddings_*/config + embeddings_config_fpath = embeddings_dpath / "config" # part*/embeddings_*/config # don't use open_and_save() because we generated embeddings_config_fpath in this run with open(embeddings_config_fpath, "r") as f: config = json.load(f) assert config["mean_output_act"] == "sigmoid" - index_output_transform = lambda x: torch.nn.Sigmoid()(x) * config["output_scale"] + index_output_transform = ( + lambda x: torch.nn.Sigmoid()(x) * config["output_scale"] + ) + def index_noise_scale(x, n): assert n is None - return torch.clamp(x, 0., config["output_scale"]) - max_attrs, max_cat_features = fetch_vae_parameters_from_workload(workload, len(tables)) + return torch.clamp(x, 0.0, config["output_scale"]) + + max_attrs, max_cat_features = fetch_vae_parameters_from_workload( + workload, len(tables) + ) vae = create_vae_model(config, max_attrs, max_cat_features) # don't call save_file() because we generated embedder_fpath in this run vae.load_state_dict(torch.load(embedder_fpath)) @@ -373,39 +404,61 @@ def index_noise_scale(x, n): index_output_transform=index_output_transform, # No-op noise. index_noise_scale=index_noise_scale, - logger=None) + logger=None, + ) output_scale = config["metric_loss_md"]["output_scale"] bias_separation = config["metric_loss_md"]["bias_separation"] num_segments = min(analyze_args.max_segments, math.ceil(1.0 / bias_separation)) base = 0 - epoch_dpath = embeddings_dpath / "models" / f"epoch{epoch_i}" # part*/embeddings_*/models/epoch*/ + epoch_dpath = ( + embeddings_dpath / "models" / f"epoch{epoch_i}" + ) # part*/embeddings_*/models/epoch*/ ranges_fpath = epoch_dpath / RANGES_FNAME with open(ranges_fpath, "w") as f: for _ in tqdm.tqdm(range(num_segments), total=num_segments, leave=False): classes = {} with torch.no_grad(): - points = torch.rand(analyze_args.num_points_to_sample, config["latent_dim"]) * output_scale + base + points = ( + torch.rand(analyze_args.num_points_to_sample, config["latent_dim"]) + * output_scale + + base + ) protos = idxs.from_latent(points) - neighbors = [idxs.neighborhood(proto, neighbor_parameters={ - "knob_num_nearest": 100, - "knob_span": 1, - "index_num_samples": 1, - "index_rules": False, - })[0] for proto in protos] + neighbors = [ + idxs.neighborhood( + proto, + neighbor_parameters={ + "knob_num_nearest": 100, + "knob_span": 1, + "index_num_samples": 1, + "index_rules": False, + }, + )[0] + for proto in protos + ] for n in neighbors: idx_class = idxs.get_index_class(n) if idx_class not in classes: classes[idx_class] = 0 classes[idx_class] += 1 - classes = sorted([(k, v) for k, v in classes.items()], key=lambda x: x[1], reverse=True) + classes = sorted( + [(k, v) for k, v in classes.items()], key=lambda x: x[1], reverse=True + ) if analyze_args.num_classes_to_keep != 0: - classes = classes[:analyze_args.num_classes_to_keep] + classes = classes[: analyze_args.num_classes_to_keep] f.write(f"Generating range {base} - {base + output_scale}\n") - f.write("\n".join([f"{k}: {v / analyze_args.num_points_to_sample}" for (k, v) in classes])) + f.write( + "\n".join( + [ + f"{k}: {v / analyze_args.num_points_to_sample}" + for (k, v) in classes + ] + ) + ) f.write("\n") base += output_scale diff --git a/tune/protox/embedding/datagen.py b/tune/protox/embedding/datagen.py index 3e4889c8..53defc2b 100644 --- a/tune/protox/embedding/datagen.py +++ b/tune/protox/embedding/datagen.py @@ -3,41 +3,42 @@ import math import os import random +import shutil import time from itertools import chain, combinations from multiprocessing import Pool from pathlib import Path + import click import numpy as np import pandas as pd import yaml from sklearn.preprocessing import quantile_transform -import shutil +from dbms.postgres.cli import create_conn, start_postgres, stop_postgres from misc.utils import ( BENCHMARK_NAME_PLACEHOLDER, + SCALE_FACTOR_PLACEHOLDER, WORKLOAD_NAME_PLACEHOLDER, WORKSPACE_PATH_PLACEHOLDER, - SCALE_FACTOR_PLACEHOLDER, DBGymConfig, conv_inputpath_to_realabspath, default_benchmark_config_path, - default_workload_path, - default_pristine_dbdata_snapshot_path, + default_dbdata_parent_dpath, default_pgbin_path, - traindata_fname, + default_pristine_dbdata_snapshot_path, + default_workload_path, + is_ssd, link_result, open_and_save, save_file, + traindata_fname, workload_name_fn, - default_dbdata_parent_dpath, - is_ssd, ) from tune.protox.embedding.loss import COST_COLUMNS from tune.protox.env.space.primitive_space.index_space import IndexSpace from tune.protox.env.types import QueryType from tune.protox.env.workload import Workload -from dbms.postgres.cli import create_conn, start_postgres, stop_postgres from util.shell import subprocess_run # FUTURE(oltp) @@ -54,8 +55,18 @@ # generic args @click.argument("benchmark-name") -@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).") -@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).") +@click.option( + "--seed-start", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", +) +@click.option( + "--seed-end", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", +) @click.option( "--query-subset", type=click.Choice(["all", "even", "odd"]), @@ -66,7 +77,12 @@ default=1.0, help=f"The scale factor used when generating the data of the benchmark.", ) -@click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.") +@click.option( + "--pgbin-path", + type=Path, + default=None, + help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.", +) # TODO(phw2): need to run pgtune before gathering data @click.option( "--pristine-dbdata-snapshot-path", @@ -177,7 +193,9 @@ def datagen( Updates the symlink in the data/ dir to point to the new .parquet file. """ # check args - assert seed_start <= seed_end, f'seed_start ({seed_start}) must be <= seed_end ({seed_end})' + assert ( + seed_start <= seed_end + ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" # set args to defaults programmatically (do this before doing anything else in the function) # TODO(phw2): figure out whether different scale factors use the same config @@ -196,7 +214,9 @@ def datagen( dbgym_cfg.dbgym_workspace_path, benchmark_name, scale_factor ) if dbdata_parent_dpath == None: - dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path) + dbdata_parent_dpath = default_dbdata_parent_dpath( + dbgym_cfg.dbgym_workspace_path + ) if max_concurrent == None: max_concurrent = os.cpu_count() if seed == None: @@ -204,16 +224,24 @@ def datagen( # Convert all input paths to absolute paths workload_path = conv_inputpath_to_realabspath(dbgym_cfg, workload_path) - benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) + benchmark_config_path = conv_inputpath_to_realabspath( + dbgym_cfg, benchmark_config_path + ) pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path) - pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath(dbgym_cfg, pristine_dbdata_snapshot_path) + pristine_dbdata_snapshot_path = conv_inputpath_to_realabspath( + dbgym_cfg, pristine_dbdata_snapshot_path + ) dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath) # Check assertions on args if intended_dbdata_hardware == "hdd": - assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" + assert not is_ssd( + dbdata_parent_dpath + ), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD" elif intended_dbdata_hardware == "ssd": - assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" + assert is_ssd( + dbdata_parent_dpath + ), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD" else: assert False @@ -238,7 +266,14 @@ def datagen( # Group args together to reduce the # of parameters we pass into functions # I chose to group them into separate objects instead because it felt hacky to pass a giant args object into every function generic_args = EmbeddingDatagenGenericArgs( - benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath + benchmark_name, + workload_name, + scale_factor, + benchmark_config_path, + seed, + workload_path, + pristine_dbdata_snapshot_path, + dbdata_parent_dpath, ) dir_gen_args = EmbeddingDirGenArgs( leading_col_tbls, @@ -252,7 +287,11 @@ def datagen( # run all steps start_time = time.time() - dbdata_dpath = untar_snapshot(dbgym_cfg, generic_args.pristine_dbdata_snapshot_path, generic_args.dbdata_parent_dpath) + dbdata_dpath = untar_snapshot( + dbgym_cfg, + generic_args.pristine_dbdata_snapshot_path, + generic_args.dbdata_parent_dpath, + ) pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path) start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) _gen_traindata_dir(dbgym_cfg, generic_args, dir_gen_args) @@ -263,9 +302,13 @@ def datagen( stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath) -def untar_snapshot(dbgym_cfg: DBGymConfig, dbdata_snapshot_fpath: Path, dbdata_parent_dpath: Path) -> Path: +def untar_snapshot( + dbgym_cfg: DBGymConfig, dbdata_snapshot_fpath: Path, dbdata_parent_dpath: Path +) -> Path: # It should be an absolute path and it should exist - assert dbdata_snapshot_fpath.is_absolute() and dbdata_snapshot_fpath.exists(), f"untar_snapshot(): dbdata_snapshot_fpath ({dbdata_snapshot_fpath}) either doesn't exist or is not absolute" + assert ( + dbdata_snapshot_fpath.is_absolute() and dbdata_snapshot_fpath.exists() + ), f"untar_snapshot(): dbdata_snapshot_fpath ({dbdata_snapshot_fpath}) either doesn't exist or is not absolute" # It may be a symlink so we need to resolve them first dbdata_snapshot_real_fpath = dbdata_snapshot_fpath.resolve() save_file(dbgym_cfg, dbdata_snapshot_real_fpath) @@ -286,7 +329,17 @@ class EmbeddingDatagenGenericArgs: I wanted to make multiple classes instead of just one to conceptually separate the different args """ - def __init__(self, benchmark_name, workload_name, scale_factor, benchmark_config_path, seed, workload_path, pristine_dbdata_snapshot_path, dbdata_parent_dpath): + def __init__( + self, + benchmark_name, + workload_name, + scale_factor, + benchmark_config_path, + seed, + workload_path, + pristine_dbdata_snapshot_path, + dbdata_parent_dpath, + ): self.benchmark_name = benchmark_name self.workload_name = workload_name self.scale_factor = scale_factor @@ -399,7 +452,9 @@ def _gen_traindata_dir(dbgym_cfg: DBGymConfig, generic_args, dir_gen_args): def _combine_traindata_dir_into_parquet( - dbgym_cfg: DBGymConfig, generic_args: EmbeddingDatagenGenericArgs, file_gen_args: EmbeddingFileGenArgs + dbgym_cfg: DBGymConfig, + generic_args: EmbeddingDatagenGenericArgs, + file_gen_args: EmbeddingFileGenArgs, ): tbl_dirs = {} with open_and_save(dbgym_cfg, generic_args.benchmark_config_path, "r") as f: @@ -499,7 +554,9 @@ def read(file: Path) -> pd.DataFrame: cur_bias -= sep_bias df = pd.concat(datum, ignore_index=True) - traindata_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) / traindata_fname(generic_args.benchmark_name, generic_args.workload_name) + traindata_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True) / traindata_fname( + generic_args.benchmark_name, generic_args.workload_name + ) df.to_parquet(traindata_path) link_result(dbgym_cfg, traindata_path) diff --git a/tune/protox/embedding/select.py b/tune/protox/embedding/select.py index 936bd328..1e28dce0 100644 --- a/tune/protox/embedding/select.py +++ b/tune/protox/embedding/select.py @@ -7,9 +7,12 @@ import pandas as pd import tqdm -from misc.utils import DBGymConfig, link_result, default_embedder_dname -from tune.protox.embedding.train_args import EmbeddingTrainGenericArgs, EmbeddingSelectArgs +from misc.utils import DBGymConfig, default_embedder_dname, link_result from tune.protox.embedding.analyze import RANGES_FNAME, STATS_FNAME +from tune.protox.embedding.train_args import ( + EmbeddingSelectArgs, + EmbeddingTrainGenericArgs, +) class DotDict(dict): @@ -18,7 +21,11 @@ class DotDict(dict): __delattr__ = dict.__delitem__ -def select_best_embeddings(dbgym_cfg: DBGymConfig, generic_args: EmbeddingTrainGenericArgs, select_args: EmbeddingSelectArgs) -> None: +def select_best_embeddings( + dbgym_cfg: DBGymConfig, + generic_args: EmbeddingTrainGenericArgs, + select_args: EmbeddingSelectArgs, +) -> None: data = _load_data(dbgym_cfg, select_args) if generic_args.traindata_path is not None and os.path.exists( @@ -28,10 +35,10 @@ def select_best_embeddings(dbgym_cfg: DBGymConfig, generic_args: EmbeddingTrainG data = _attach(data, raw_data, select_args.idx_limit) curated_dpath = dbgym_cfg.cur_task_runs_data_path("curated", mkdir=True) - curated_results_fpath = dbgym_cfg.cur_task_runs_data_path(mkdir=True) / "curated_results.csv" - data.to_csv( - curated_results_fpath, index=False + curated_results_fpath = ( + dbgym_cfg.cur_task_runs_data_path(mkdir=True) / "curated_results.csv" ) + data.to_csv(curated_results_fpath, index=False) if "idx_class_total_error" in data: data["elbo"] = data.elbo + data.idx_class_total_error @@ -64,9 +71,7 @@ def select_best_embeddings(dbgym_cfg: DBGymConfig, generic_args: EmbeddingTrainG for loop_i, tup in enumerate(df.itertuples()): epoch = int(str(tup.path).split("epoch")[-1]) model_dpath = curated_dpath / f"model{idx}" - shutil.copytree( - tup.path, model_dpath - ) + shutil.copytree(tup.path, model_dpath) shutil.copy( Path(tup.root) / "config", model_dpath / "config", @@ -77,7 +82,14 @@ def select_best_embeddings(dbgym_cfg: DBGymConfig, generic_args: EmbeddingTrainG ) if loop_i == 0: - link_result(dbgym_cfg, model_dpath, custom_result_name=default_embedder_dname(generic_args.benchmark_name, generic_args.workload_name) + ".link") + link_result( + dbgym_cfg, + model_dpath, + custom_result_name=default_embedder_dname( + generic_args.benchmark_name, generic_args.workload_name + ) + + ".link", + ) info_txt.write(f"model{idx}/embedder.pth\n") idx += 1 diff --git a/tune/protox/embedding/train.py b/tune/protox/embedding/train.py index afc41077..69eba251 100644 --- a/tune/protox/embedding/train.py +++ b/tune/protox/embedding/train.py @@ -1,6 +1,7 @@ import logging import random from pathlib import Path + import click import numpy as np import torch @@ -39,8 +40,18 @@ # generic args @click.argument("benchmark-name", type=str) -@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).") -@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).") +@click.option( + "--seed-start", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", +) +@click.option( + "--seed-end", + type=int, + default=15721, + help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", +) @click.option( "--query-subset", type=click.Choice(["all", "even", "odd"]), @@ -201,7 +212,9 @@ def train( seed = random.randint(0, 1e8) # Convert all input paths to absolute paths - benchmark_config_path = conv_inputpath_to_realabspath(dbgym_cfg, benchmark_config_path) + benchmark_config_path = conv_inputpath_to_realabspath( + dbgym_cfg, benchmark_config_path + ) traindata_path = conv_inputpath_to_realabspath(dbgym_cfg, traindata_path) hpo_space_path = conv_inputpath_to_realabspath(dbgym_cfg, hpo_space_path) @@ -211,12 +224,21 @@ def train( torch.manual_seed(seed) logging.getLogger().setLevel(logging.INFO) - workload_path = conv_inputpath_to_realabspath(dbgym_cfg, default_workload_path( - dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name - )) + workload_path = conv_inputpath_to_realabspath( + dbgym_cfg, + default_workload_path( + dbgym_cfg.dbgym_workspace_path, benchmark_name, workload_name + ), + ) # group args. see comment in datagen.py:datagen() generic_args = EmbeddingTrainGenericArgs( - benchmark_name, workload_name, scale_factor, benchmark_config_path, traindata_path, seed, workload_path + benchmark_name, + workload_name, + scale_factor, + benchmark_config_path, + traindata_path, + seed, + workload_path, ) train_args = EmbeddingTrainAllArgs( hpo_space_path, diff --git a/tune/protox/embedding/train_all.py b/tune/protox/embedding/train_all.py index f5bbd687..e8358387 100644 --- a/tune/protox/embedding/train_all.py +++ b/tune/protox/embedding/train_all.py @@ -1,5 +1,5 @@ -import gc import copy +import gc import json import logging import os @@ -9,6 +9,7 @@ from datetime import datetime from pathlib import Path from typing import Any, Callable, Optional, Tuple, Union + import numpy as np import pandas as pd import ray @@ -62,7 +63,9 @@ def fetch_index_parameters( tables = data["tables"] attributes = data["attributes"] query_spec = data["query_spec"] - workload = Workload(dbgym_cfg, tables, attributes, query_spec, workload_path, pid=None) + workload = Workload( + dbgym_cfg, tables, attributes, query_spec, workload_path, pid=None + ) modified_attrs = workload.column_usages() space = IndexSpace( @@ -85,7 +88,12 @@ def fetch_index_parameters( def load_input_data( - dbgym_cfg: DBGymConfig, traindata_path: Path, train_size: int, max_attrs: int, require_cost: bool, seed: int + dbgym_cfg: DBGymConfig, + traindata_path: Path, + train_size: int, + max_attrs: int, + require_cost: bool, + seed: int, ) -> Tuple[TensorDataset, Any, Any, Optional[TensorDataset], int]: # Load the input data. columns = [] @@ -187,7 +195,9 @@ def train_all_embeddings( # Connect to cluster or die. restart_ray(dbgym_cfg.root_yaml["ray_gcs_port"]) - ray.init(address=f"localhost:{dbgym_cfg.root_yaml['ray_gcs_port']}", log_to_driver=False) + ray.init( + address=f"localhost:{dbgym_cfg.root_yaml['ray_gcs_port']}", log_to_driver=False + ) scheduler = FIFOScheduler() # type: ignore # Search. @@ -284,8 +294,13 @@ def _hpo_train( config["metric_loss_md"]["output_scale"] = config["output_scale"] dtime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - trial_dpath = dbgym_cfg.cur_task_runs_data_path(mkdir=True) / f"embeddings_{dtime}_{os.getpid()}" - assert not trial_dpath.exists(), f"at this point, trial_dpath ({trial_dpath}) should not exist" + trial_dpath = ( + dbgym_cfg.cur_task_runs_data_path(mkdir=True) + / f"embeddings_{dtime}_{os.getpid()}" + ) + assert ( + not trial_dpath.exists() + ), f"at this point, trial_dpath ({trial_dpath}) should not exist" # Seed seed = np.random.randint(int(1), int(1e8)) diff --git a/tune/protox/embedding/train_args.py b/tune/protox/embedding/train_args.py index cad4cee4..f4a955f9 100644 --- a/tune/protox/embedding/train_args.py +++ b/tune/protox/embedding/train_args.py @@ -2,7 +2,14 @@ class EmbeddingTrainGenericArgs: """Same comment as EmbeddingDatagenGenericArgs""" def __init__( - self, benchmark_name, workload_name, scale_factor, benchmark_config_path, traindata_path, seed, workload_path + self, + benchmark_name, + workload_name, + scale_factor, + benchmark_config_path, + traindata_path, + seed, + workload_path, ): self.benchmark_name = benchmark_name self.workload_name = workload_name diff --git a/tune/protox/env/logger.py b/tune/protox/env/logger.py index 12176780..627e6a3c 100644 --- a/tune/protox/env/logger.py +++ b/tune/protox/env/logger.py @@ -92,25 +92,40 @@ def get_logger(self, name: Optional[str]) -> logging.Logger: return logging.getLogger(name) def stash_results( - self, info_dict: dict[str, Any], name_override: Optional[str] = None, ray_trial_id: Optional[str] = None, + self, + info_dict: dict[str, Any], + name_override: Optional[str] = None, + ray_trial_id: Optional[str] = None, ) -> None: """ Stash data about this step of tuning so that it can be replayed. """ - dname = name_override if name_override else datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + dname = ( + name_override + if name_override + else datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ) if ray_trial_id != None: # Orthogonal to whether name_override is used, ray_trial_id disambiguates between folders created # by different HPO trials so that the folders don't overwrite each other. dname += f"_{ray_trial_id}" - if info_dict["results_dpath"] is not None and Path(info_dict["results_dpath"]).exists(): - local["mv"][info_dict["results_dpath"], f"{self.tuning_steps_dpath}/{dname}"].run() + if ( + info_dict["results_dpath"] is not None + and Path(info_dict["results_dpath"]).exists() + ): + local["mv"][ + info_dict["results_dpath"], f"{self.tuning_steps_dpath}/{dname}" + ].run() else: - Path(f"{self.tuning_steps_dpath}/{dname}").mkdir(parents=True, exist_ok=True) + Path(f"{self.tuning_steps_dpath}/{dname}").mkdir( + parents=True, exist_ok=True + ) if info_dict["prior_pgconf"]: local["cp"][ - info_dict["prior_pgconf"], f"{self.tuning_steps_dpath}/{dname}/old_pg.conf" + info_dict["prior_pgconf"], + f"{self.tuning_steps_dpath}/{dname}/old_pg.conf", ].run() if info_dict["prior_state_container"]: diff --git a/tune/protox/env/lsc/lsc.py b/tune/protox/env/lsc/lsc.py index fdc18749..824baa75 100644 --- a/tune/protox/env/lsc/lsc.py +++ b/tune/protox/env/lsc/lsc.py @@ -59,12 +59,13 @@ def __init__( ) self.logger.get_logger(__name__).info("LSC Shift Max: %s", self.max) - def apply_bias(self, action: ProtoAction) -> ProtoAction: if not self.enabled: return action - assert action.shape[-1] == self.vae_configuration["latent_dim"], print(action.shape, self.vae_configuration["latent_dim"]) + assert action.shape[-1] == self.vae_configuration["latent_dim"], print( + action.shape, self.vae_configuration["latent_dim"] + ) # Get the LSC shift associated with the current episode. lsc_shift = self.lsc_shift[(self.num_steps % self.horizon)] @@ -82,14 +83,13 @@ def current_bias(self) -> float: def current_scale(self) -> np.typing.NDArray[np.float32]: if not self.enabled: - return np.array([-1.], dtype=np.float32) + return np.array([-1.0], dtype=np.float32) lsc_shift = self.lsc_shift[(self.num_steps % self.horizon)] lsc_max = self.max[(self.num_steps % self.horizon)] rel = lsc_shift / lsc_max return np.array([(rel * 2.0) - 1], dtype=np.float32) - def inverse_scale(self, value: torch.Tensor) -> torch.Tensor: if not self.enabled: return torch.zeros_like(value).float() @@ -98,7 +98,6 @@ def inverse_scale(self, value: torch.Tensor) -> torch.Tensor: lsc_shift = ((value + 1) / 2.0) * lsc_max return cast(torch.Tensor, lsc_shift * self.vae_configuration["output_scale"]) - def advance(self) -> None: if self.frozen or (not self.enabled): return diff --git a/tune/protox/env/mqo/mqo_wrapper.py b/tune/protox/env/mqo/mqo_wrapper.py index 61f1d277..84baa36f 100644 --- a/tune/protox/env/mqo/mqo_wrapper.py +++ b/tune/protox/env/mqo/mqo_wrapper.py @@ -78,7 +78,7 @@ def _mutilate_action_with_metrics( # query knobs of their best variation. # For queries that executed where all their variations timed out, we don't want to use the knobs # in `timeout_qknobs` since those are known to be bad. Instead, we just use the knobs selected by - # by the agent, which may be different from the knobs of *all* variations. + # by the agent, which may be different from the knobs of *all* variations. # Finally, for queries that didn't execute, we'll assume that some arbitrary variation ("PrevDual") # is probably better than the knobs set by the agent. @@ -162,20 +162,38 @@ def __init__( self.best_observed: dict[str, BestQueryRun] = {} self.logger = logger - def _update_best_observed(self, query_metric_data: dict[str, BestQueryRun], force_overwrite=False) -> None: + def _update_best_observed( + self, query_metric_data: dict[str, BestQueryRun], force_overwrite=False + ) -> None: if query_metric_data is not None: for qid, best_run in query_metric_data.items(): if qid not in self.best_observed or force_overwrite: - self.best_observed[qid] = BestQueryRun(best_run.query_run, best_run.runtime, best_run.timed_out, None, None) + self.best_observed[qid] = BestQueryRun( + best_run.query_run, + best_run.runtime, + best_run.timed_out, + None, + None, + ) if self.logger: - self.logger.get_logger(__name__).debug(f"[best_observe] {qid}: {best_run.runtime/1e6} (force: {force_overwrite})") + self.logger.get_logger(__name__).debug( + f"[best_observe] {qid}: {best_run.runtime/1e6} (force: {force_overwrite})" + ) elif not best_run.timed_out: qobs = self.best_observed[qid] assert qobs.runtime and best_run.runtime if best_run.runtime < qobs.runtime: - self.best_observed[qid] = BestQueryRun(best_run.query_run, best_run.runtime, best_run.timed_out, None, None) + self.best_observed[qid] = BestQueryRun( + best_run.query_run, + best_run.runtime, + best_run.timed_out, + None, + None, + ) if self.logger: - self.logger.get_logger(__name__).debug(f"[best_observe] {qid}: {best_run.runtime/1e6}") + self.logger.get_logger(__name__).debug( + f"[best_observe] {qid}: {best_run.runtime/1e6}" + ) def step( # type: ignore self, @@ -301,8 +319,12 @@ def transmute( with torch.no_grad(): # Pass the mutilated action back through. assert isinstance(self.action_space, HolonSpace) - info["actions_info"]["best_observed_holon_action"] = best_observed_holon_action - info["maximal_embed"] = self.action_space.to_latent([best_observed_holon_action]) + info["actions_info"][ + "best_observed_holon_action" + ] = best_observed_holon_action + info["maximal_embed"] = self.action_space.to_latent( + [best_observed_holon_action] + ) return self.unwrapped.step_post_execute(success, action, info) @@ -380,7 +402,9 @@ def reset(self, *args: Any, **kwargs: Any) -> Tuple[Any, EnvInfoDict]: # type: # Reward should be irrelevant. If we do accidentally use it, cause an error. # Similarly, metric should be irrelevant. Do not shift the workload timeout. - info = EnvInfoDict({"metric": None, "reward": None, "results_dpath": results_dpath}) + info = EnvInfoDict( + {"metric": None, "reward": None, "results_dpath": results_dpath} + ) # Use this to adjust the container and state but don't shift the step. state, _, _, _, info = self.unwrapped.step_post_execute( True, action, info, soft=True @@ -389,7 +413,8 @@ def reset(self, *args: Any, **kwargs: Any) -> Tuple[Any, EnvInfoDict]: # type: # Update the reward baseline. if self.unwrapped.reward_utility: self.unwrapped.reward_utility.set_relative_baseline( - self.unwrapped.baseline_metric, prev_result=metric, + self.unwrapped.baseline_metric, + prev_result=metric, ) if self.logger: diff --git a/tune/protox/env/pg_env.py b/tune/protox/env/pg_env.py index 92236519..3e267d53 100644 --- a/tune/protox/env/pg_env.py +++ b/tune/protox/env/pg_env.py @@ -158,17 +158,19 @@ def reset( # type: ignore ) default_action = self.action_space.null_action(sc) - success, metric, _, results_dpath, _, query_metric_data = self.workload.execute( - pg_conn=self.pg_conn, - reward_utility=self.reward_utility, - observation_space=self.observation_space, - action_space=self.action_space, - actions=[default_action], - variation_names=["GlobalDual"], - benchbase_config=self.benchbase_config, - query_timeout=self.query_timeout, - update=False, - first=True, + success, metric, _, results_dpath, _, query_metric_data = ( + self.workload.execute( + pg_conn=self.pg_conn, + reward_utility=self.reward_utility, + observation_space=self.observation_space, + action_space=self.action_space, + actions=[default_action], + variation_names=["GlobalDual"], + benchbase_config=self.benchbase_config, + query_timeout=self.query_timeout, + update=False, + first=True, + ) ) # Ensure that the first run succeeds. @@ -253,9 +255,15 @@ def step_execute( assert isinstance(self.observation_space, StateSpace) assert isinstance(self.action_space, HolonSpace) # Evaluate the benchmark. - self.logger.get_logger(__name__).info(f"\n\nfetch_server_knobs(): {fetch_server_knobs(self.pg_conn.conn(), self.action_space.get_knob_space().tables, self.action_space.get_knob_space().knobs, self.workload.queries)}\n\n") - self.logger.get_logger(__name__).info(f"\n\nfetch_server_indexes(): {fetch_server_indexes(self.pg_conn.conn(), self.action_space.get_knob_space().tables)}\n\n") - self.logger.get_logger(__name__).info(f"\n\naction_names: {[a[0] for a in all_holon_action_variations]}\n\n") + self.logger.get_logger(__name__).info( + f"\n\nfetch_server_knobs(): {fetch_server_knobs(self.pg_conn.conn(), self.action_space.get_knob_space().tables, self.action_space.get_knob_space().knobs, self.workload.queries)}\n\n" + ) + self.logger.get_logger(__name__).info( + f"\n\nfetch_server_indexes(): {fetch_server_indexes(self.pg_conn.conn(), self.action_space.get_knob_space().tables)}\n\n" + ) + self.logger.get_logger(__name__).info( + f"\n\naction_names: {[a[0] for a in all_holon_action_variations]}\n\n" + ) ( success, metric, @@ -376,16 +384,20 @@ def attempt_checkpoint(conn_str: str) -> None: conn_str, autocommit=True, prepare_threshold=None ) as conn: conn.execute("CHECKPOINT") - + break except psycopg.OperationalError as e: attempts += 1 if attempts >= 5: - assert False, f"attempt_checkpoint() failed after 5 attempts with {e}" + assert ( + False + ), f"attempt_checkpoint() failed after 5 attempts with {e}" if self.logger: - self.logger.get_logger(__name__).debug(f"[attempt_checkpoint]: {e}") + self.logger.get_logger(__name__).debug( + f"[attempt_checkpoint]: {e}" + ) time.sleep(5) shift_start = time.time() diff --git a/tune/protox/env/space/holon_space.py b/tune/protox/env/space/holon_space.py index b0fe9538..ee51a5de 100644 --- a/tune/protox/env/space/holon_space.py +++ b/tune/protox/env/space/holon_space.py @@ -45,7 +45,7 @@ def _latent_assert_check( first_d = self.to_latent([carprod_neighbors[first_drift]])[0] def eq_fn(x: torch.Tensor, y: torch.Tensor) -> bool: - return bool(torch.isclose(x, y).all().item()) + return bool(torch.isclose(x, y, atol=0.001).all().item()) assert eq_fn(zero, carprod_embeds[0]), print(zero, carprod_embeds[0]) assert eq_fn(last, carprod_embeds[-1]), print(last, carprod_embeds[-1]) @@ -368,4 +368,4 @@ def generate_plan_from_config( assert len(outputs) == 3 config_changes = list(itertools.chain(*[o[0] for o in outputs])) sql_commands = list(itertools.chain(*[o[1] for o in outputs])) - return config_changes, sql_commands \ No newline at end of file + return config_changes, sql_commands diff --git a/tune/protox/env/space/primitive/index.py b/tune/protox/env/space/primitive/index.py index 7fcc1509..ae31a486 100644 --- a/tune/protox/env/space/primitive/index.py +++ b/tune/protox/env/space/primitive/index.py @@ -54,7 +54,9 @@ def construct_md( raw_repr=None, bias=0.0, ) - assert ia.get_index_name() == idx_name, f"ia.get_index_name()={ia.get_index_name()} but idx_name={idx_name}" + assert ( + ia.get_index_name() == idx_name + ), f"ia.get_index_name()={ia.get_index_name()} but idx_name={idx_name}" return ia def sql(self, add: bool, allow_fail: bool = False) -> str: @@ -83,7 +85,7 @@ def get_index_name(self): if self not in IndexAction.index_name_map: IndexAction.index_name_map[self] = f"index{IndexAction.index_name_counter}" IndexAction.index_name_counter += 1 - + return IndexAction.index_name_map[self] # This equality/hash mechanism is purely based off of index identity. diff --git a/tune/protox/env/space/state/metric.py b/tune/protox/env/space/state/metric.py index 099fde14..7fec068e 100644 --- a/tune/protox/env/space/state/metric.py +++ b/tune/protox/env/space/state/metric.py @@ -129,7 +129,11 @@ def require_metrics(self) -> bool: return True def __init__( - self, dbgym_cfg: DBGymConfig, spaces: Mapping[str, spaces.Space[Any]], tables: list[str], seed: int + self, + dbgym_cfg: DBGymConfig, + spaces: Mapping[str, spaces.Space[Any]], + tables: list[str], + seed: int, ) -> None: self.dbgym_cfg = dbgym_cfg self.tables = tables @@ -152,7 +156,9 @@ def __init__( self.internal_spaces[metric] = Box(low=-np.inf, high=np.inf) super().__init__(self.internal_spaces, seed) - def check_benchbase(self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path]) -> bool: + def check_benchbase( + self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path] + ) -> bool: assert results_dpath is not None assert Path(results_dpath).exists() metric_files = [f for f in Path(results_dpath).rglob("*metrics.json")] @@ -182,8 +188,12 @@ def check_benchbase(self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path initial_data = initial_metrics[key] final_data = final_metrics[key] if spec["filter_db"]: - initial_data = [d for d in initial_data if d["datname"] == DBGYM_POSTGRES_DBNAME] - final_data = [d for d in final_data if d["datname"] == DBGYM_POSTGRES_DBNAME] + initial_data = [ + d for d in initial_data if d["datname"] == DBGYM_POSTGRES_DBNAME + ] + final_data = [ + d for d in final_data if d["datname"] == DBGYM_POSTGRES_DBNAME + ] elif spec["per_table"]: initial_data = sorted( [d for d in initial_data if d["relname"] in self.tables], @@ -254,8 +264,12 @@ def state_delta( initial_data = initial[key] final_data = final[key] if spec["filter_db"]: - initial_data = [d for d in initial_data if d["datname"] == DBGYM_POSTGRES_DBNAME] - final_data = [d for d in final_data if d["datname"] == DBGYM_POSTGRES_DBNAME] + initial_data = [ + d for d in initial_data if d["datname"] == DBGYM_POSTGRES_DBNAME + ] + final_data = [ + d for d in final_data if d["datname"] == DBGYM_POSTGRES_DBNAME + ] elif spec["per_table"]: initial_data = sorted( [d for d in initial_data if d["relname"] in self.tables], diff --git a/tune/protox/env/space/state/space.py b/tune/protox/env/space/state/space.py index 8119818b..7423fc30 100644 --- a/tune/protox/env/space/state/space.py +++ b/tune/protox/env/space/state/space.py @@ -14,7 +14,9 @@ def require_metrics(self) -> bool: pass @abstractmethod - def check_benchbase(self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path]) -> bool: + def check_benchbase( + self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path] + ) -> bool: pass @abstractmethod diff --git a/tune/protox/env/space/state/structure.py b/tune/protox/env/space/state/structure.py index df681a2d..04dbffdd 100644 --- a/tune/protox/env/space/state/structure.py +++ b/tune/protox/env/space/state/structure.py @@ -38,10 +38,12 @@ def __init__( } else: self.internal_spaces = { - k: gym.spaces.Box(low=-np.inf, high=np.inf, shape=(s.critic_dim(),)) - if s.uses_embed() else s + k: ( + gym.spaces.Box(low=-np.inf, high=np.inf, shape=(s.critic_dim(),)) + if s.uses_embed() + else s + ) for k, s in action_space.get_spaces() - } self.internal_spaces.update(spaces) @@ -50,7 +52,9 @@ def __init__( def require_metrics(self) -> bool: return False - def check_benchbase(self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path]) -> bool: + def check_benchbase( + self, dbgym_cfg: DBGymConfig, results_dpath: Union[str, Path] + ) -> bool: # We don't use benchbase metrics anyways. return True diff --git a/tune/protox/env/util/execute.py b/tune/protox/env/util/execute.py index f991c257..6ec5d695 100644 --- a/tune/protox/env/util/execute.py +++ b/tune/protox/env/util/execute.py @@ -85,7 +85,9 @@ def _acquire_metrics_around_query( if query_timeout > 0: _force_statement_timeout(connection, query_timeout * 1000) else: - assert query_timeout == 0, f"Setting query_timeout to 0 indicates \"timeout\". However, setting query_timeout ({query_timeout}) < 0 is a bug." + assert ( + query_timeout == 0 + ), f'Setting query_timeout to 0 indicates "timeout". However, setting query_timeout ({query_timeout}) < 0 is a bug.' qid_runtime, did_time_out, explain_data = _time_query( logger, prefix, connection, query, query_timeout diff --git a/tune/protox/env/util/pg_conn.py b/tune/protox/env/util/pg_conn.py index b28a75bb..233b49bc 100644 --- a/tune/protox/env/util/pg_conn.py +++ b/tune/protox/env/util/pg_conn.py @@ -5,6 +5,7 @@ create dbdata. util.pg provides helpers used by *both* of the above files (as well as other files). """ + import os import shutil import threading @@ -14,13 +15,18 @@ import psutil import psycopg +import yaml from plumbum import local from psycopg.errors import ProgramLimitExceeded, QueryCanceled -import yaml -from tune.protox.env.logger import Logger, time_record from misc.utils import DBGymConfig, link_result, open_and_save, parent_dpath_of_path -from util.pg import DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES +from tune.protox.env.logger import Logger, time_record +from util.pg import ( + DBGYM_POSTGRES_DBNAME, + DBGYM_POSTGRES_PASS, + DBGYM_POSTGRES_USER, + SHARED_PRELOAD_LIBRARIES, +) class PostgresConn: @@ -54,7 +60,9 @@ def __init__( # checkpoint_dbdata_snapshot_fpath is the .tgz snapshot that represents the current # state of the database as it is being tuned. It is generated while tuning and is # discarded once tuning is completed. - self.checkpoint_dbdata_snapshot_fpath = dbgym_cfg.dbgym_tmp_path / "checkpoint_dbdata.tgz" + self.checkpoint_dbdata_snapshot_fpath = ( + dbgym_cfg.dbgym_tmp_path / "checkpoint_dbdata.tgz" + ) # dbdata_parent_dpath is the parent directory of the dbdata that is *actively being tuned*. # Setting this lets us control the hardware device dbdata is built on (e.g. HDD vs. SSD). self.dbdata_parent_dpath = dbdata_parent_dpath @@ -79,13 +87,16 @@ def disconnect(self) -> None: self._conn = None def move_log(self) -> None: - pglog_fpath = self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log" - pglog_this_step_fpath = self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log.{self.log_step}" + pglog_fpath = ( + self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + / f"pg{self.pgport}.log" + ) + pglog_this_step_fpath = ( + self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + / f"pg{self.pgport}.log.{self.log_step}" + ) if pglog_fpath.exists(): - shutil.move( - pglog_fpath, - pglog_this_step_fpath - ) + shutil.move(pglog_fpath, pglog_this_step_fpath) self.log_step += 1 @time_record("shutdown") @@ -134,11 +145,16 @@ def start_with_changes( if SHARED_PRELOAD_LIBRARIES: # This way of doing it works for both single or multiple libraries. An example of a way # that *doesn't* work is `f"shared_preload_libraries='"{SHARED_PRELOAD_LIBRARIES}"'"` - conf_changes.append(f"shared_preload_libraries='{SHARED_PRELOAD_LIBRARIES}'") + conf_changes.append( + f"shared_preload_libraries='{SHARED_PRELOAD_LIBRARIES}'" + ) dbdata_auto_conf_path = self.dbdata_dpath / "postgresql.auto.conf" with open(dbdata_auto_conf_path, "w") as f: f.write("\n".join(conf_changes)) - save_auto_conf_path = self.dbgym_cfg.cur_task_runs_data_path(".", mkdir=True) / "postgresql.auto.conf" + save_auto_conf_path = ( + self.dbgym_cfg.cur_task_runs_data_path(".", mkdir=True) + / "postgresql.auto.conf" + ) local["cp"][dbdata_auto_conf_path, save_auto_conf_path].run() link_result(self.dbgym_cfg, save_auto_conf_path) @@ -177,7 +193,8 @@ def start_with_changes( "-l", # We log to pg{self.pgport}.log instead of pg.log so that different PostgresConn objects # don't all try to write to the same file. - self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) / f"pg{self.pgport}.log", + self.dbgym_cfg.cur_task_runs_artifacts_path(mkdir=True) + / f"pg{self.pgport}.log", "start", ].run(retcode=None) @@ -227,7 +244,7 @@ def start_with_changes( # don't crash if enable_boot is off and the file doesn't exist. with open_and_save(self.dbgym_cfg, self.boot_config_fpath) as f: boot_config = yaml.safe_load(f) - + self._set_up_boot( boot_config["intelligent_cache"], boot_config["early_stop"], @@ -245,7 +262,17 @@ def start_with_changes( return True - def _set_up_boot(self, intelligent_cache: bool, early_stop: bool, seq_sample: bool, seq_sample_pct: int, seq_sample_seed: int, mu_hyp_opt: float, mu_hyp_time: int, mu_hyp_stdev: float): + def _set_up_boot( + self, + intelligent_cache: bool, + early_stop: bool, + seq_sample: bool, + seq_sample_pct: int, + seq_sample_seed: int, + mu_hyp_opt: float, + mu_hyp_time: int, + mu_hyp_stdev: float, + ): """ Sets up Boot on the currently running Postgres instances. Uses instance vars of PostgresConn for configuration. @@ -330,7 +357,7 @@ def cancel_fn(conn_str: str) -> None: self.disconnect() return 0, None - + def restore_pristine_snapshot(self): self._restore_snapshot(self.pristine_dbdata_snapshot_fpath) @@ -339,7 +366,8 @@ def restore_checkpointed_snapshot(self): @time_record("restore") def _restore_snapshot( - self, dbdata_snapshot_path: Path, + self, + dbdata_snapshot_path: Path, ) -> bool: self.shutdown_postgres() @@ -349,7 +377,12 @@ def _restore_snapshot( # Strip the "dbdata" so we can implant directly into the target dbdata_dpath. assert dbdata_snapshot_path.exists() local["tar"][ - "xf", dbdata_snapshot_path, "-C", self.dbdata_dpath, "--strip-components", "1" + "xf", + dbdata_snapshot_path, + "-C", + self.dbdata_dpath, + "--strip-components", + "1", ].run() # Imprint the required port. ( diff --git a/tune/protox/env/workload.py b/tune/protox/env/workload.py index 5d646f54..f56b931b 100644 --- a/tune/protox/env/workload.py +++ b/tune/protox/env/workload.py @@ -1,11 +1,12 @@ -import math import copy import json +import math import shutil +import tempfile import time from pathlib import Path from typing import Any, Optional, Tuple, Union, cast -import tempfile + import numpy as np import pglast # type: ignore from plumbum import local @@ -68,7 +69,10 @@ def _crunch( pid: Optional[int], query_spec: QuerySpec, ) -> None: - assert all(sql[1].exists() and not sql[1].is_symlink() and sql[1].is_absolute() for sql in sqls), f"sqls ({sqls}) should only contain existent real absolute paths." + assert all( + sql[1].exists() and not sql[1].is_symlink() and sql[1].is_absolute() + for sql in sqls + ), f"sqls ({sqls}) should only contain existent real absolute paths." do_tbl_include_subsets_prune = query_spec["tbl_include_subsets_prune"] self.order = [] self.queries = QueryMap({}) @@ -329,7 +333,9 @@ def max_indexable(self) -> int: return max([len(cols) for _, cols in self.query_usages.items()]) @staticmethod - def compute_total_workload_runtime(qid_runtime_data: dict[str, BestQueryRun]) -> float: + def compute_total_workload_runtime( + qid_runtime_data: dict[str, BestQueryRun] + ) -> float: return sum(best_run.runtime for best_run in qid_runtime_data.values()) / 1.0e6 @time_record("execute") @@ -379,7 +385,7 @@ def execute_workload( for action in actions ], ) - + # Figure out workload to execute. if workload_qdir is not None and workload_qdir[0] is not None: # Load actual queries to execute. @@ -418,7 +424,7 @@ def execute_workload( for qidx, (sql_type, query) in enumerate(queries): assert sql_type != QueryType.UNKNOWN if sql_type != QueryType.SELECT: - # This is a sanity check because any OLTP workload should be run through benchbase, and any OLAP workload should not have INS_UPD_DEL queries. + # This is a sanity check because any OLTP workload should be run through benchbase, and any OLAP workload should not have INS_UPD_DEL queries. assert sql_type != QueryType.INS_UPD_DEL pg_conn.conn().execute(query) continue @@ -443,7 +449,9 @@ def execute_workload( if r[2] not in [rr[2] for rr in runs]: runs.append(r) - target_pqt = query_timeout if query_timeout else this_execution_workload_timeout + target_pqt = ( + query_timeout if query_timeout else this_execution_workload_timeout + ) skip_execute = False if ( reset_metrics is not None @@ -468,7 +476,12 @@ def execute_workload( connection=pg_conn.conn(), runs=runs, query=query, - query_timeout=min(target_pqt, this_execution_workload_timeout - Workload.compute_total_workload_runtime(qid_runtime_data) + 1), + query_timeout=min( + target_pqt, + this_execution_workload_timeout + - Workload.compute_total_workload_runtime(qid_runtime_data) + + 1, + ), logger=self.logger, sysknobs=sysknobs, observation_space=observation_space, @@ -490,9 +503,12 @@ def execute_workload( assert best_run.runtime qid_runtime_data[qid] = best_run - if Workload.compute_total_workload_runtime(qid_runtime_data) > this_execution_workload_timeout: + if ( + Workload.compute_total_workload_runtime(qid_runtime_data) + > this_execution_workload_timeout + ): # We need to undo any potential statements after the timed out query. - for st, rq in queries[qidx+1:]: + for st, rq in queries[qidx + 1 :]: if st != QueryType.SELECT: # This is a sanity check because any OLTP workload should be run through benchbase, and any OLAP workload should not have INS_UPD_DEL queries. If we do have INS_UPD_DEL queries, our "undo" logic will likely have to change. assert st != QueryType.INS_UPD_DEL @@ -573,7 +589,9 @@ def flatten(d: dict[str, Any]) -> dict[str, Any]: assert best_run and best_run.runtime and best_run.query_run rtime = best_run.runtime pfx = best_run.query_run.prefix - f.write(f"{i+1},{qid},{start},{rtime},{best_run.timed_out},0,{pfx}\n") + f.write( + f"{i+1},{qid},{start},{rtime},{best_run.timed_out},0,{pfx}\n" + ) start += rtime / 1e6 # Write a penalty term if needed. @@ -581,7 +599,8 @@ def flatten(d: dict[str, Any]) -> dict[str, Any]: if workload_timed_out and self.workload_timeout_penalty > 1: # Get the penalty. penalty = ( - this_execution_workload_timeout * self.workload_timeout_penalty - Workload.compute_total_workload_runtime(qid_runtime_data) + this_execution_workload_timeout * self.workload_timeout_penalty + - Workload.compute_total_workload_runtime(qid_runtime_data) ) penalty = (penalty + 1.05) * 1e6 if not first else penalty * 1e6 elif workload_timed_out and not first: @@ -592,7 +611,9 @@ def flatten(d: dict[str, Any]) -> dict[str, Any]: f.write(f"{len(self.order)},P,{time.time()},{penalty},,0,PENALTY\n") # Get all the timeouts. - num_timed_out_queries = sum([1 if best_run.timed_out else 0 for _, best_run in qid_runtime_data.items()]) + num_timed_out_queries = sum( + [1 if best_run.timed_out else 0 for _, best_run in qid_runtime_data.items()] + ) return num_timed_out_queries, workload_timed_out, qid_runtime_data @time_record("execute") @@ -637,8 +658,16 @@ def execute( # Generate a unique temporary directory to store results in. results_dpath = Path(tempfile.mkdtemp()) - print(results_dpath.is_dir(), results_dpath.exists(), not any(results_dpath.iterdir())) - assert results_dpath.is_dir() and results_dpath.exists() and not any(results_dpath.iterdir()), "results_dpath should be existent and empty since mkdtemp should guarantee a unique dir." + print( + results_dpath.is_dir(), + results_dpath.exists(), + not any(results_dpath.iterdir()), + ) + assert ( + results_dpath.is_dir() + and results_dpath.exists() + and not any(results_dpath.iterdir()) + ), "results_dpath should be existent and empty since mkdtemp should guarantee a unique dir." if self.benchbase: # Execute benchbase if specified. @@ -646,19 +675,21 @@ def execute( # We can only create a state if we succeeded. success = observation_space.check_benchbase(self.dbgym_cfg, results_dpath) else: - num_timed_out_queries, did_workload_time_out, query_metric_data = self.execute_workload( - pg_conn, - actions=actions, - variation_names=variation_names, - results_dpath=results_dpath, - observation_space=observation_space, - action_space=action_space, - reset_metrics=reset_metrics, - override_workload_timeout=self.workload_timeout, - query_timeout=query_timeout, - workload_qdir=None, - blocklist=[], - first=first, + num_timed_out_queries, did_workload_time_out, query_metric_data = ( + self.execute_workload( + pg_conn, + actions=actions, + variation_names=variation_names, + results_dpath=results_dpath, + observation_space=observation_space, + action_space=action_space, + reset_metrics=reset_metrics, + override_workload_timeout=self.workload_timeout, + query_timeout=query_timeout, + workload_qdir=None, + blocklist=[], + first=first, + ) ) did_anything_time_out = num_timed_out_queries > 0 or did_workload_time_out success = True @@ -673,4 +704,11 @@ def execute( self.logger.get_logger(__name__).info( f"Benchmark iteration with metric {metric} (reward: {reward}) (did_anything_timeout: {did_anything_time_out})" ) - return success, metric, reward, results_dpath, did_anything_time_out, query_metric_data + return ( + success, + metric, + reward, + results_dpath, + did_anything_time_out, + query_metric_data, + ) diff --git a/tune/protox/tests/test_index_space.py b/tune/protox/tests/test_index_space.py index 73d72c5c..02225649 100644 --- a/tune/protox/tests/test_index_space.py +++ b/tune/protox/tests/test_index_space.py @@ -1,5 +1,6 @@ import unittest from pathlib import Path + import numpy as np import yaml @@ -11,7 +12,9 @@ class IndexSpaceTests(unittest.TestCase): @staticmethod def load( - config_path=Path("tune/protox/tests/unittest_benchmark_configs/unittest_tpch.yaml").resolve(), + config_path=Path( + "tune/protox/tests/unittest_benchmark_configs/unittest_tpch.yaml" + ).resolve(), aux_type=True, aux_include=True, ): diff --git a/tune/protox/tests/test_workload.py b/tune/protox/tests/test_workload.py index f31ac71b..fb46fea3 100644 --- a/tune/protox/tests/test_workload.py +++ b/tune/protox/tests/test_workload.py @@ -1,10 +1,11 @@ -import yaml import json import unittest from pathlib import Path -from tune.protox.env.workload import Workload +import yaml + from tune.protox.env.space.primitive_space import IndexSpace +from tune.protox.env.workload import Workload class WorkloadTests(unittest.TestCase): @@ -25,7 +26,7 @@ def load(config_file: str, workload_path: Path): workload_path=workload_path, pid=None, workload_timeout=0, - workload_timeout_penalty=1., + workload_timeout_penalty=1.0, logger=None, ) @@ -51,46 +52,48 @@ def diff_classmapping(self, ref, target): def test_tpch(self): with open("tune/protox/tests/unittest_ref_models/ref_tpch_model.txt", "r") as f: ref = json.load(f)["class_mapping"] - ref = { - (v["relname"], v["ord_column"]): int(k) - for k, v in ref.items() - } + ref = {(v["relname"], v["ord_column"]): int(k) for k, v in ref.items()} - w, i = WorkloadTests.load("tune/protox/tests/unittest_benchmark_configs/unittest_tpch.yaml", Path("tune/protox/tests/unittest_tpch_dir").resolve()) + w, i = WorkloadTests.load( + "tune/protox/tests/unittest_benchmark_configs/unittest_tpch.yaml", + Path("tune/protox/tests/unittest_tpch_dir").resolve(), + ) self.assertEqual(i.class_mapping, ref) def test_job(self): # don't call open_and_save() because this is a unittest - with open("tune/protox/tests/unittest_ref_models/ref_job_full_model.txt", "r") as f: + with open( + "tune/protox/tests/unittest_ref_models/ref_job_full_model.txt", "r" + ) as f: ref = json.load(f)["class_mapping"] - ref = { - (v["relname"], v["ord_column"]): int(k) - for k, v in ref.items() - } + ref = {(v["relname"], v["ord_column"]): int(k) for k, v in ref.items()} - w, i = WorkloadTests.load("tune/protox/tests/unittest_benchmark_configs/unittest_job_full.yaml", Path("tune/protox/tests/unittest_job_full_dir").resolve()) + w, i = WorkloadTests.load( + "tune/protox/tests/unittest_benchmark_configs/unittest_job_full.yaml", + Path("tune/protox/tests/unittest_job_full_dir").resolve(), + ) self.assertEqual(i.class_mapping, ref) def test_dsb(self): # don't call open_and_save() because this is a unittest with open("tune/protox/tests/unittest_ref_models/ref_dsb_model.txt", "r") as f: ref = json.load(f)["class_mapping"] - ref = { - (v["relname"], v["ord_column"]): int(k) - for k, v in ref.items() - } + ref = {(v["relname"], v["ord_column"]): int(k) for k, v in ref.items()} - w, i = WorkloadTests.load("tune/protox/tests/unittest_benchmark_configs/unittest_dsb.yaml", Path("tune/protox/tests/unittest_dsb_dir").resolve()) + w, i = WorkloadTests.load( + "tune/protox/tests/unittest_benchmark_configs/unittest_dsb.yaml", + Path("tune/protox/tests/unittest_dsb_dir").resolve(), + ) self.diff_classmapping(ref, i.class_mapping) def test_tpcc(self): # don't call open_and_save() because this is a unittest with open("tune/protox/tests/unittest_ref_models/ref_tpcc_model.txt", "r") as f: ref = json.load(f)["class_mapping"] - ref = { - (v["relname"], v["ord_column"]): int(k) - for k, v in ref.items() - } + ref = {(v["relname"], v["ord_column"]): int(k) for k, v in ref.items()} - w, i = WorkloadTests.load("tune/protox/tests/unittest_benchmark_configs/unittest_tpcc.yaml", Path("tune/protox/tests/unittest_tpcc_dir").resolve()) + w, i = WorkloadTests.load( + "tune/protox/tests/unittest_benchmark_configs/unittest_tpcc.yaml", + Path("tune/protox/tests/unittest_tpcc_dir").resolve(), + ) self.assertEqual(i.class_mapping, ref) diff --git a/util/pg.py b/util/pg.py index ee45772d..8c5f1e78 100644 --- a/util/pg.py +++ b/util/pg.py @@ -1,9 +1,10 @@ from pathlib import Path from typing import List + import pglast -from sqlalchemy import Connection, Engine, text, create_engine -from sqlalchemy.engine import CursorResult import psycopg +from sqlalchemy import Connection, Engine, create_engine, text +from sqlalchemy.engine import CursorResult from misc.utils import DBGymConfig, open_and_save @@ -39,7 +40,7 @@ def sql_file_execute(dbgym_cfg: DBGymConfig, conn: Connection, filepath: Path) - # The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres # at the same time. In this situation, they need to have different ports -def get_connstr(pgport: int=DEFAULT_POSTGRES_PORT, use_psycopg=True) -> str: +def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg=True) -> str: connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" # use_psycopg means whether or not we use the psycopg.connect() function # counterintuively, you *don't* need psycopg in the connection string if you *are* @@ -48,12 +49,10 @@ def get_connstr(pgport: int=DEFAULT_POSTGRES_PORT, use_psycopg=True) -> str: return connstr_prefix + "://" + connstr_suffix -def create_conn(pgport: int=DEFAULT_POSTGRES_PORT, use_psycopg=True) -> Connection: +def create_conn(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg=True) -> Connection: connstr = get_connstr(use_psycopg=use_psycopg, pgport=pgport) if use_psycopg: - return psycopg.connect( - connstr, autocommit=True, prepare_threshold=None - ) + return psycopg.connect(connstr, autocommit=True, prepare_threshold=None) else: engine: Engine = create_engine( connstr,