From 40e90af3b06bede4e170220307a42f84c602dc1f Mon Sep 17 00:00:00 2001 From: Benedikt schesch Date: Mon, 25 Sep 2023 15:34:39 -0700 Subject: [PATCH] Further changes --- src/python/cache_utils.py | 5 - src/python/merge_differ.py | 2 - src/python/merge_tester.py | 89 ++++++---- src/python/merge_tools_comparator.py | 163 +++++++++--------- .../resolve-import-conflicts-in-file.py | 19 +- src/scripts/merge_tools/spork.sh | 2 +- src/scripts/run_repo_tests.sh | 5 +- 7 files changed, 145 insertions(+), 140 deletions(-) diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py index 2ef4864ca6..4b8cea5855 100755 --- a/src/python/cache_utils.py +++ b/src/python/cache_utils.py @@ -35,7 +35,6 @@ def set_in_cache( cache_value: Union[str, dict, None], repo_slug: str, cache_directory: Path, - overwrite: bool = True, acquire_lock: bool = True, ) -> None: """Puts an entry in the cache, then writes the cache to disk. @@ -45,16 +44,12 @@ def set_in_cache( cache_value (dict): The value to write. repo_slug (str): The slug of the repository, which is "owner/reponame". cache_directory (Path): The path to the cache directory. - overwrite (bool, optional) = True: Whether to overwrite an existing cache entry. - If False, attempting to overwrite an existing cache entry throws an exception. """ if acquire_lock: lock = get_cache_lock(repo_slug, cache_directory) lock.acquire() cache_path = get_cache_path(repo_slug, cache_directory) cache = load_cache(repo_slug, cache_directory) - if cache_key in cache and not overwrite: - raise ValueError("Cache key already exists") cache[cache_key] = cache_value output = json.dumps(cache, indent=4) with open(cache_path, "w") as f: diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py index 574eddef00..cd8e3a60fa 100755 --- a/src/python/merge_differ.py +++ b/src/python/merge_differ.py @@ -118,8 +118,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None: command.append(str(repo2.repo_path)) with open(diff_file, "w") as f: subprocess.run(command, stdout=f, stderr=f) - del repo2 - del repo1 def diff_file_name(sha1: str, sha2: str) -> Path: diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index d0c51a8701..cf2a396148 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -54,10 +54,10 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: print("merge_tester: Started ", repo_slug, merge_data["left"], merge_data["right"]) merge_data["parents pass"] = False - # TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.) for branch in ["left", "right"]: + commit_sha = merge_data[branch] repo = Repository(repo_slug, cache_directory=cache_directory) - repo.checkout(merge_data[branch]) + repo.checkout(commit_sha) tree_fingerprint = repo.compute_tree_fingerprint() if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]: raise Exception( @@ -112,6 +112,54 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: return merge_data +def build_arguments( + args: argparse.Namespace, + repo_slug: str, +) -> list: + """Builds the arguments for the merge_tester function. + Args: + args (argparse.Namespace): The arguments of the script. + repo_slug (str): The slug of the repository. + Returns: + list: The arguments for the merge_tester function. + """ + merge_list_file = Path( + os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") + ) + output_file = Path( + os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") + ) + if not merge_list_file.exists(): + print( + "merge_tester:", + repo_slug, + "does not have a list of merges. Missing file: ", + merge_list_file, + ) + return [] + + if output_file.exists(): + print( + "merge_tester: Skipping", + repo_slug, + "because it is already computed.", + ) + return [] + try: + merges = pd.read_csv(merge_list_file, header=0, index_col="idx") + except pd.errors.EmptyDataError: + print( + "merge_tester: Skipping", + repo_slug, + "because it does not contain any merges.", + ) + return [] + return [ + (repo_slug, merge_data, Path(args.cache_dir)) + for _, merge_data in merges.iterrows() + ] + + def main(): # pylint: disable=too-many-locals,too-many-statements """Main function""" print("merge_tester: Start") @@ -126,46 +174,11 @@ def main(): # pylint: disable=too-many-locals,too-many-statements repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx") - # TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test"). print("merge_tester: Started collecting merges to test") merge_tester_arguments = [] for _, repository_data in tqdm(repos.iterrows(), total=len(repos)): repo_slug = repository_data["repository"] - merge_list_file = Path( - os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") - ) - output_file = Path( - os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") - ) - if not merge_list_file.exists(): - print( - "merge_tester:", - repo_slug, - "does not have a list of merges. Missing file: ", - merge_list_file, - ) - continue - - if output_file.exists(): - print( - "merge_tester: Skipping", - repo_slug, - "because it is already computed.", - ) - continue - try: - merges = pd.read_csv(merge_list_file, header=0, index_col="idx") - except pd.errors.EmptyDataError: - print( - "merge_tester: Skipping", - repo_slug, - "because it does not contain any merges.", - ) - continue - merge_tester_arguments += [ - (repo_slug, merge_data, Path(args.cache_dir)) - for _, merge_data in merges.iterrows() - ] + merge_tester_arguments += build_arguments(args, repo_slug) # Shuffle input to reduce cache contention random.seed(42) diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py index 4b06bb7295..d149730d9d 100755 --- a/src/python/merge_tools_comparator.py +++ b/src/python/merge_tools_comparator.py @@ -69,8 +69,18 @@ def merger( # pylint: disable=too-many-locals merge_data["right"], merge_tool.name, ) - cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []} - # TODO: I suggest abstracting out the body of this loop as a function. + cache_data[merge_tool.name] = {"results": [], "run_time": []} + log_file: Path = ( + cache_directory + / "merge_logs" + / slug_repo_name(repo_slug) + / merge_data["left"] + / merge_data["right"] + / (merge_tool.name + f".log") + ) + log_file.parent.mkdir(parents=True, exist_ok=True) + log_file.unlink(missing_ok=True) + for i in range(N_REPETITIONS): repo = Repository(repo_slug, cache_directory=cache_directory) ( @@ -86,48 +96,25 @@ def merger( # pylint: disable=too-many-locals right_commit=merge_data["right"], timeout=TIMEOUT_MERGING, ) - log_file: Path = ( - cache_directory - / "merge_logs" - / slug_repo_name(repo_slug) - / merge_data["left"] - / merge_data["right"] - / (merge_tool.name + f".log") - ) - log_file.parent.mkdir(parents=True, exist_ok=True) - if log_file.exists(): - log_file.unlink() - with open(log_file, "w") as f: - f.write(explanation) + if i == 0: # Only write the log file once. + with open(log_file, "w") as f: + f.write(explanation) + cache_data[merge_tool.name]["results"].append(merge_status.name) cache_data[merge_tool.name]["log_file"] = str(log_file) cache_data[merge_tool.name]["run_time"].append(run_time) if "merge_fingerprint" not in cache_data[merge_tool.name]: cache_data[merge_tool.name]["merge_fingerprint"] = merge_fingerprint - else: - if ( - cache_data[merge_tool.name]["merge_fingerprint"] - != merge_fingerprint - ): - raise Exception( - "merge_tools_comparator: Merge fingerprint mismatch", - i, - repo_slug, - merge_data["left"], - merge_data["right"], - merge_tool.name, - merge_status, - cache_data[merge_tool.name]["merge_fingerprint"], - merge_fingerprint, - cache_data, - ) - - if "left_tree_fingerprint" not in cache_data: cache_data["left_tree_fingerprint"] = left_fingerprint cache_data["right_tree_fingerprint"] = right_fingerprint else: + assert ( + cache_data[merge_tool.name]["merge_fingerprint"] + == merge_fingerprint + ) assert cache_data["left_tree_fingerprint"] == left_fingerprint assert cache_data["right_tree_fingerprint"] == right_fingerprint + set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory) return cache_data @@ -156,6 +143,64 @@ def check_if_two_merges_differ(cache_data: dict) -> bool: return False +def build_merge_arguments(args: argparse.Namespace, repo_slug: str): + """ + Creates the arguments for the merger function. + Args: + args (argparse.Namespace): The arguments to the script. + repo_slug (str): The repository slug. + Returns: + list: A list of arguments for the merger function. + """ + merge_list_file = Path( + os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") + ) + output_file = Path( + os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") + ) + if not merge_list_file.exists(): + raise Exception( + "merge_tools_comparator:", + repo_slug, + "does not have a list of merges. Missing file: ", + merge_list_file, + ) + + if output_file.exists(): + print( + "merge_tools_comparator: Skipping", + repo_slug, + "because it is already computed.", + ) + return [] + + merges = pd.read_csv( + merge_list_file, + names=["idx", "branch_name", "merge", "left", "right", "notes"], + dtype={ + "idx": int, + "branch_name": str, + "merge": str, + "left": str, + "right": str, + "notes": str, + }, + header=0, + index_col="idx", + ) + merges["notes"].replace(np.nan, "", inplace=True) + + if args.only_trivial_merges: + merges = merges[merges["notes"].str.contains("a parent is the base")] + elif not args.include_trivial_merges: + merges = merges[~merges["notes"].str.contains("a parent is the base")] + arguments = [ + (repo_slug, merge_data, Path(args.cache_dir)) + for _, merge_data in merges.iterrows() + ] + return arguments + + if __name__ == "__main__": print("merge_tools_comparator: Start") parser = argparse.ArgumentParser() @@ -174,56 +219,8 @@ def check_if_two_merges_differ(cache_data: dict) -> bool: print("merge_tools_comparator: Constructing Inputs") merger_arguments = [] for _, repository_data in tqdm(repos.iterrows(), total=len(repos)): - # TODO: I suggest abstracting out the body of this loop as a function. - merges_repo = [] repo_slug = repository_data["repository"] - merge_list_file = Path( - os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") - ) - output_file = Path( - os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") - ) - if not merge_list_file.exists(): - raise Exception( - "merge_tools_comparator:", - repo_slug, - "does not have a list of merges. Missing file: ", - merge_list_file, - ) - - if output_file.exists(): - print( - "merge_tools_comparator: Skipping", - repo_slug, - "because it is already computed.", - ) - continue - - merges = pd.read_csv( - merge_list_file, - names=["idx", "branch_name", "merge", "left", "right", "notes"], - dtype={ - "idx": int, - "branch_name": str, - "merge": str, - "left": str, - "right": str, - "notes": str, - }, - header=0, - index_col="idx", - ) - merges["notes"].replace(np.nan, "", inplace=True) - - if args.only_trivial_merges: - merges = merges[merges["notes"].str.contains("a parent is the base")] - elif not args.include_trivial_merges: - merges = merges[~merges["notes"].str.contains("a parent is the base")] - - merger_arguments += [ - (repo_slug, merge_data, Path(args.cache_dir)) - for _, merge_data in merges.iterrows() - ] + merger_arguments += build_merge_arguments(args, repo_slug) # Shuffle input to reduce cache contention random.seed(42) diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py index 864a4a8bbc..36f2750419 100755 --- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py +++ b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 # TODO: Benedikt, could you review and improve this file? I'm sure it is not perfect Python style. """Edits a file in place to remove conflict markers related to Java imports. @@ -12,6 +12,7 @@ import shutil import sys import tempfile +from typing import List, Union, Tuple if len(sys.argv) != 2: print( @@ -24,12 +25,12 @@ lines = file.readlines() -def all_import_lines(lines): +def all_import_lines(lines: List[str]) -> bool: """Return true if every line is a Java import line.""" return all(line.startswith("import ") for line in lines) -def merge(base, parent1, parent2): +def merge(base, parent1: List[str], parent2: List[str]) -> Union[List[str], None]: """Given text for the base and two parents, return merged text. This currently does a simplistic merge that retains all lines in either parent. @@ -40,7 +41,7 @@ def merge(base, parent1, parent2): parent2: a list of lines Returns: - a list of lines, or None if it cannot do merging. + Union[List[str], None]: If the merge is successful, returns a list of lines. """ if ( @@ -53,7 +54,9 @@ def merge(base, parent1, parent2): return None -def looking_at_conflict(start_index, lines): # pylint: disable=R0911 +def looking_at_conflict( # pylint: disable=too-many-return-statements + start_index: int, lines: List[str] +) -> Union[None, Tuple[List[str], List[str], List[str], int]]: """Tests whether the text starting at line `start_index` is the beginning of a conflict. If not, returns None. If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict) @@ -82,7 +85,7 @@ def looking_at_conflict(start_index, lines): # pylint: disable=R0911 if index == num_lines: print( "Starting at line " - + start_index + + str(start_index) + ", did not find ||||||| or ======= in " + filename ) @@ -98,7 +101,7 @@ def looking_at_conflict(start_index, lines): # pylint: disable=R0911 if index == num_lines: print( "Starting at line " - + start_index + + str(start_index) + ", did not find ======= in " + filename ) @@ -114,7 +117,7 @@ def looking_at_conflict(start_index, lines): # pylint: disable=R0911 if index == num_lines: print( "Starting at line " - + start_index + + str(start_index) + ", did not find >>>>>>> in " + filename ) diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh index 3e38375de0..7748d5f2e2 100755 --- a/src/scripts/merge_tools/spork.sh +++ b/src/scripts/merge_tools/spork.sh @@ -13,8 +13,8 @@ if [ "$#" -ne 3 ]; then exit 1 fi -# TODO: This script is an odd place to put this `killall` command. Should it be elsewhere in the experimental infrastructure? # Kill all Java processes that are running for over an hour (to avoid memory leaks). +# Spork tends to create Java processes that don't terminate even when the parent process is killed. killall -9 java --older-than 1h SCRIPT_PATH="$(dirname "$0")"; SCRIPT_PATH="$(eval "cd \"$SCRIPT_PATH\" && pwd")" diff --git a/src/scripts/run_repo_tests.sh b/src/scripts/run_repo_tests.sh index fc032b38b2..1d647d9b27 100755 --- a/src/scripts/run_repo_tests.sh +++ b/src/scripts/run_repo_tests.sh @@ -7,9 +7,8 @@ set -o nounset -# TODO: This is too agressive and will interfere with the user's other work. -# TODO: Instead, create a directory under /tmp (or, better, under /tmp/$USER) for the AST merging experiments, and clean it. -# Test side effects can be seen in the /tmp directory. +# Test side effects can be seen in the /tmp directory which are created by +# testing infrastructure. We delete them to avoid filling up the disk. # We delete all the files older than 2h and owned by the current user. find /tmp -maxdepth 1 -user "$(whoami)" -mmin +120 -exec rm -rf {} \;