From 18e99be9e80cb4c8851642ec160b2a519b7eb652 Mon Sep 17 00:00:00 2001 From: Michael Ernst Date: Tue, 9 Jul 2024 21:16:29 -0700 Subject: [PATCH] Make `merge_idx` a field of the `Repository` class, for diagnostics (#312) --- src/python/merge_analyzer.py | 50 ++++++++++++++++++++--------- src/python/merge_runtime_measure.py | 7 ++-- src/python/merge_tester.py | 21 +++++++----- src/python/replay_merge.py | 5 +++ src/python/repo.py | 6 +++- src/python/test_repo_heads.py | 1 + src/python/write_head_hashes.py | 1 + 7 files changed, 63 insertions(+), 28 deletions(-) diff --git a/src/python/merge_analyzer.py b/src/python/merge_analyzer.py index be19c3b6d3..72280525d9 100755 --- a/src/python/merge_analyzer.py +++ b/src/python/merge_analyzer.py @@ -66,6 +66,7 @@ def get_diff_files( def diff_merge_analyzer( + merge_idx: str, repo_slug: str, left_sha: str, right_sha: str, @@ -75,6 +76,7 @@ def diff_merge_analyzer( Computes the diff between two branches using git diff. Args: repo (Repository): The repository object. + merge_idx (str): The merge index, such as 42-123. repo_slug (str): The repository slug. left_sha (str): The left sha. right_sha (str): The right sha. @@ -91,6 +93,7 @@ def diff_merge_analyzer( return cache_data repo = Repository( + merge_idx, repo_slug, cache_directory=cache_dir, workdir_id=repo_slug + "/diff-" + left_sha + "-" + right_sha, @@ -149,34 +152,41 @@ def diff_merge_analyzer( def merge_analyzer( - args: Tuple[str, pd.Series, Path], + args: Tuple[str, str, pd.Series, Path], ) -> pd.Series: """ Merges two branches and returns the result. Args: - args (Tuple[str,pd.Series,Path]): A tuple containing the repo slug, + args (Tuple[str,str,pd.Series,Path]): A tuple containing the merge index, the repo slug, the merge data (which is side-effected), and the cache path. Returns: dict: A dictionary containing the merge result. """ - repo_slug, merge_data, cache_directory = args + merge_idx, repo_slug, merge_data, cache_directory = args left_sha = merge_data["left"] right_sha = merge_data["right"] - logger.info(f"merge_analyzer: Analyzing {repo_slug} {left_sha} {right_sha}") + logger.info( + f"merge_analyzer: Analyzing {merge_idx} {repo_slug} {left_sha} {right_sha}" + ) # Compute diff size in lines between left and right - cache_data = diff_merge_analyzer(repo_slug, left_sha, right_sha, cache_directory) + cache_data = diff_merge_analyzer( + merge_idx, repo_slug, left_sha, right_sha, cache_directory + ) if cache_data["diff contains java file"] in (False, None): merge_data["test merge"] = False merge_data["diff contains java file"] = False - logger.info(f"merge_analyzer: Analyzed {repo_slug} {left_sha} {right_sha}") + logger.info( + f"merge_analyzer: Analyzed {merge_idx} {repo_slug} {left_sha} {right_sha}" + ) return merge_data # Checkout left parent repo_left = Repository( + merge_idx, repo_slug, cache_directory=cache_directory, workdir_id=repo_slug + "/left-" + left_sha + "-" + right_sha, @@ -185,6 +195,7 @@ def merge_analyzer( # Checkout right parent repo_right = Repository( + merge_idx, repo_slug, cache_directory=cache_directory, workdir_id=repo_slug + "/right-" + left_sha + "-" + right_sha, @@ -214,15 +225,20 @@ def merge_analyzer( merge_data["parents pass"] and merge_data["diff contains java file"] is True ) - logger.info(f"merge_analyzer: Analyzed {repo_slug} {left_sha} {right_sha}") + logger.info( + f"merge_analyzer: Analyzed {merge_idx} {repo_slug} {left_sha} {right_sha}" + ) return merge_data -def build_merge_analyzer_arguments(args: argparse.Namespace, repo_slug: str): +def build_merge_analyzer_arguments( + repo_idx: str, args: argparse.Namespace, repo_slug: str +): """ Creates the arguments for the merger function. Args: + reo_idx (str): The repository index. args (argparse.Namespace): The arguments to the script. repo_slug (str): The repository slug. Returns: @@ -255,8 +271,8 @@ def build_merge_analyzer_arguments(args: argparse.Namespace, repo_slug: str): merges["notes"] = merges["notes"].fillna("") arguments = [ - (repo_slug, merge_data, Path(args.cache_dir)) - for _, merge_data in merges.iterrows() + (f"{repo_idx}-{idx}", repo_slug, merge_data, Path(args.cache_dir)) + for idx, merge_data in merges.iterrows() ] return arguments @@ -297,9 +313,11 @@ def plot_vertical_histogram(data, title, ax): TimeRemainingColumn(), ) as progress: task = progress.add_task("[green]Constructing Input...", total=len(repos)) - for _, repository_data in repos.iterrows(): + for repo_idx, repository_data in repos.iterrows(): repo_slug = repository_data["repository"] - merger_arguments += build_merge_analyzer_arguments(args, repo_slug) + merger_arguments += build_merge_analyzer_arguments( + repo_idx, args, repo_slug + ) progress.update(task, advance=1) # Shuffle input to reduce cache contention @@ -339,10 +357,10 @@ def plot_vertical_histogram(data, title, ax): TimeRemainingColumn(), ) as progress: task = progress.add_task("[green]Processing...", total=len(merger_arguments)) - for idx, merge_data in enumerate(merger_arguments): - repo_slug = merge_data[0] - results_data = merger_results[idx] - repo_result[repo_slug].append(merger_results[idx]) + for new_merges_idx, merge_data in enumerate(merger_arguments): + repo_slug = merge_data[1] + results_data = merger_results[new_merges_idx] + repo_result[repo_slug].append(merger_results[new_merges_idx]) n_new_analyzed += 1 if "test merge" in results_data and results_data["test merge"]: n_new_candidates_to_test += 1 diff --git a/src/python/merge_runtime_measure.py b/src/python/merge_runtime_measure.py index 3183e37a7a..52fd6ac702 100755 --- a/src/python/merge_runtime_measure.py +++ b/src/python/merge_runtime_measure.py @@ -63,7 +63,7 @@ def main(): merges = merges.sample(frac=1, random_state=42) if len(merges) > args.n_sampled_timing: merges = merges.iloc[: args.n_sampled_timing] - for idx, merge_data in merges.iterrows(): + for merge_idx, merge_data in merges.iterrows(): for merge_tool in MERGE_TOOL: left_hash, right_hash = ( merge_data["left"], @@ -75,7 +75,7 @@ def main(): ) if cache_entry is not None: - merges.at[idx, f"{merge_tool.name}_run_time"] = np.median( + merges.at[merge_idx, f"{merge_tool.name}_run_time"] = np.median( cache_entry["run_time"] # type: ignore ) else: @@ -86,6 +86,7 @@ def main(): run_times = [] for _ in range(args.n_timings): repo = Repository( + merge_idx, repo_slug, workdir_id=repo_slug + f"/merge-tester-{merge_tool.name}-" @@ -118,7 +119,7 @@ def main(): acquire_lock=False, ) - merges.at[idx, f"{merge_tool.name}_run_time"] = np.median( + merges.at[merge_idx, f"{merge_tool.name}_run_time"] = np.median( cache_entry["run_time"] # type: ignore ) out_file = args.output_dir / f"{repo_slug}.csv" diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index b0d8eae903..9577f93dbf 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -34,21 +34,25 @@ ) -def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: +def merge_tester(args: Tuple[str, str, pd.Series, Path]) -> pd.Series: """Tests a merge with each merge tool. Args: - args (Tuple[str,pd.Series,Path]): A tuple containing the repository slug, - the repository info, and the cache path. + args (Tuple[str,str,pd.Series,Path]): A tuple containing the merge index, + the repository slug, the repository info, and the cache path. Returns: pd.Series: The result of the test. """ - repo_slug, merge_data, cache_directory = args + merge_idx, repo_slug, merge_data, cache_directory = args + print( + f"merge_tester: Started {merge_idx} {repo_slug} {merge_data['left']} {merge_data['right']}" + ) logger.info( - f"merge_tester: Started {repo_slug} {merge_data['left']} {merge_data['right']}" + f"merge_tester: Started {merge_idx} {repo_slug} {merge_data['left']} {merge_data['right']}" ) while psutil.cpu_percent() > 90 or psutil.virtual_memory().percent > 85: logger.trace( "merge_tester: Waiting for CPU or memory to be available." + + merge_idx + repo_slug + merge_data["left"] + merge_data["right"] @@ -57,6 +61,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: for merge_tool in MERGE_TOOL: repo = Repository( + merge_idx, repo_slug, cache_directory=cache_directory, workdir_id=repo_slug @@ -138,8 +143,8 @@ def build_arguments( return [] merges = merges[merges["sampled for testing"]] return [ - (repo_slug, merge_data, Path(args.cache_dir)) - for _, merge_data in merges.iterrows() + (merge_idx, repo_slug, merge_data, Path(args.cache_dir)) + for merge_idx, merge_data in merges.iterrows() ] @@ -214,7 +219,7 @@ def main(): ) for idx, merge_results in enumerate(merge_tester_arguments): progress.update(task, advance=1) - repo_slug = merge_results[0] + repo_slug = merge_results[1] repo_result[repo_slug].append(merge_tester_results[idx]) n_total_merges = 0 diff --git a/src/python/replay_merge.py b/src/python/replay_merge.py index 8972a9eb81..5162509922 100755 --- a/src/python/replay_merge.py +++ b/src/python/replay_merge.py @@ -113,6 +113,7 @@ def merge_replay( ) if not (WORKDIR_DIRECTORY / workdir).exists(): repo = Repository( + merge_idx, repo_slug, cache_directory=Path("no_cache/"), workdir_id=workdir, @@ -127,6 +128,7 @@ def merge_replay( ) if not (WORKDIR_DIRECTORY / workdir).exists(): repo = Repository( + merge_idx, repo_slug, cache_directory=Path("no_cache/"), workdir_id=workdir, @@ -141,6 +143,7 @@ def merge_replay( ) if not (WORKDIR_DIRECTORY / workdir).exists(): repo = Repository( + merge_idx, repo_slug, cache_directory=Path("no_cache/"), workdir_id=workdir, @@ -164,6 +167,7 @@ def merge_replay( ) if not (WORKDIR_DIRECTORY / workdir).exists(): repo = Repository( + merge_idx, repo_slug, cache_directory=Path("no_cache/"), workdir_id=workdir, @@ -209,6 +213,7 @@ def merge_replay( continue try: repo = Repository( + merge_idx, repo_slug, cache_directory=Path("no_cache/"), workdir_id=workdir, diff --git a/src/python/repo.py b/src/python/repo.py index ff30633937..75c06fce1f 100755 --- a/src/python/repo.py +++ b/src/python/repo.py @@ -182,10 +182,13 @@ def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]: class Repository: - """A class that represents a repository.""" + """A class that represents a repository. + merge_idx is purely for diagnostic purposes. + """ def __init__( self, + merge_idx: str, repo_slug: str, cache_directory: Path = Path(""), workdir_id: str = uuid.uuid4().hex, # uuid4 is a random UID @@ -197,6 +200,7 @@ def __init__( repo_slug (str): The slug of the repository, which is "owner/reponame". cache_directory (Path): The prefix of the cache. """ + self.merge_idx = merge_idx self.repo_slug = repo_slug.lower() self.owner, self.name = self.repo_slug.split("/") self.repo_path = REPOS_PATH / repo_slug diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py index 61f4129e5e..ee6769a4cc 100755 --- a/src/python/test_repo_heads.py +++ b/src/python/test_repo_heads.py @@ -72,6 +72,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> pd.Series: # Load repo try: repo = Repository( + "HEAD", repo_slug, cache_directory=cache, workdir_id=repo_slug + "/head-" + repo_info["repository"], diff --git a/src/python/write_head_hashes.py b/src/python/write_head_hashes.py index a0243aa1ff..37a4f0e2e6 100755 --- a/src/python/write_head_hashes.py +++ b/src/python/write_head_hashes.py @@ -48,6 +48,7 @@ def get_latest_hash(args): try: logger.info("write_head_hashes " + repo_slug + " : Cloning repo") repo = Repository( + "HEAD", repo_slug, workdir_id=repo_slug + "/head-" + repo_slug, lazy_clone=False,