Skip to content

Commit

Permalink
Further changes
Browse files Browse the repository at this point in the history
  • Loading branch information
benedikt-schesch committed Sep 25, 2023
1 parent 606b16c commit 40e90af
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 140 deletions.
5 changes: 0 additions & 5 deletions src/python/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def set_in_cache(
cache_value: Union[str, dict, None],
repo_slug: str,
cache_directory: Path,
overwrite: bool = True,
acquire_lock: bool = True,
) -> None:
"""Puts an entry in the cache, then writes the cache to disk.
Expand All @@ -45,16 +44,12 @@ def set_in_cache(
cache_value (dict): The value to write.
repo_slug (str): The slug of the repository, which is "owner/reponame".
cache_directory (Path): The path to the cache directory.
overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
If False, attempting to overwrite an existing cache entry throws an exception.
"""
if acquire_lock:
lock = get_cache_lock(repo_slug, cache_directory)
lock.acquire()
cache_path = get_cache_path(repo_slug, cache_directory)
cache = load_cache(repo_slug, cache_directory)
if cache_key in cache and not overwrite:
raise ValueError("Cache key already exists")
cache[cache_key] = cache_value
output = json.dumps(cache, indent=4)
with open(cache_path, "w") as f:
Expand Down
2 changes: 0 additions & 2 deletions src/python/merge_differ.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None:
command.append(str(repo2.repo_path))
with open(diff_file, "w") as f:
subprocess.run(command, stdout=f, stderr=f)
del repo2
del repo1


def diff_file_name(sha1: str, sha2: str) -> Path:
Expand Down
89 changes: 51 additions & 38 deletions src/python/merge_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
print("merge_tester: Started ", repo_slug, merge_data["left"], merge_data["right"])

merge_data["parents pass"] = False
# TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.)
for branch in ["left", "right"]:
commit_sha = merge_data[branch]
repo = Repository(repo_slug, cache_directory=cache_directory)
repo.checkout(merge_data[branch])
repo.checkout(commit_sha)
tree_fingerprint = repo.compute_tree_fingerprint()
if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]:
raise Exception(
Expand Down Expand Up @@ -112,6 +112,54 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
return merge_data


def build_arguments(
    args: argparse.Namespace,
    repo_slug: str,
) -> list:
    """Builds the arguments for the merge_tester function.

    Args:
        args (argparse.Namespace): The arguments of the script.
        repo_slug (str): The slug of the repository, i.e. "owner/reponame".

    Returns:
        list: A list of (repo_slug, merge_data, cache_dir) tuples, one per
            merge to test; empty when the repository has no merge list, is
            already computed, or its merge list contains no merges.
    """
    # pathlib's "/" operator replaces the redundant Path(os.path.join(...)).
    merge_list_file = Path(args.merges_path) / (slug_repo_name(repo_slug) + ".csv")
    output_file = Path(args.output_dir) / (slug_repo_name(repo_slug) + ".csv")
    if not merge_list_file.exists():
        print(
            "merge_tester:",
            repo_slug,
            "does not have a list of merges. Missing file: ",
            merge_list_file,
        )
        return []

    if output_file.exists():
        print(
            "merge_tester: Skipping",
            repo_slug,
            "because it is already computed.",
        )
        return []
    try:
        merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
    except pd.errors.EmptyDataError:
        # An empty CSV is an expected condition, not an error: skip the repo.
        print(
            "merge_tester: Skipping",
            repo_slug,
            "because it does not contain any merges.",
        )
        return []
    return [
        (repo_slug, merge_data, Path(args.cache_dir))
        for _, merge_data in merges.iterrows()
    ]


def main(): # pylint: disable=too-many-locals,too-many-statements
"""Main function"""
print("merge_tester: Start")
Expand All @@ -126,46 +174,11 @@ def main(): # pylint: disable=too-many-locals,too-many-statements

repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

# TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test").
print("merge_tester: Started collecting merges to test")
merge_tester_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
repo_slug = repository_data["repository"]
merge_list_file = Path(
os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
)
output_file = Path(
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
print(
"merge_tester:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)
continue

if output_file.exists():
print(
"merge_tester: Skipping",
repo_slug,
"because it is already computed.",
)
continue
try:
merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
except pd.errors.EmptyDataError:
print(
"merge_tester: Skipping",
repo_slug,
"because it does not contain any merges.",
)
continue
merge_tester_arguments += [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
]
merge_tester_arguments += build_arguments(args, repo_slug)

# Shuffle input to reduce cache contention
random.seed(42)
Expand Down
163 changes: 80 additions & 83 deletions src/python/merge_tools_comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,18 @@ def merger( # pylint: disable=too-many-locals
merge_data["right"],
merge_tool.name,
)
cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []}
# TODO: I suggest abstracting out the body of this loop as a function.
cache_data[merge_tool.name] = {"results": [], "run_time": []}
log_file: Path = (
cache_directory
/ "merge_logs"
/ slug_repo_name(repo_slug)
/ merge_data["left"]
/ merge_data["right"]
/ (merge_tool.name + f".log")
)
log_file.parent.mkdir(parents=True, exist_ok=True)
log_file.unlink(missing_ok=True)

for i in range(N_REPETITIONS):
repo = Repository(repo_slug, cache_directory=cache_directory)
(
Expand All @@ -86,48 +96,25 @@ def merger( # pylint: disable=too-many-locals
right_commit=merge_data["right"],
timeout=TIMEOUT_MERGING,
)
log_file: Path = (
cache_directory
/ "merge_logs"
/ slug_repo_name(repo_slug)
/ merge_data["left"]
/ merge_data["right"]
/ (merge_tool.name + f".log")
)
log_file.parent.mkdir(parents=True, exist_ok=True)
if log_file.exists():
log_file.unlink()
with open(log_file, "w") as f:
f.write(explanation)
if i == 0: # Only write the log file once.
with open(log_file, "w") as f:
f.write(explanation)

cache_data[merge_tool.name]["results"].append(merge_status.name)
cache_data[merge_tool.name]["log_file"] = str(log_file)
cache_data[merge_tool.name]["run_time"].append(run_time)
if "merge_fingerprint" not in cache_data[merge_tool.name]:
cache_data[merge_tool.name]["merge_fingerprint"] = merge_fingerprint
else:
if (
cache_data[merge_tool.name]["merge_fingerprint"]
!= merge_fingerprint
):
raise Exception(
"merge_tools_comparator: Merge fingerprint mismatch",
i,
repo_slug,
merge_data["left"],
merge_data["right"],
merge_tool.name,
merge_status,
cache_data[merge_tool.name]["merge_fingerprint"],
merge_fingerprint,
cache_data,
)

if "left_tree_fingerprint" not in cache_data:
cache_data["left_tree_fingerprint"] = left_fingerprint
cache_data["right_tree_fingerprint"] = right_fingerprint
else:
assert (
cache_data[merge_tool.name]["merge_fingerprint"]
== merge_fingerprint
)
assert cache_data["left_tree_fingerprint"] == left_fingerprint
assert cache_data["right_tree_fingerprint"] == right_fingerprint

set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory)
return cache_data

Expand Down Expand Up @@ -156,6 +143,64 @@ def check_if_two_merges_differ(cache_data: dict) -> bool:
return False


def build_merge_arguments(args: argparse.Namespace, repo_slug: str) -> list:
    """
    Creates the arguments for the merger function.

    Args:
        args (argparse.Namespace): The arguments to the script.
        repo_slug (str): The repository slug, i.e. "owner/reponame".

    Returns:
        list: A list of (repo_slug, merge_data, cache_dir) tuples, one per
            merge to compare; empty when the repository is already computed.

    Raises:
        Exception: If the repository does not have a list-of-merges file.
    """
    # pathlib's "/" operator replaces the redundant Path(os.path.join(...)).
    merge_list_file = Path(args.merges_path) / (slug_repo_name(repo_slug) + ".csv")
    output_file = Path(args.output_dir) / (slug_repo_name(repo_slug) + ".csv")
    if not merge_list_file.exists():
        raise Exception(
            "merge_tools_comparator:",
            repo_slug,
            "does not have a list of merges. Missing file: ",
            merge_list_file,
        )

    if output_file.exists():
        print(
            "merge_tools_comparator: Skipping",
            repo_slug,
            "because it is already computed.",
        )
        return []

    merges = pd.read_csv(
        merge_list_file,
        names=["idx", "branch_name", "merge", "left", "right", "notes"],
        dtype={
            "idx": int,
            "branch_name": str,
            "merge": str,
            "left": str,
            "right": str,
            "notes": str,
        },
        header=0,
        index_col="idx",
    )
    # Assign the result instead of `.replace(np.nan, "", inplace=True)` on the
    # column: chained in-place mutation is deprecated in pandas and may act on
    # a copy without modifying `merges`.
    merges["notes"] = merges["notes"].fillna("")

    # "Trivial" merges are flagged in the notes column by the merge finder.
    if args.only_trivial_merges:
        merges = merges[merges["notes"].str.contains("a parent is the base")]
    elif not args.include_trivial_merges:
        merges = merges[~merges["notes"].str.contains("a parent is the base")]
    return [
        (repo_slug, merge_data, Path(args.cache_dir))
        for _, merge_data in merges.iterrows()
    ]


if __name__ == "__main__":
print("merge_tools_comparator: Start")
parser = argparse.ArgumentParser()
Expand All @@ -174,56 +219,8 @@ def check_if_two_merges_differ(cache_data: dict) -> bool:
print("merge_tools_comparator: Constructing Inputs")
merger_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
# TODO: I suggest abstracting out the body of this loop as a function.
merges_repo = []
repo_slug = repository_data["repository"]
merge_list_file = Path(
os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
)
output_file = Path(
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
raise Exception(
"merge_tools_comparator:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)

if output_file.exists():
print(
"merge_tools_comparator: Skipping",
repo_slug,
"because it is already computed.",
)
continue

merges = pd.read_csv(
merge_list_file,
names=["idx", "branch_name", "merge", "left", "right", "notes"],
dtype={
"idx": int,
"branch_name": str,
"merge": str,
"left": str,
"right": str,
"notes": str,
},
header=0,
index_col="idx",
)
merges["notes"].replace(np.nan, "", inplace=True)

if args.only_trivial_merges:
merges = merges[merges["notes"].str.contains("a parent is the base")]
elif not args.include_trivial_merges:
merges = merges[~merges["notes"].str.contains("a parent is the base")]

merger_arguments += [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
]
merger_arguments += build_merge_arguments(args, repo_slug)

# Shuffle input to reduce cache contention
random.seed(42)
Expand Down
Loading

0 comments on commit 40e90af

Please sign in to comment.