Merged v2-update
benedikt-schesch committed Sep 24, 2023
2 parents 0ecfc37 + a76a1a6 commit 18ed5f6
Showing 19 changed files with 117 additions and 103 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -145,7 +145,7 @@ To run style checking run `make style`.

* merge_tester.py -> Main file which performs merges and evaluates all the results across all projects.

-* test_repo_head.py -> Checks out all repos and removes all repos that fail their tests on main branch.
+* test_repo_heads.py -> Checks out all repos and removes all repos that fail their tests on main branch.

* latex_output.py -> Output latex code for the resulting plots and table.

run.sh (6 changes: 3 additions & 3 deletions)
@@ -17,7 +17,7 @@ set -o nounset

REPOS_CSV="$1"
OUT_DIR="$2"
-N_MERGES=$3
+N_REPETITIONS=$3
CACHE_DIR="${4}"

comparator_flags=""
@@ -86,7 +86,7 @@ python3 src/python/split_repos.py \
--num_machines "$num_machines" \
--output_file "$OUT_DIR/local_repos.csv"

-python3 src/python/test_repo_head.py \
+python3 src/python/test_repo_heads.py \
--repos_csv_with_hashes "$OUT_DIR/local_repos.csv" \
--output_path "$OUT_DIR/repos_head_passes.csv" \
--cache_dir "$CACHE_DIR"
@@ -120,5 +120,5 @@ python3 src/python/latex_output.py \
--tested_merges_path "$OUT_DIR/merges_tested/" \
--full_repos_csv "$REPOS_CSV" \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
- --n_merges "$N_MERGES" \
+ --n_merges "$N_REPETITIONS" \
--output_dir "$OUT_DIR"
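For orientation, the script takes four positional arguments, visible above as `$1` through `$4`. A hypothetical invocation after this rename (the file and directory names are illustrative, not from the repository):

```bash
# REPOS_CSV  OUT_DIR   N_REPETITIONS  CACHE_DIR
./run.sh repos.csv results/ 3 cache/
```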
src/python/cache_utils.py (12 changes: 6 additions & 6 deletions)
@@ -5,8 +5,8 @@
There will be 4 caches in total which are stored on disk after running the run.sh script:
1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
2) cache/test_cache: A cache that maps a sha256 to test results.
-3) cache/merge_results:A cache that maps a merge to the result
-of the merge (sha256, runtime and MERGE_STATE)
+3) cache/merge_results: A cache that maps a merge to the result
+of the merge (sha256, run time, and MERGE_STATE).
4) cache/merge_diffs: A cache that stores the diff between merge tools.
"""

@@ -24,9 +24,9 @@


def slug_repo_name(repo_slug: str) -> str:
"""Given a GitHub repository slug (owner/reponame), returns the reponame.
"""Given a GitHub repository slug ("owner/reponame"), returns the reponame.
Args:
-repo_slug (str): The slug of the repository, which is 'owner/reponame'.
+repo_slug (str): The slug of the repository, which is "owner/reponame".
Returns:
str: The reponame.
"""
@@ -60,8 +60,8 @@ def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
Returns:
Path: The path to the cache file.
"""
-cache_entry_name = slug_repo_name(repo_slug) + ".json"
-cache_path = cache_prefix / cache_entry_name
+cache_file_name = slug_repo_name(repo_slug) + ".json"
+cache_path = cache_prefix / cache_file_name
cache_path.parent.mkdir(parents=True, exist_ok=True)
return cache_path
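Based on the lines above, a usage sketch (the slug is illustrative):

```python
from pathlib import Path

path = get_cache_path("owner/reponame", Path("cache/test_cache"))
# path == Path("cache/test_cache/reponame.json"); its parent directory now exists.
```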

src/python/latex_output.py (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@
This script generates all the tables and plots for the paper. It requires the
following input files:
- full_repos_csv: csv file containing the full list of repositories
-- repos_head_passes_csv: csv file containing the list of valid repositories
+- repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
- tested_merges_path: path to the folder containing the merge results
- merges_path: path to the folder containing all found merges.
TODO: Throughout, be consistent about "directory" vs "folder".
src/python/merge_differ.py (19 changes: 10 additions & 9 deletions)
@@ -21,20 +21,21 @@
import pandas as pd
from repo import Repository, MERGE_TOOL, TEST_STATE, MERGE_STATE
from tqdm import tqdm
-from write_head_hashes import compute_num_process_used
+from write_head_hashes import num_processes
from cache_utils import slug_repo_name

if os.getenv("TERM", "dumb") == "dumb":
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) # type: ignore

-TIMEOUT_TESTING_PARENT = 60 * 30 # 30 minutes
-TIMEOUT_TESTING_MERGE = 60 * 45 # 45 minutes
+TIMEOUT_TESTING_PARENT = 60 * 30 # 30 minutes, in seconds
+TIMEOUT_TESTING_MERGE = 60 * 45 # 45 minutes, in seconds


def get_merge_fingerprint(
merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path
) -> Union[Tuple[None, None], Tuple[Repository, str]]:
"""Returns the repo and the fingerprint of a merge, or None.
"""Returns the repo and the fingerprint of a merge,
or (None, None) if the merge is not successful.
Does some sanity-checking too.
Args:
merge_data: The merge data.
@@ -112,7 +113,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None:
if repo2 is None or merge_fingerprint2 is None:
continue

-# Use lexicographic order to prevent duplicates
diff_file = diff_file_prefix / diff_file_name(
merge_fingerprint1, merge_fingerprint2
)
@@ -135,6 +135,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
Returns:
Path: The name of the diff file.
"""
+# Use lexicographic order to prevent duplicates
if sha1 < sha2:
# TODO: Why does this use ".txt" rather than ".diff" as the file extension?
return Path(sha1 + "_" + sha2 + ".txt")
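Only the `sha1 < sha2` branch is visible; assuming the collapsed else-branch swaps the pair, both argument orders yield the same file name:

```python
assert diff_file_name("aaa", "bbb") == Path("aaa_bbb.txt")
assert diff_file_name("bbb", "aaa") == Path("aaa_bbb.txt")  # assumed swap in the hidden branch
```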
@@ -155,7 +156,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:

repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

print("merge_differ: Started listing diffs to compute")
print("merge_differ: Started collecting diffs to compute")
merge_differ_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
merges_repo = []
@@ -213,11 +214,11 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
random.seed(42)
random.shuffle(merge_differ_arguments)

print("merge_differ: Finished listing diffs to compute")
print("merge_differ: Number of tested merges:", len(merge_differ_arguments))
print("merge_differ: Finished collecting diffs to compute")
print("merge_differ: Number of merges to test:", len(merge_differ_arguments))

print("merge_differ: Started Diffing")
-with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+with multiprocessing.Pool(processes=num_processes()) as pool:
tqdm(
pool.imap(merge_differ, merge_differ_arguments),
total=len(merge_differ_arguments),
src/python/merge_tester.py (23 changes: 11 additions & 12 deletions)
@@ -21,8 +21,8 @@
import psutil
import pandas as pd
from repo import Repository, MERGE_TOOL, TEST_STATE
-from write_head_hashes import compute_num_process_used
-from merge_tools_comparator import is_merge_sucess
+from write_head_hashes import num_processes
+from merge_tools_comparator import is_merge_success
from cache_utils import slug_repo_name
from tqdm import tqdm

@@ -80,7 +80,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
merge_data["parents pass"] = True

for merge_tool in MERGE_TOOL:
-if is_merge_sucess(merge_data[merge_tool.name]):
+if is_merge_success(merge_data[merge_tool.name]):
# TODO: I suggest abstracting the body of this loop into a separate function, since it stands on its own logically.
repo = Repository(repo_slug, cache_prefix=cache_prefix)
(
@@ -109,6 +109,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
merge_fingerprint,
merge_data[merge_tool.name + "_merge_fingerprint"],
)
+# Update the status from merge success to test result.
merge_data[merge_tool.name] = result.name
del repo
assert merge_tool.name in merge_data
@@ -130,7 +131,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

# TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test").
print("merge_tester: Started listing merges to test")
print("merge_tester: Started collecting merges to test")
merge_tester_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
repo_slug = repository_data["repository"]
@@ -174,11 +175,11 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
random.seed(42)
random.shuffle(merge_tester_arguments)

print("merge_tester: Finished listing merges to test")
print("merge_tester: Finished collecting merges to test")
print("merge_tester: Number of merges to test:", len(merge_tester_arguments))

print("merge_tester: Started Testing")
-with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+with multiprocessing.Pool(processes=num_processes()) as pool:
merge_tester_results = list(
tqdm(
pool.imap(merge_tester, merge_tester_arguments),
@@ -214,12 +215,10 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
"because it does not contain any merges.",
)
continue
-n_total_merges += len(df)
-n_total_merges_parents_pass += len(df[df["parents pass"]])
-continue
-df = pd.DataFrame(repo_result[repo_slug])
-df.sort_index(inplace=True)
-df.to_csv(output_file, index_label="idx")
+else:
+df = pd.DataFrame(repo_result[repo_slug])
+df.sort_index(inplace=True)
+df.to_csv(output_file, index_label="idx")
n_total_merges += len(df)
n_total_merges_parents_pass += len(df[df["parents pass"]])
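Read with the indentation this view strips, the post-change control flow is plausibly the following sketch (the branch that reuses an existing output file is hidden in the collapsed context above, so its body is an assumption):

```python
if output_file.exists():
    ...  # hidden: reuse the existing output file, or `continue` if it has no merges
else:
    df = pd.DataFrame(repo_result[repo_slug])
    df.sort_index(inplace=True)
    df.to_csv(output_file, index_label="idx")
# The counters now run once for both branches instead of being duplicated.
n_total_merges += len(df)
n_total_merges_parents_pass += len(df[df["parents pass"]])
```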

src/python/merge_tools_comparator.py (23 changes: 13 additions & 10 deletions)
@@ -28,14 +28,14 @@
from repo import Repository, MERGE_TOOL, MERGE_STATE
from tqdm import tqdm
from cache_utils import set_in_cache, lookup_in_cache, slug_repo_name
-from write_head_hashes import compute_num_process_used
-from variables import TIMEOUT_MERGING, N_MERGES
+from write_head_hashes import num_processes
+from variables import TIMEOUT_MERGING, N_REPETITIONS

if os.getenv("TERM", "dumb") == "dumb":
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) # type: ignore


-def is_merge_sucess(merge_state: str) -> bool:
+def is_merge_success(merge_state: str) -> bool:
"""Returns true if the merge state indicates success."""
return merge_state == MERGE_STATE.Merge_success.name

@@ -74,8 +74,8 @@ def merger( # pylint: disable=too-many-locals
merge_tool.name,
)
cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []}
-for i in range(N_MERGES):
+# TODO: I suggest abstracting out the body of this loop as a function.
+for i in range(N_REPETITIONS):
repo = Repository(repo_slug, cache_prefix=cache_prefix)
(
merge_status,
@@ -211,10 +211,11 @@ def merger( # pylint: disable=too-many-locals
random.shuffle(merger_arguments)

print("merge_tools_comparator: Finished Constructing Inputs")
+# New merges are merges whose analysis does not appear in the output folder.
print("merge_tools_comparator: Number of new merges:", len(merger_arguments))

print("merge_tools_comparator: Started Merging")
-with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+with multiprocessing.Pool(processes=num_processes()) as pool:
merger_results = list(
tqdm(pool.imap(merger, merger_arguments), total=len(merger_arguments))
)
@@ -223,7 +224,6 @@ def merger( # pylint: disable=too-many-locals
repo_result = {repo_slug: [] for repo_slug in repos["repository"]}
print("merge_tools_comparator: Constructing Output")
n_new_compared = 0
-n_new_tested = 0
for i in tqdm(range(len(merger_arguments))):
repo_slug = merger_arguments[i][0]
merge_data = merger_arguments[i][1]
@@ -232,13 +232,13 @@
# TODO: I suggest abstracting this loop into a function. That will also make it easier to return early, which can be done just with "return". (Currently it makes more iterations even after discovering that two merge tools do differ.)
for merge_tool1 in MERGE_TOOL:
for merge_tool2 in MERGE_TOOL:
-if is_merge_sucess(
+if is_merge_success(
cache_data[merge_tool1.name]["results"][0]
-) and is_merge_sucess(cache_data[merge_tool2.name]["results"][0]):
+) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
two_merge_tools_differ = True
-if is_merge_sucess(
+if is_merge_success(
cache_data[merge_tool1.name]["results"][0]
-) and is_merge_sucess(cache_data[merge_tool2.name]["results"][0]):
+) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
if (
cache_data[merge_tool1.name]["merge_fingerprint"]
!= cache_data[merge_tool2.name]["merge_fingerprint"]
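The hunk above is truncated mid-condition. As a sketch of the refactoring the TODO on this loop suggests, the pair comparison could be hoisted into a helper that returns early; the helper name and the exact differ-criterion are assumptions based on the visible half of the hunk:

```python
def merge_tools_differ(cache_data: dict) -> bool:
    """Returns True as soon as two successfully-merging tools
    produce different merge fingerprints."""
    for merge_tool1 in MERGE_TOOL:
        for merge_tool2 in MERGE_TOOL:
            if not (
                is_merge_success(cache_data[merge_tool1.name]["results"][0])
                and is_merge_success(cache_data[merge_tool2.name]["results"][0])
            ):
                continue
            if (
                cache_data[merge_tool1.name]["merge_fingerprint"]
                != cache_data[merge_tool2.name]["merge_fingerprint"]
            ):
                return True  # Early exit; the current code keeps iterating.
    return False
```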
@@ -286,10 +286,13 @@ def merger( # pylint: disable=too-many-locals
df.to_csv(output_file, index_label="idx")
n_total_compared += len(df)

+# This is the number of merges whose "two merge tools differ" bit has been set (to true or
+# false).
print(
"merge_tools_comparator: Number of merge tool outputs that have been newly compared:",
n_new_compared,
)
+# This is the number of merges whose "two merge tools differ" bit has been set to true.
print(
"merge_tools_comparator: Total number of merge tool outputs that have been compared:",
n_total_compared,
src/python/repo.py (20 changes: 11 additions & 9 deletions)
@@ -63,16 +63,14 @@ def compute_explanation(
command: List[str],
source: Union[subprocess.TimeoutExpired, subprocess.CompletedProcess],
) -> str:
"""Poduces the explanation string of a timedout process or
"""Produces the explanation string of a timedout process or
a completed process.
"""
explanation = "Run Command: " + " ".join(command) + "\nTimed out"
-explanation += (
-"\nstdout:\n" + source.stdout.decode("utf-8") if source.stdout else ""
-)
-explanation += (
-"\nstderr:\n" + source.stderr.decode("utf-8") if source.stderr else ""
-)
+if source.stdout:
+explanation += "\nstdout:\n" + source.stdout.decode("utf-8")
+if source.stderr:
+explanation += "\nstderr:\n" + source.stderr.decode("utf-8")
return explanation


@@ -186,7 +184,7 @@ def _merge_and_test( # pylint: disable=too-many-arguments
left_commit (str): The left commit to merge.
right_commit (str): The right commit to merge.
timeout (int): The timeout limit, in seconds.
-n_tests (int): The number of times to perform the test.
+n_tests (int): The number of times to run the test.
Returns:
TEST_STATE: The result of the test.
str: The tree fingerprint of the merge result.
@@ -402,6 +400,8 @@ def merge( # pylint: disable=too-many-locals

def compute_tree_fingerprint(self) -> str:
"""Computes the tree fingerprint of the repository.
+This function must never be run after running tests,
+since running tests might write output files.
Returns:
str: The tree fingerprint.
"""
@@ -467,7 +467,9 @@ def _checkout_and_test(
timeout: int,
n_tests: int,
) -> TEST_STATE:
"""Checks out the given commit and tests the repository.
"""Helper function for `checkout_and_test`,
which checks out the given commit and tests the repository.
This function does not check the cache.
Args:
commit (str): The commit to checkout.
timeout (int): The timeout limit, in seconds.
src/python/split_repos.py (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@
parser.add_argument("--output_file", type=Path)
args = parser.parse_args()
df: pd.DataFrame = pd.read_csv(args.repos_csv, index_col="idx")
-# Shuffle the dataframe so the ordering of the list doesn't bias the output
+# Shuffle the dataframe so the ordering of the list doesn't bias the output.
df = df.sample(frac=1, random_state=42)
df = np.array_split(df, args.num_machines)[args.machine_id]
df.sort_index(inplace=True)
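For illustration, `np.array_split` (unlike `np.split`) accepts a shard count that does not divide the frame evenly, so shard sizes differ by at most one row; a small self-contained sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"repository": [f"owner/repo{i}" for i in range(10)]})
shards = np.array_split(df, 3)  # same idiom as the script above
print([len(s) for s in shards])  # [4, 3, 3]
```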