From 5d510aafda8f0b4cd8c6cb9997cefccfcd880102 Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Tue, 12 Sep 2023 15:21:36 -0700
Subject: [PATCH 1/6] Rename `test_repo_head.py` to `test_repo_heads.py`

---
 README.md                                      |  2 +-
 run.sh                                         |  2 +-
 .../{test_repo_head.py => test_repo_heads.py}  | 24 +++++++++----------
 src/python/write_head_hashes.py                |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)
 rename src/python/{test_repo_head.py => test_repo_heads.py} (81%)

diff --git a/README.md b/README.md
index 33a875c1e2..2b00bd7164 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@ To run style checking run `make style`.
 
 * merge_tester.py -> Main file which performs merges and evaluates all the results across all projects.
 
-* test_repo_head.py -> Checks out all repos and removes all repos that fail their tests on main branch.
+* test_repo_heads.py -> Checks out all repos and removes all repos that fail their tests on main branch.
 
 * latex_output.py -> Output latex code for the resulting plots and table.
diff --git a/run.sh b/run.sh
index ea609c725d..20fdd53a52 100755
--- a/run.sh
+++ b/run.sh
@@ -86,7 +86,7 @@ python3 src/python/split_repos.py \
     --num_machines "$num_machines" \
     --output_file "$OUT_DIR/local_repos.csv"
 
-python3 src/python/test_repo_head.py \
+python3 src/python/test_repo_heads.py \
    --repos_csv_with_hashes "$OUT_DIR/local_repos.csv" \
    --output_path "$OUT_DIR/repos_head_passes.csv" \
    --cache_dir "$CACHE_DIR"
diff --git a/src/python/test_repo_head.py b/src/python/test_repo_heads.py
similarity index 81%
rename from src/python/test_repo_head.py
rename to src/python/test_repo_heads.py
index a8a9851b97..e737e77d8c 100755
--- a/src/python/test_repo_head.py
+++ b/src/python/test_repo_heads.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """Tests the HEAD of a repo and validates it if the test passes.
 
-usage: python3 test_repo_head.py --repos_csv_with_hashes
+usage: python3 test_repo_heads.py --repos_csv_with_hashes
     --output_path
     --cache_dir
 
@@ -40,13 +40,13 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
     """
     repo_info, cache = args
     repo_slug = repo_info["repository"]
-    print("test_repo_head:", repo_slug, ": head_passes_tests : started")
+    print("test_repo_heads:", repo_slug, ": head_passes_tests : started")
     repo = Repository(repo_slug, cache_prefix=cache)
     test_state = repo.checkout_and_test(
         repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3
     )
-    print("test_repo_head:", repo_slug, ": head_passes_tests : returning", test_state)
+    print("test_repo_heads:", repo_slug, ": head_passes_tests : returning", test_state)
     return test_state
 
@@ -61,7 +61,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
 
     df = pd.read_csv(args.repos_csv_with_hashes, index_col="idx")
 
-    print("test_repo_head: Started cloning repos")
+    print("test_repo_heads: Started cloning repos")
     with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
         results = [
             pool.apply_async(clone_repo, args=(row["repository"],))
             for _, row in df.iterrows()
         ]
@@ -72,13 +72,13 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
             return_value = result.get(10 * 60)
         except Exception as e:
             print("Couldn't clone repo", e)
-    print("test_repo_head: Finished cloning repos")
+    print("test_repo_heads: Finished cloning repos")
 
     if os.path.exists(args.output_path):
-        print("test_repo_head: Output file already exists. Exiting.")
+        print("test_repo_heads: Output file already exists. Exiting.")
         sys.exit(0)
 
-    print("test_repo_head: Started Testing")
+    print("test_repo_heads: Started Testing")
     head_passes_tests_arguments = [(v, args.cache_dir) for _, v in df.iterrows()]
     with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
         head_passes_tests_results = list(
@@ -87,18 +87,18 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
                 total=len(head_passes_tests_arguments),
             )
         )
-    print("test_repo_head: Finished Testing")
+    print("test_repo_heads: Finished Testing")
 
-    print("test_repo_head: Started Building Output")
+    print("test_repo_heads: Started Building Output")
     out = []
     repos_head_passes_mask = [
         i == TEST_STATE.Tests_passed for i in head_passes_tests_results
     ]
     out = df[repos_head_passes_mask]
-    print("test_repo_head: Finished Building Output")
+    print("test_repo_heads: Finished Building Output")
 
     print(
-        "test_repo_head: Number of repos whose head passes tests:",
+        "test_repo_heads: Number of repos whose head passes tests:",
         len(out),
         "out of",
         len(df),
@@ -106,4 +106,4 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
     if len(out) == 0:
         raise Exception("No repos found whose head passes tests")
     out.to_csv(args.output_path, index_label="idx")
-    print("test_repo_head: Done")
+    print("test_repo_heads: Done")
diff --git a/src/python/write_head_hashes.py b/src/python/write_head_hashes.py
index ffa25e15e7..e32dd4ef08 100755
--- a/src/python/write_head_hashes.py
+++ b/src/python/write_head_hashes.py
@@ -105,7 +105,7 @@ def get_latest_hash(args):
 
     # If file exists ignore this step
     if os.path.isfile(args.output_path):
-        print("write_head_hashes: test_repo_head: Cached")
+        print("write_head_hashes: test_repo_heads: Cached")
         sys.exit(0)
 
     df = pd.read_csv(args.repos_csv, index_col="idx")

From 5415252abbf7b8f97c7e1051a41818923835e65a Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Wed, 13 Sep 2023 05:55:24 -0700
Subject: [PATCH 2/6] Renamings

---
 run.sh                                   |  4 ++--
 src/python/merge_differ.py               |  4 ++--
 src/python/merge_tester.py               |  8 ++++----
 src/python/merge_tools_comparator.py     | 18 +++++++++---------
 src/python/test_repo_heads.py            |  6 +++---
 src/python/variables.py                  |  2 +-
 src/python/write_head_hashes.py          |  4 ++--
 src/scripts/merge_tools/gitmerge.sh      |  2 +-
 src/scripts/merge_tools/intellimerge.sh  |  2 +-
 src/scripts/merge_tools/spork.sh         |  2 +-
 10 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/run.sh b/run.sh
index 20fdd53a52..9b1c1594a9 100755
--- a/run.sh
+++ b/run.sh
@@ -17,7 +17,7 @@ set -o nounset
 
 REPOS_CSV="$1"
 OUT_DIR="$2"
-N_MERGES=$3
+N_REPETITIONS=$3
 CACHE_DIR="${4}"
 
 comparator_flags=""
@@ -120,5 +120,5 @@ python3 src/python/latex_output.py \
     --tested_merges_path "$OUT_DIR/merges_tested/" \
     --full_repos_csv "$REPOS_CSV" \
     --repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
-    --n_merges "$N_MERGES" \
+    --n_merges "$N_REPETITIONS" \
     --output_dir "$OUT_DIR"
diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py
index bd2de8db4a..b30a73a6a1 100755
--- a/src/python/merge_differ.py
+++ b/src/python/merge_differ.py
@@ -19,7 +19,7 @@
 import pandas as pd
 from repo import Repository, MERGE_TOOL, TEST_STATE, MERGE_STATE
 from tqdm import tqdm
-from write_head_hashes import compute_num_process_used
+from write_head_hashes import num_processes
 from cache_utils import slug_repo_name
 
 if os.getenv("TERM", "dumb") == "dumb":
@@ -210,7 +210,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
     print("merge_differ: Number of tested merges:", len(merge_differ_arguments))
 
     print("merge_differ: Started Diffing")
-    with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+    with multiprocessing.Pool(processes=num_processes()) as pool:
         tqdm(
             pool.imap(merge_differ, merge_differ_arguments),
             total=len(merge_differ_arguments),
diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py
index 4f239c367d..884275a6ab 100755
--- a/src/python/merge_tester.py
+++ b/src/python/merge_tester.py
@@ -21,8 +21,8 @@
 import psutil
 import pandas as pd
 from repo import Repository, MERGE_TOOL, TEST_STATE
-from write_head_hashes import compute_num_process_used
-from merge_tools_comparator import is_merge_sucess
+from write_head_hashes import num_processes
+from merge_tools_comparator import is_merge_success
 from cache_utils import slug_repo_name
 from tqdm import tqdm
 
@@ -78,7 +78,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
     merge_data["parents pass"] = True
 
     for merge_tool in MERGE_TOOL:
-        if is_merge_sucess(merge_data[merge_tool.name]):
+        if is_merge_success(merge_data[merge_tool.name]):
             repo = Repository(repo_slug, cache_prefix=cache_prefix)
             (
                 result,
@@ -174,7 +174,7 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
     print("merge_tester: Number of merges to test:", len(merge_tester_arguments))
 
     print("merge_tester: Started Testing")
-    with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+    with multiprocessing.Pool(processes=num_processes()) as pool:
         merge_tester_results = list(
             tqdm(
                 pool.imap(merge_tester, merge_tester_arguments),
diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py
index 77d32a75fe..9c091b2230 100755
--- a/src/python/merge_tools_comparator.py
+++ b/src/python/merge_tools_comparator.py
@@ -28,14 +28,14 @@
 from repo import Repository, MERGE_TOOL, MERGE_STATE
 from tqdm import tqdm
 from cache_utils import set_in_cache, lookup_in_cache, slug_repo_name
-from write_head_hashes import compute_num_process_used
-from variables import TIMEOUT_MERGING, N_MERGES
+from write_head_hashes import num_processes
+from variables import TIMEOUT_MERGING, N_REPETITIONS
 
 if os.getenv("TERM", "dumb") == "dumb":
     tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
 
 
-def is_merge_sucess(merge_state: str) -> bool:
+def is_merge_success(merge_state: str) -> bool:
     """Returns true if the merge state indicates success."""
     return merge_state == MERGE_STATE.Merge_success.name
 
@@ -72,7 +72,7 @@ def merger(  # pylint: disable=too-many-locals
             merge_tool.name,
         )
         cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []}
-        for i in range(N_MERGES):
+        for i in range(N_REPETITIONS):
             repo = Repository(repo_slug, cache_prefix=cache_prefix)
             (
                 merge_status,
@@ -210,7 +210,7 @@ def merger(  # pylint: disable=too-many-locals
     print("merge_tools_comparator: Number of new merges:", len(merger_arguments))
 
     print("merge_tools_comparator: Started Merging")
-    with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+    with multiprocessing.Pool(processes=num_processes()) as pool:
         merger_results = list(
             tqdm(pool.imap(merger, merger_arguments), total=len(merger_arguments))
         )
@@ -227,13 +227,13 @@ def merger(  # pylint: disable=too-many-locals
             two_merge_tools_differ = False
             for merge_tool1 in MERGE_TOOL:
                 for merge_tool2 in MERGE_TOOL:
-                    if is_merge_sucess(
+                    if is_merge_success(
                         cache_data[merge_tool1.name]["results"][0]
-                    ) and is_merge_sucess(cache_data[merge_tool2.name]["results"][0]):
+                    ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
                         two_merge_tools_differ = True
-                    if is_merge_sucess(
+                    if is_merge_success(
                         cache_data[merge_tool1.name]["results"][0]
-                    ) and is_merge_sucess(cache_data[merge_tool2.name]["results"][0]):
+                    ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
                         if (
                             cache_data[merge_tool1.name]["merge_fingerprint"]
                             != cache_data[merge_tool2.name]["merge_fingerprint"]
                         )
diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py
index e737e77d8c..d0e720e846 100755
--- a/src/python/test_repo_heads.py
+++ b/src/python/test_repo_heads.py
@@ -22,7 +22,7 @@
 from tqdm import tqdm
 import pandas as pd
-from write_head_hashes import compute_num_process_used, clone_repo
+from write_head_hashes import num_processes, clone_repo
 
 if os.getenv("TERM", "dumb") == "dumb":
     tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
@@ -62,7 +62,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
     df = pd.read_csv(args.repos_csv_with_hashes, index_col="idx")
 
     print("test_repo_heads: Started cloning repos")
-    with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+    with multiprocessing.Pool(processes=num_processes()) as pool:
         results = [
             pool.apply_async(clone_repo, args=(row["repository"],))
             for _, row in df.iterrows()
@@ -80,7 +80,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
 
     print("test_repo_heads: Started Testing")
     head_passes_tests_arguments = [(v, args.cache_dir) for _, v in df.iterrows()]
-    with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+    with multiprocessing.Pool(processes=num_processes()) as pool:
         head_passes_tests_results = list(
             tqdm(
                 pool.imap(head_passes_tests, head_passes_tests_arguments),
diff --git a/src/python/variables.py b/src/python/variables.py
index c01dce0889..519c0fb9f9 100644
--- a/src/python/variables.py
+++ b/src/python/variables.py
@@ -12,4 +12,4 @@
 WORKDIR_PREFIX = Path(".workdir")
 
 TIMEOUT_MERGING = 60 * 15  # 15 minutes, in seconds
-N_MERGES = 3
+N_REPETITIONS = 3
diff --git a/src/python/write_head_hashes.py b/src/python/write_head_hashes.py
index e32dd4ef08..ccb04ab419 100755
--- a/src/python/write_head_hashes.py
+++ b/src/python/write_head_hashes.py
@@ -52,7 +52,7 @@ def clone_repo(repo_slug: str) -> git.repo.Repo:
     return repo
 
 
-def compute_num_process_used() -> int:
+def num_processes() -> int:
     """Comput the number of cpus to be used
     Returns:
         int: the number of cpus to be used.
@@ -112,7 +112,7 @@ def get_latest_hash(args):
 
     print("write_head_hashes: Started cloning repos and collecting head hashes")
 
-    with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+    with multiprocessing.Pool(processes=num_processes()) as pool:
         get_latest_hash_result = list(
             tqdm(
                 pool.imap(get_latest_hash, df.iterrows()),
diff --git a/src/scripts/merge_tools/gitmerge.sh b/src/scripts/merge_tools/gitmerge.sh
index 6b6c86acad..1c2032d58a 100755
--- a/src/scripts/merge_tools/gitmerge.sh
+++ b/src/scripts/merge_tools/gitmerge.sh
@@ -10,7 +10,7 @@
 set -o nounset
 
 if [ "$#" -ne 4 ]; then
-    echo "Usage: $0 MERGE_SCRIPTS_DIR BRANCH1 BRANCH2 STRATEGY" >&2
+    echo "Usage: $0 CLONE_DIR BRANCH1 BRANCH2 STRATEGY" >&2
    exit 1
 fi
diff --git a/src/scripts/merge_tools/intellimerge.sh b/src/scripts/merge_tools/intellimerge.sh
index adc8370865..dfb5358036 100755
--- a/src/scripts/merge_tools/intellimerge.sh
+++ b/src/scripts/merge_tools/intellimerge.sh
@@ -9,7 +9,7 @@
 set -o nounset
 
 if [ "$#" -ne 3 ]; then
-    echo "Usage: $0 MERGE_SCRIPTS_DIR BRANCH1 BRANCH2" >&2
+    echo "Usage: $0 CLONE_DIR BRANCH1 BRANCH2" >&2
    exit 1
 fi
diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh
index 0fbe3f1a85..53ea3b5ddb 100755
--- a/src/scripts/merge_tools/spork.sh
+++ b/src/scripts/merge_tools/spork.sh
@@ -9,7 +9,7 @@
 set -o nounset
 
 if [ "$#" -ne 3 ]; then
-    echo "Usage: $0 MERGE_SCRIPTS_DIR BRANCH1 BRANCH2" >&2
+    echo "Usage: $0 CLONE_DIR BRANCH1 BRANCH2" >&2
    exit 1
 fi

From f7c1c2b9fbc8ec900bb92d8022eef486234098ea Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Thu, 14 Sep 2023 09:52:23 -0700
Subject: [PATCH 3/6] Check settings of environment variables (#213)

---
 src/scripts/run_repo_tests.sh | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/scripts/run_repo_tests.sh b/src/scripts/run_repo_tests.sh
index 8948f987a1..81cc0ab604 100755
--- a/src/scripts/run_repo_tests.sh
+++ b/src/scripts/run_repo_tests.sh
@@ -32,18 +32,19 @@ else
 fi
 
 if [ -z "${JAVA8_HOME:+isset}" ] ; then echo "JAVA8_HOME is not set"; exit 1; fi
+if [ ! -d "${JAVA8_HOME}" ] ; then echo "JAVA8_HOME is set to a nonexistent directory: ${JAVA8_HOME}"; exit 1; fi
 if [ -z "${JAVA11_HOME:+isset}" ] ; then echo "JAVA11_HOME is not set"; exit 1; fi
+if [ ! -d "${JAVA11_HOME}" ] ; then echo "JAVA11_HOME is set to a nonexistent directory: ${JAVA11_HOME}"; exit 1; fi
 if [ -z "${JAVA17_HOME:+isset}" ] ; then echo "JAVA17_HOME is not set"; exit 1; fi
+if [ ! -d "${JAVA17_HOME}" ] ; then echo "JAVA17_HOME is set to a nonexistent directory: ${JAVA17_HOME}"; exit 1; fi
+
+ORIG_PATH="${PATH}"
 # shellcheck disable=SC2153 # Not a typo of JAVA_HOME.
 for javaX_home in $JAVA8_HOME $JAVA11_HOME $JAVA17_HOME
 do
-  if [ ! -d "${javaX_home}" ] ; then
-    echo "No JDK ${javaX_home}"
-    continue
-  fi
-  export JAVA_HOME=${javaX_home}
-  export PATH="$JAVA_HOME/bin:$PATH"
+  export JAVA_HOME="${javaX_home}"
+  export PATH="$JAVA_HOME/bin:$ORIG_PATH"
   echo "Running tests with JAVA_HOME=$JAVA_HOME"
   ${command}
   rc=$?
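
Note on patches 1 and 2: every call site of the renamed num_processes() helper
follows the same pattern: a multiprocessing.Pool sized by num_processes(), with
pool.imap wrapped in tqdm for progress reporting. Below is a minimal,
self-contained sketch of that pattern; the work() function and its inputs are
hypothetical placeholders, not code from this repository.

    import multiprocessing
    import os

    from tqdm import tqdm


    def num_processes() -> int:
        # As in write_head_hashes.py: use 70% of the CPUs when more than 3 are available.
        cpu_count = os.cpu_count() or 1
        return int(0.7 * cpu_count) if cpu_count > 3 else cpu_count


    def work(item: int) -> int:
        # Hypothetical stand-in for a worker such as head_passes_tests or merger.
        return item * item


    if __name__ == "__main__":
        items = list(range(100))
        with multiprocessing.Pool(processes=num_processes()) as pool:
            results = list(tqdm(pool.imap(work, items), total=len(items)))

Using imap rather than map lets tqdm update as each result arrives, instead of
waiting for the whole batch to finish.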
From 06c9934ddf87f121829d238dcd4e99342d2eff4d Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Thu, 14 Sep 2023 14:51:01 -0700
Subject: [PATCH 4/6] Mike's code review changes (#207)

---
 src/python/cache_utils.py                    | 12 +++++-----
 src/python/latex_output.py                   |  2 +-
 src/python/merge_differ.py                   | 15 ++++++------
 src/python/merge_tester.py                   | 15 ++++++------
 src/python/merge_tools_comparator.py         |  1 -
 src/python/repo.py                           | 20 +++++++++-------
 src/python/split_repos.py                    |  2 +-
 src/python/test_repo_heads.py                |  2 +-
 src/python/write_head_hashes.py              | 24 ++++++++++---------
 src/scripts/merge_tools/gitmerge_resolve.sh  |  4 ++--
 .../merge_tools/resolve-import-conflicts     |  8 +++----
 .../resolve-import-conflicts-in-file.py      | 18 +++++++++-----
 src/scripts/merge_tools/spork.sh             |  2 +-
 13 files changed, 67 insertions(+), 58 deletions(-)

diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
index 29548dfdde..eab41d6f91 100755
--- a/src/python/cache_utils.py
+++ b/src/python/cache_utils.py
@@ -3,8 +3,8 @@
 There will be 4 caches which are stored on disk after running the script:
 1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
 2) cache/test_cache: A cache that maps a sha256 to test results.
-3) cache/merge_results:A cache that maps a merge to the result
-    of the merge (sha256, runtime and MERGE_STATE)
+3) cache/merge_results: A cache that maps a merge to the result
+    of the merge (sha256, run time, and MERGE_STATE).
 4) cache/merge_diffs: A cache that stores the diff between merge tools.
 """
 
@@ -19,9 +19,9 @@
 
 
 def slug_repo_name(repo_slug: str) -> str:
-    """Given a GitHub repository slug (owner/reponame), returns the reponame.
+    """Given a GitHub repository slug ("owner/reponame"), returns the reponame.
     Args:
-        repo_slug (str): The slug of the repository, which is 'owner/reponame'.
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
     Returns:
         str: The reponame.
     """
@@ -52,8 +52,8 @@ def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
     Returns:
         Path: The path to the cache file.
     """
-    cache_entry_name = slug_repo_name(repo_slug) + ".json"
-    cache_path = cache_prefix / cache_entry_name
+    cache_file_name = slug_repo_name(repo_slug) + ".json"
+    cache_path = cache_prefix / cache_file_name
     cache_path.parent.mkdir(parents=True, exist_ok=True)
     return cache_path
diff --git a/src/python/latex_output.py b/src/python/latex_output.py
index c28d827a03..6dee4cf18f 100755
--- a/src/python/latex_output.py
+++ b/src/python/latex_output.py
@@ -13,7 +13,7 @@
 This script generates all the tables and plots for the paper. It requires the
 following input files:
 - full_repos_csv: csv file containing the full list of repositories
-- repos_head_passes_csv: csv file containing the list of valid repositories
+- repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
 - tested_merges_path: path to the folder containing the merge results
 - merges_path: path to the folder containing all found merges.
 - output_dir: path to the folder where the LaTeX files will be saved
diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py
index b30a73a6a1..bc7abeef03 100755
--- a/src/python/merge_differ.py
+++ b/src/python/merge_differ.py
@@ -25,14 +25,15 @@
 if os.getenv("TERM", "dumb") == "dumb":
     tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
 
-TIMEOUT_TESTING_PARENT = 60 * 30  # 30 minutes
-TIMEOUT_TESTING_MERGE = 60 * 45  # 45 minutes
+TIMEOUT_TESTING_PARENT = 60 * 30  # 30 minutes, in seconds
+TIMEOUT_TESTING_MERGE = 60 * 45  # 45 minutes, in seconds
 
 
 def get_merge_fingerprint(
     merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path
 ) -> Union[Tuple[None, None], Tuple[Repository, str]]:
-    """Returns the repo and the fingerprint of a merge, or None.
+    """Returns the repo and the fingerprint of a merge,
+    or (None, None) if the merge is not successful.
     Does some sanity-checking too.
     Args:
         merge_data: The merge data.
@@ -107,7 +108,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None:
         if repo2 is None or merge_fingerprint2 is None:
             continue
 
-        # Use lexicographic order to prevent duplicates
         diff_file = diff_file_prefix / diff_file_name(
             merge_fingerprint1, merge_fingerprint2
         )
@@ -130,6 +130,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
     Returns:
         Path: The name of the diff file.
     """
+    # Use lexicographic order to prevent duplicates
     if sha1 < sha2:
         return Path(sha1 + "_" + sha2 + ".txt")
     return Path(sha2 + "_" + sha1 + ".txt")
@@ -148,7 +149,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
 
     repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")
 
-    print("merge_differ: Started listing diffs to compute")
+    print("merge_differ: Started collecting diffs to compute")
     merge_differ_arguments = []
     for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
         merges_repo = []
@@ -206,8 +207,8 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
     random.seed(42)
     random.shuffle(merge_differ_arguments)
 
-    print("merge_differ: Finished listing diffs to compute")
-    print("merge_differ: Number of tested merges:", len(merge_differ_arguments))
+    print("merge_differ: Finished collecting diffs to compute")
+    print("merge_differ: Number of merges to test:", len(merge_differ_arguments))
 
     print("merge_differ: Started Diffing")
     with multiprocessing.Pool(processes=num_processes()) as pool:
diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py
index 884275a6ab..c880493ab4 100755
--- a/src/python/merge_tester.py
+++ b/src/python/merge_tester.py
@@ -106,6 +106,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
                 merge_fingerprint,
                 merge_data[merge_tool.name + "_merge_fingerprint"],
             )
+            # Update the status from merge success to test result.
             merge_data[merge_tool.name] = result.name
             del repo
         assert merge_tool.name in merge_data
@@ -126,7 +127,7 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
 
     repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")
 
-    print("merge_tester: Started listing merges to test")
+    print("merge_tester: Started collecting merges to test")
     merge_tester_arguments = []
     for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
         repo_slug = repository_data["repository"]
@@ -170,7 +171,7 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
     random.seed(42)
     random.shuffle(merge_tester_arguments)
 
-    print("merge_tester: Finished listing merges to test")
+    print("merge_tester: Finished collecting merges to test")
     print("merge_tester: Number of merges to test:", len(merge_tester_arguments))
 
     print("merge_tester: Started Testing")
@@ -210,12 +211,10 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
                 "because it does not contain any merges.",
             )
             continue
-            n_total_merges += len(df)
-            n_total_merges_parents_pass += len(df[df["parents pass"]])
-            continue
-        df = pd.DataFrame(repo_result[repo_slug])
-        df.sort_index(inplace=True)
-        df.to_csv(output_file, index_label="idx")
+        else:
+            df = pd.DataFrame(repo_result[repo_slug])
+            df.sort_index(inplace=True)
+            df.to_csv(output_file, index_label="idx")
         n_total_merges += len(df)
         n_total_merges_parents_pass += len(df[df["parents pass"]])
diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py
index 9c091b2230..69b5bb5f00 100755
--- a/src/python/merge_tools_comparator.py
+++ b/src/python/merge_tools_comparator.py
@@ -219,7 +219,6 @@ def merger(  # pylint: disable=too-many-locals
     repo_result = {repo_slug: [] for repo_slug in repos["repository"]}
     print("merge_tools_comparator: Constructing Output")
     n_new_compared = 0
-    n_new_tested = 0
     for i in tqdm(range(len(merger_arguments))):
         repo_slug = merger_arguments[i][0]
         merge_data = merger_arguments[i][1]
diff --git a/src/python/repo.py b/src/python/repo.py
index 41d9265f58..c886456224 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -63,16 +63,14 @@ def compute_explanation(
     command: List[str],
     source: Union[subprocess.TimeoutExpired, subprocess.CompletedProcess],
 ) -> str:
-    """Poduces the explanation string of a timedout process or
+    """Produces the explanation string of a timed-out process or
     a completed process.
     """
     explanation = "Run Command: " + " ".join(command) + "\nTimed out"
-    explanation += (
-        "\nstdout:\n" + source.stdout.decode("utf-8") if source.stdout else ""
-    )
-    explanation += (
-        "\nstderr:\n" + source.stderr.decode("utf-8") if source.stderr else ""
-    )
+    if source.stdout:
+        explanation += "\nstdout:\n" + source.stdout.decode("utf-8")
+    if source.stderr:
+        explanation += "\nstderr:\n" + source.stderr.decode("utf-8")
     return explanation
@@ -184,7 +182,7 @@ def _merge_and_test(  # pylint: disable=too-many-arguments
         left_commit (str): The left commit to merge.
         right_commit (str): The right commit to merge.
         timeout (int): The timeout limit, in seconds.
-        n_tests (int): The number of times to perform the test.
+        n_tests (int): The number of times to run the test.
     Returns:
         TEST_STATE: The result of the test.
         str: The tree fingerprint of the merge result.
@@ -396,6 +394,8 @@ def merge(  # pylint: disable=too-many-locals
 
     def compute_tree_fingerprint(self) -> str:
         """Computes the tree fingerprint of the repository.
+        This function must never be run after running tests,
+        since running tests might write output files.
         Returns:
             str: The tree fingerprint.
         """
@@ -459,7 +459,9 @@ def _checkout_and_test(
         timeout: int,
         n_tests: int,
     ) -> TEST_STATE:
-        """Checks out the given commit and tests the repository.
+        """Helper function for `checkout_and_test`,
+        which checks out the given commit and tests the repository.
+        This function does not check the cache.
         Args:
             commit (str): The commit to checkout.
             timeout (int): The timeout limit, in seconds.
diff --git a/src/python/split_repos.py b/src/python/split_repos.py
index a77db5d6f4..035377d0b6 100755
--- a/src/python/split_repos.py
+++ b/src/python/split_repos.py
@@ -21,7 +21,7 @@
     parser.add_argument("--output_file", type=Path)
     args = parser.parse_args()
     df: pd.DataFrame = pd.read_csv(args.repos_csv, index_col="idx")
-    # Shuffle the dataframe so the ordering of the list doesn't bias the output
+    # Shuffle the dataframe so the ordering of the list doesn't bias the output.
     df = df.sample(frac=1, random_state=42)
     df = np.array_split(df, args.num_machines)[args.machine_id]
     df.sort_index(inplace=True)
diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py
index d0e720e846..bd668230ac 100755
--- a/src/python/test_repo_heads.py
+++ b/src/python/test_repo_heads.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Tests the HEAD of a repo and validates it if the test passes.
+"""Tests the HEAD commits of multiple repos and validates those whose tests pass.
 
 usage: python3 test_repo_heads.py --repos_csv_with_hashes
     --output_path
diff --git a/src/python/write_head_hashes.py b/src/python/write_head_hashes.py
index ccb04ab419..b01b79dc6b 100755
--- a/src/python/write_head_hashes.py
+++ b/src/python/write_head_hashes.py
@@ -2,14 +2,14 @@
 """Write the hash of the HEAD of the default branch for each repository to its own file.
 If the file already exists, do nothing.
 After this is done, the resulting files are used indefinitely, for reproducible results.
-Note: the default branch is often named "main" or "master".
+The default branch is often named "main" or "master".
 
 usage: python3 write_head_hashes.py --repos_csv --output_path
 
 Input: a csv of repos.
 The input file `repos.csv` must contain a header, one of whose columns is "repository".
-That column contains "ORGANIZATION/REPO" for a GitHub repository.
+That column contains a slug ("ORGANIZATION/REPO") for a GitHub repository.
 Output: Write one file per repository, with the hash of the HEAD of the default branch as column "head hash".
 """
@@ -25,9 +25,12 @@
 import git.repo
 from variables import REPOS_PATH
 
+if os.getenv("TERM", "dumb") == "dumb":
+    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
+
 
 def clone_repo(repo_slug: str) -> git.repo.Repo:
-    """Clones a repository, or runs `git fetch` if it is already cloned.
+    """Clones a repository, or runs `git fetch` if the repository is already cloned.
     Args:
         repo_slug (str): The slug of the repository, which is "owner/reponame".
     """
@@ -53,23 +56,19 @@
 
 def num_processes() -> int:
-    """Comput the number of cpus to be used
+    """Compute the number of CPUs to be used
     Returns:
-        int: the number of cpus to be used.
+        int: the number of CPUs to be used.
     """
     cpu_count = os.cpu_count() or 1
     processes_used = int(0.7 * cpu_count) if cpu_count > 3 else cpu_count
     return processes_used
 
 
-if os.getenv("TERM", "dumb") == "dumb":
-    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore
-
-
 def get_latest_hash(args):
     """Collects the latest hash of the HEAD of the default branch for a repo.
     Args:
-        arg (idx,row): Information regarding that repo.
+        Tuple[idx,row]: Information regarding that repo.
     Returns:
         pd.Series: repo information with the hash of the HEAD
     """
@@ -105,7 +104,7 @@ def get_latest_hash(args):
 
     # If file exists ignore this step
     if os.path.isfile(args.output_path):
-        print("write_head_hashes: test_repo_heads: Cached")
+        print("write_head_hashes: Cached")
         sys.exit(0)
 
     df = pd.read_csv(args.repos_csv, index_col="idx")
@@ -120,7 +119,10 @@ def get_latest_hash(args):
         )
     )
 
+    print("write_head_hashes: Finished cloning repos and collecting head hashes")
+
     result_df = pd.DataFrame([i for i in get_latest_hash_result if i is not None])
     result_df = result_df.set_index(result_df.columns[0]).reset_index(drop=True)
+    print("write_head_hashes: Started storing repo HEAD hashes")
     result_df.to_csv(args.output_path, index_label="idx")
     print("write_head_hashes: Finished storing repo HEAD hashes")
diff --git a/src/scripts/merge_tools/gitmerge_resolve.sh b/src/scripts/merge_tools/gitmerge_resolve.sh
index b6871d44dc..928a97960a 100755
--- a/src/scripts/merge_tools/gitmerge_resolve.sh
+++ b/src/scripts/merge_tools/gitmerge_resolve.sh
@@ -11,11 +11,11 @@ strategy="-s resolve"
 status=$?
 
 if [ "$status" -ne 0 ]; then
-    echo "Fixing conflicts"
+    echo "Removing filenames from conflict markers."
     cd "$clone_dir" || exit 1
     readarray -t files < <(grep -l -r '^\(<<<<<<<\||||||||\|>>>>>>>\) .merge_file_')
     for file in "${files[@]}" ; do
-        echo "Fixing $file"
+        echo "Removing filenames from conflict markers in $file"
         sed -i 's/^\(\(<<<<<<<\||||||||\|>>>>>>>\) .merge_file\)_[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]$/\1/' "$file"
     done
 fi
diff --git a/src/scripts/merge_tools/resolve-import-conflicts b/src/scripts/merge_tools/resolve-import-conflicts
index 138c90c94e..feaee8b83c 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts
+++ b/src/scripts/merge_tools/resolve-import-conflicts
@@ -1,12 +1,12 @@
 #!/bin/bash
 # This script edits files to remove conflict markers related to Java imports.
-# Works on all files given on the command line; if none are given,
-# works on all files in or under the current directory.
+# It works on all files given on the command line;
+# if none are given, it works on all files in or under the current directory.
 
-# This is not a git mergetool. A git mergetool is given the base, parent 1, and
+# This script is not a git mergetool. A git mergetool is given the base, parent 1, and
 # parent 2 files, all without conflict markers.
-# However, this can be run instead of a git mergetool, or after a git mergetool.
+# However, this script can be run instead of a git mergetool, or after a git mergetool.
 
 if [ "$#" -eq 0 ] ; then
     readarray -t files < <(grep -l -r '^<<<<<<< HEAD' .)
diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
index 539b461f28..9c4f7878c0 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
+++ b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
@@ -1,6 +1,8 @@
 #! /usr/bin/env python
 
-"""Edits a file in place to remove conflict markers related to Java imports."""
+"""Edits a file in place to remove conflict markers related to Java imports.
+It simplistically leaves all the imports that appear in either parent.
+"""
 
 # TODO: merge both scripts into one.
@@ -11,16 +13,15 @@
 import tempfile
 
 if len(sys.argv) != 2:
-    print("Provide exactly one command-line argument")
+    print(
+        "resolve-import-conflicts-in-file: Provide exactly one command-line argument."
+    )
     sys.exit(1)
 
 filename = sys.argv[1]
 with open(filename) as file:
     lines = file.readlines()
 
-# The state is: nonconflict, in left, in original, in right.
-state = "nonconflict"
-
 
 def all_import_lines(lines):
     """Return true if every line is a Java import line."""
@@ -52,10 +53,13 @@ def merge(base, parent1, parent2):
 
 
 def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
-    """Tests whether the following text starts a conflict.
+    """Tests whether the text starting at line `start_index` is the beginning of a conflict.
     If not, returns None.
     If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict)
     where the first 3 elements of the tuple are lists of lines.
+    Args:
+      start_index: an index into `lines`.
+      lines: all the lines of the file with name `filename`.
     """
 
     if not lines[start_index].startswith("<<<<<<<"):
@@ -119,6 +123,8 @@ def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
     return (base, parent1, parent2, index - start_index)
 
 
+## Main starts here.
+
 with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp:
     file_len = len(lines)
     i = 0
diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh
index 53ea3b5ddb..0f3a695f81 100755
--- a/src/scripts/merge_tools/spork.sh
+++ b/src/scripts/merge_tools/spork.sh
@@ -13,7 +13,7 @@ if [ "$#" -ne 3 ]; then
     exit 1
 fi
 
-# Kill all java processes that are running for over an hour (to avoid memory leaks)
+# Kill all Java processes that are running for over an hour (to avoid memory leaks).
 killall -9 java --older-than 1h
 
 SCRIPT_PATH="$(dirname "$0")"; SCRIPT_PATH="$(eval "cd \"$SCRIPT_PATH\" && pwd")"

From 5914031a6744f8bf684f49622432c0dbfc734317 Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Thu, 14 Sep 2023 16:46:37 -0700
Subject: [PATCH 5/6] Add comments (#216)

---
 src/python/merge_tools_comparator.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py
index 69b5bb5f00..1818263a59 100755
--- a/src/python/merge_tools_comparator.py
+++ b/src/python/merge_tools_comparator.py
@@ -207,6 +207,7 @@ def merger(  # pylint: disable=too-many-locals
     random.shuffle(merger_arguments)
 
     print("merge_tools_comparator: Finished Constructing Inputs")
+    # New merges are merges whose analysis does not appear in the output folder.
     print("merge_tools_comparator: Number of new merges:", len(merger_arguments))
 
     print("merge_tools_comparator: Started Merging")
@@ -280,10 +281,13 @@ def merger(  # pylint: disable=too-many-locals
             df.to_csv(output_file, index_label="idx")
         n_total_compared += len(df)
 
+    # This is the number of merges whose "two merge tools differ" bit has been set (to true or
+    # false).
     print(
         "merge_tools_comparator: Number of merge tool outputs that have been newly compared:",
         n_new_compared,
     )
+    # This is the number of merges whose "two merge tools differ" bit has been set to true.
     print(
         "merge_tools_comparator: Total number of merge tool outputs that have been compared:",
         n_total_compared,

From a76a1a698b333bd1ddfd549293e8fad926d38878 Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Tue, 19 Sep 2023 09:10:50 -0700
Subject: [PATCH 6/6] Reduce code duplication (#212)
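
Note on resolve-import-conflicts-in-file.py (patch 4): the new docstring
sentence, "It simplistically leaves all the imports that appear in either
parent", means that an import-only conflict is resolved by keeping the union
of the two sides' import lines. Below is a minimal sketch of that idea; it
assumes both conflict sides contain only import lines, and it illustrates the
strategy rather than reproducing the script's actual merge function.

    def union_of_imports(parent1, parent2):
        # Keep every import line that appears in either parent, without
        # duplicates, preserving parent1's lines first.
        merged = list(parent1)
        for line in parent2:
            if line not in merged:
                merged.append(line)
        return merged


    # Example: each parent added a different import; the resolution keeps both.
    left = ["import java.util.List;\n"]
    right = ["import java.util.Map;\n"]
    print("".join(union_of_imports(left, right)))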