From 606b16c5572ad5bc7db17ed37a75dcf0150e41cc Mon Sep 17 00:00:00 2001 From: Benedikt schesch Date: Sun, 24 Sep 2023 13:34:56 -0700 Subject: [PATCH] Improvements --- run.sh | 2 +- src/python/cache_utils.py | 211 +++++++----------- ...olders.py => delete_cache_placeholders.py} | 12 +- src/python/latex_output.py | 12 +- src/python/merge_differ.py | 35 ++- src/python/merge_tester.py | 10 +- src/python/merge_tools_comparator.py | 63 +++--- src/python/repo.py | 55 ++--- src/python/test_repo_heads.py | 5 +- src/python/variables.py | 5 +- src/scripts/merge_tools/intellimerge.sh | 3 +- src/scripts/utils/run_multiple_machines.sh | 3 +- src/scripts/utils/run_remotely.sh | 2 +- 13 files changed, 175 insertions(+), 243 deletions(-) rename src/python/{clean_cache_placeholders.py => delete_cache_placeholders.py} (65%) diff --git a/run.sh b/run.sh index 9b1c1594a9..bc592a5872 100755 --- a/run.sh +++ b/run.sh @@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then find "$CACHE_DIR" -name "*.lock" -delete fi -python3 src/python/clean_cache_placeholders.py \ +python3 src/python/delete_cache_placeholders.py \ --cache_dir "$CACHE_DIR" python3 src/python/write_head_hashes.py \ diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py index b7501a9e6a..2ef4864ca6 100755 --- a/src/python/cache_utils.py +++ b/src/python/cache_utils.py @@ -19,9 +19,6 @@ CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds TIMEOUT = 90 * 60 # 90 minutes, in seconds -# TODO: In this file, please place the externally-visible or exported functions -# first, and then a comment marker of some kind, and then the "private" ones. - def slug_repo_name(repo_slug: str) -> str: """Given a GitHub repository slug ("owner/reponame"), returns the reponame. @@ -33,99 +30,13 @@ def slug_repo_name(repo_slug: str) -> str: return repo_slug.split("/")[1] -# TODO: Throughout, please use "_directory" instead of "_prefix" in -# variable names, when the variable actually holds a directory. I -# feel this will be clearer. -def get_cache_lock(repo_slug: str, cache_prefix: Path): - """Returns a lock for the cache of a repository. - Initially the lock is unlocked; the caller must explictly - lock and unlock the lock. - Args: - repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The path to the cache directory. - Returns: - fasteners.InterProcessLock: A lock for the repository. - """ - lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock") - lock_path.parent.mkdir(parents=True, exist_ok=True) - lock = fasteners.InterProcessLock(lock_path) - return lock - - -def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path: - """Returns the path to the cache file. - Args: - repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The path to the cache directory. - Returns: - Path: The path to the cache file. - """ - cache_file_name = slug_repo_name(repo_slug) + ".json" - cache_path = cache_prefix / cache_file_name - cache_path.parent.mkdir(parents=True, exist_ok=True) - return cache_path - - -def is_in_cache( - cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path -) -> bool: - """Checks if the key is in the cache. - Args: - cache_key (Union[Tuple,str]): The key to check. - repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The path prefix to the cache directory. - Returns: - bool: True if the repository is in the cache, False otherwise. 
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-    # TODO: "the cache": which one? Also, does this load a cache or a cache
-    # entry? Please define those terms to help the reader. This method returns
-    # a cache, which is a dict. `cache_lookup` returns a cache entry, which is
-    # also a dict. What are they dicts from and to?
-    """Loads the cache.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    if not cache_path.exists():
-        return {}
-    with open(cache_path, "r") as f:
-        cache_data = json.load(f)
-    return cache_data
-
-
-def cache_lookup(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-    """Loads the cache and returns the value for the given key.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache[cache_key]
-
-
-# TODO: The distinction between write_to_cache and set_in_cache is not clear to
-# me. If it is only locking, then please give them the same name, but with one
-# suffixed by "_without_lock". Also please indicate under what circumstances
-# each should be called.
-def write_to_cache(
+def set_in_cache(
     cache_key: Union[Tuple, str],
     cache_value: Union[str, dict, None],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     overwrite: bool = True,
+    acquire_lock: bool = True,
 ) -> None:
     """Puts an entry in the cache, then writes the cache to disk.
-    This function is not thread-safe.
+    This function is thread-safe when acquire_lock is True (the default).
+    Pass acquire_lock=False only when the caller already holds the cache lock.
@@ -133,14 +44,15 @@
         cache_key (Union[Tuple,str]): The key to check.
         cache_value (dict): The value to write.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
             If False, attempting to overwrite an existing cache entry throws an exception.
+        acquire_lock (bool, optional) = True: Whether to acquire the cache lock before
+            writing. Set this to False when the caller already holds the lock.
     """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    cache = load_cache(repo_slug, cache_prefix)
-    if cache_prefix != Path("cache-small/sha_cache_entry"):
-        print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
+    if acquire_lock:
+        lock = get_cache_lock(repo_slug, cache_directory)
+        lock.acquire()
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    cache = load_cache(repo_slug, cache_directory)
     if cache_key in cache and not overwrite:
         raise ValueError("Cache key already exists")
     cache[cache_key] = cache_value
@@ -148,57 +60,36 @@
     output = json.dumps(cache, indent=4)
     with open(cache_path, "w") as f:
         f.write(output)
         f.flush()
+    if acquire_lock:
+        lock.release()  # type: ignore
 
 
-def set_in_cache(
-    cache_key: Union[Tuple, str],
-    cache_value: Union[str, dict, None],
-    repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
-) -> None:
-    """Puts an entry in the cache, then writes the cache to disk.
-    This function is thread-safe.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        cache_value (dict): The value to write.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-    """
-    lock = get_cache_lock(repo_slug, cache_prefix)
-    lock.acquire()
-    write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-    lock.release()
-
-
-# TODO: I'm not clear what is the difference between `cache_lookup` and
-# `lookup_in_cache`. Please clarify, and indicate when each should be called.
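For reference, the locking discipline behind the new acquire_lock flag, as an illustrative sketch rather than part of the patch; it assumes cache_utils is importable and relies on fasteners.InterProcessLock not being reentrant, so re-acquiring a held lock in the same process would block:

    from pathlib import Path
    from cache_utils import get_cache_lock, set_in_cache

    cache_directory = Path("cache")
    repo_slug = "owner/reponame"  # hypothetical slug

    # Common case: set_in_cache acquires and releases the lock itself.
    set_in_cache("some_key", {"status": "done"}, repo_slug, cache_directory)

    # Compound case: several cache operations under one lock acquisition.
    # The callee must not re-acquire the held lock, hence acquire_lock=False;
    # this is exactly how lookup_in_cache below inserts its placeholder.
    lock = get_cache_lock(repo_slug, cache_directory)
    lock.acquire()
    try:
        set_in_cache("some_key", None, repo_slug, cache_directory, acquire_lock=False)
    finally:
        lock.release()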
 def lookup_in_cache(
     cache_key: Union[Tuple, str],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     set_run: bool = False,
 ) -> Union[str, dict, None]:
     """Checks if the cache is available and loads a specific entry.
     Args:
         cache_key (Union[Tuple,str]): The key to check.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         set_run (bool, optional) = False: Whether to insert an empty cache entry
             if it does not exist. This is useful for preventing multiple runs from
             attempting to insert the same cache entry.
     Returns:
         Union[dict,None]: The cache entry if it exists, None otherwise.
     """
-    lock = get_cache_lock(repo_slug, cache_prefix)
+    lock = get_cache_lock(repo_slug, cache_directory)
     lock.acquire()
-    cache_entry = get_cache_path(repo_slug, cache_prefix)
+    cache_entry = get_cache_path(repo_slug, cache_directory)
     cache_entry.parent.mkdir(parents=True, exist_ok=True)
-    if is_in_cache(cache_key, repo_slug, cache_prefix):
+    if is_in_cache(cache_key, repo_slug, cache_directory):
         total_time = 0
         while True:
-            cache_data = cache_lookup(cache_key, repo_slug, cache_prefix)
+            cache = load_cache(repo_slug, cache_directory)
+            cache_data = cache[cache_key]
             if cache_data is not None:
                 break
             lock.release()
@@ -210,6 +101,70 @@ def lookup_in_cache(
             lock.release()
             return cache_data
     if set_run:
-        write_to_cache(cache_key, None, repo_slug, cache_prefix)
+        set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False)
     lock.release()
     return None
+
+
+# ====================== Internal functions ======================
+
+
+def get_cache_lock(repo_slug: str, cache_directory: Path):
+    """Returns a lock for the cache of a repository.
+    Initially the lock is unlocked; the caller must explicitly
+    lock and unlock the lock.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        fasteners.InterProcessLock: A lock for the repository.
+    """
+    lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock")
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    lock = fasteners.InterProcessLock(lock_path)
+    return lock
+
+
+def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
+    """Returns the path to the cache file.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        Path: The path to the cache file.
+    """
+    cache_file_name = slug_repo_name(repo_slug) + ".json"
+    cache_path = cache_directory / cache_file_name
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    return cache_path
+
+
+def is_in_cache(
+    cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path
+) -> bool:
+    """Checks if the key is in the cache.
+    Args:
+        cache_key (Union[Tuple,str]): The key to check.
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        bool: True if the repository is in the cache, False otherwise.
+    """
+    cache = load_cache(repo_slug, cache_directory)
+    return cache_key in cache
+
+
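The functions above combine into a placeholder protocol: a worker that finds no entry leaves a None placeholder so that concurrent workers back off instead of recomputing. A sketch of the intended usage, illustrative only (the key, value, and helper name are hypothetical), mirroring how merger() in merge_tools_comparator.py uses the API:

    from pathlib import Path
    from cache_utils import lookup_in_cache, set_in_cache

    def compute_or_reuse(cache_key, repo_slug: str, cache_directory: Path) -> dict:
        # set_run=True inserts a None placeholder under the lock when the key
        # is absent, marking the computation as "in progress".
        entry = lookup_in_cache(cache_key, repo_slug, cache_directory, set_run=True)
        if isinstance(entry, dict):
            return entry  # a previous run already computed this entry
        result = {"status": "computed"}  # stand-in for the real computation
        set_in_cache(cache_key, result, repo_slug, cache_directory)
        return result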
+def load_cache(repo_slug: str, cache_directory: Path) -> dict:
+    """Loads the cache associated with the repo_slug found in the cache directory.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        dict: The cache.
+    """
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    if not cache_path.exists():
+        return {}
+    with open(cache_path, "r") as f:
+        cache_data = json.load(f)
+    return cache_data
diff --git a/src/python/clean_cache_placeholders.py b/src/python/delete_cache_placeholders.py
similarity index 65%
rename from src/python/clean_cache_placeholders.py
rename to src/python/delete_cache_placeholders.py
index c2d861c7b3..22acd35c11 100755
--- a/src/python/clean_cache_placeholders.py
+++ b/src/python/delete_cache_placeholders.py
@@ -1,6 +1,4 @@
 #!/usr/bin/env python3
-# TODO: This file uses "clean", "remove", and "delete". Please choose one term
-# and stick to it, in both the code and in the file name.
 """
 Deletes all placeholders from the cache. Placeholders are created when a
 process starts; a placeholder indicates that its process has started and is still running.
 result. This script deletes all placeholders from the cache.
 
 Usage:
-    python clean_cache_placeholders.py --cache_path
+    python delete_cache_placeholders.py --cache_directory
 
 Args:
-TODO: Throughout, please don't use "_path" as a variable suffix, because it is
-# not sufficiently precise. Instead, use either "_file" or "_directory".
-    cache_path (str): Path to cache directory
+    cache_directory (str): Path to cache directory
 """
 
 from argparse import ArgumentParser
@@ -27,10 +23,10 @@
     )
     args = parser.parse_args()
 
-    cache_path = Path(args.cache_dir)
+    cache_directory = Path(args.cache_dir)
 
     n_deleted = 0
-    for file in cache_path.glob("**/*.json"):
+    for file in cache_directory.glob("**/*.json"):
         with open(file, "r") as f:
             data = json.load(f)
 
diff --git a/src/python/latex_output.py b/src/python/latex_output.py
index ef75f08243..54b01bedff 100755
--- a/src/python/latex_output.py
+++ b/src/python/latex_output.py
@@ -14,10 +14,9 @@
 following input files:
 - full_repos_csv: csv file containing the full list of repositories
 - repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
-- tested_merges_path: path to the folder containing the merge results
-- merges_path: path to the folder containing all found merges.
-TODO: Throughout, be consistent about "directory" vs "folder".
-- output_dir: path to the folder where the LaTeX files will be saved
+- tested_merges_path: path to the directory containing the merge results
+- merges_path: path to the directory containing all found merges.
+- output_dir: path to the directory where the LaTeX files will be saved
 """
 
 
@@ -77,7 +76,7 @@ def latex_def(name, value) -> str:
 
 
 # Dictionary that lists the different subsets of merge tools for which plots
-# and tables are generated. The key is the folder name which will contain all figures
+# and tables are generated. The key is the directory name which will contain all figures
 # that will be used and the value is the list of plots to contain.
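# For example, a hypothetical entry (the tool names here are invented for
# illustration and are not taken from MERGE_TOOL):
#     "git_tools": ["gitmerge_ort", "gitmerge_recursive"]
# would produce a directory "git_tools" of figures and tables comparing only
# those two merge tools.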
PLOTS = { "all": [merge_tool.name for merge_tool in MERGE_TOOL], @@ -93,9 +92,6 @@ def latex_def(name, value) -> str: ], } -# TODO: I suggest that these should be declared in repo.py, or move the enums to -# a different file where this can also be defined. Or, define predicates for -# clients to use rather than exporting these lists. MERGE_CORRECT_NAMES = [ TEST_STATE.Tests_passed.name, ] diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py index bc36bd7cb6..574eddef00 100755 --- a/src/python/merge_differ.py +++ b/src/python/merge_differ.py @@ -4,10 +4,10 @@ --merges_path --cache_dir This script compares the results of two merge tools on the same merge. -# TODO: Where is the output? Standard out? It outputs the diff of the two merge results if their results differ. -# TODO: Does "ignores" mean there is no output, or is it something deeper? -It ignores merges that have the same result or merges that are not successful. +It does not produce output for merges that have the same result or merges that +are not successful. +The output is written in cache_dir/merge_diffs. """ import os @@ -32,7 +32,7 @@ def get_merge_fingerprint( - merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path + merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_directory: Path ) -> Union[Tuple[None, None], Tuple[Repository, str]]: """Returns the repo and the fingerprint of a merge, or (None, None) if the merge is not successful. @@ -51,7 +51,7 @@ def get_merge_fingerprint( return None, None left = merge_data["left"] right = merge_data["right"] - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( merge_status, merge_fingerprint, @@ -81,39 +81,34 @@ def get_merge_fingerprint( def merge_differ(args: Tuple[pd.Series, Path]) -> None: - # TODO: What does this funtion do with the diff? I think it writes to a file? Or does it populate a cache too? """Diffs the results of every two merge tools on the same merge. - Does not diff merges that have the same result or merges that are not successful. + Writes the diff to a file. Args: args (Tuple[pd.Series,Path]): A tuple containing the merge data and the cache prefix. - Returns: - TODO: How is "The result of the test." related to the diff of two merge tools? I don't see this function returning anything. - dict: The result of the test. """ - merge_data, cache_prefix = args + merge_data, cache_directory = args if not merge_data["parents pass"]: return - # TODO: See other comment about "_prefix" in variable names; change throughout. 
- diff_file_prefix = cache_prefix / "merge_diffs" - diff_file_prefix.mkdir(parents=True, exist_ok=True) + diff_file_directory = cache_directory / "merge_diffs" + diff_file_directory.mkdir(parents=True, exist_ok=True) for merge_tool1 in MERGE_TOOL: repo1, merge_fingerprint1 = get_merge_fingerprint( - merge_data, merge_tool1, cache_prefix + merge_data, merge_tool1, cache_directory ) if repo1 is None or merge_fingerprint1 is None: continue for merge_tool2 in MERGE_TOOL: repo2, merge_fingerprint2 = get_merge_fingerprint( - merge_data, merge_tool2, cache_prefix + merge_data, merge_tool2, cache_directory ) if repo2 is None or merge_fingerprint2 is None: continue - diff_file = diff_file_prefix / diff_file_name( + diff_file = diff_file_directory / diff_file_name( merge_fingerprint1, merge_fingerprint2 ) if diff_file.exists(): @@ -137,9 +132,8 @@ def diff_file_name(sha1: str, sha2: str) -> Path: """ # Use lexicographic order to prevent duplicates if sha1 < sha2: - # TODO: Why does this use ".txt" rather than ".diff" as the file extension? - return Path(sha1 + "_" + sha2 + ".txt") - return Path(sha2 + "_" + sha1 + ".txt") + return Path(sha1 + "_" + sha2 + ".diff") + return Path(sha2 + "_" + sha1 + ".diff") if __name__ == "__main__": @@ -150,7 +144,6 @@ def diff_file_name(sha1: str, sha2: str) -> Path: parser.add_argument("--cache_dir", type=Path, default="cache/") args = parser.parse_args() cache_dir = Path(args.cache_dir) - # TODO: See comment elsewhere about "_path" in variable names. cache_diffs_path = cache_dir / "merge_diffs" cache_diffs_path.mkdir(parents=True, exist_ok=True) diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index bc840976f7..d0c51a8701 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -42,7 +42,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: Returns: pd.Series: The result of the test. """ - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args while psutil.cpu_percent() > 90: print( "merge_tester: Waiting for CPU load to come down", @@ -56,7 +56,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data["parents pass"] = False # TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.) for branch in ["left", "right"]: - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) repo.checkout(merge_data[branch]) tree_fingerprint = repo.compute_tree_fingerprint() if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]: @@ -74,15 +74,12 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data[f"{branch} test result"] = test_result.name if test_result != TEST_STATE.Tests_passed: return merge_data - # TODO: It looks suspicious to me that the repo is only sometimes, not always, deleted. - del repo merge_data["parents pass"] = True for merge_tool in MERGE_TOOL: if is_merge_success(merge_data[merge_tool.name]): - # TODO: I suggest abstracting the body of this loop into a separate function, since it stands on its own logically. 
- repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( result, merge_fingerprint, @@ -111,7 +108,6 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: ) # Update the status from merge success to test result. merge_data[merge_tool.name] = result.name - del repo assert merge_tool.name in merge_data return merge_data diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py index 2fcb574c1e..4b06bb7295 100755 --- a/src/python/merge_tools_comparator.py +++ b/src/python/merge_tools_comparator.py @@ -51,16 +51,12 @@ def merger( # pylint: disable=too-many-locals Returns: dict: A dictionary containing the merge result. """ - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args - # TODO: I think this is a key, not an entry. Please clarify. - # TODO: Why are these two SHAs (if they are SHAs) not sorted as is done elsewhere? - cache_entry = merge_data["left"] + "_" + merge_data["right"] - merge_cache_prefix = cache_prefix / "merge_results" + cache_key = merge_data["left"] + "_" + merge_data["right"] + merge_cache_directory = cache_directory / "merge_results" - result = lookup_in_cache(cache_entry, repo_slug, merge_cache_prefix, True) - # if result is None: - # raise Exception(cache_entry," HERE") + result = lookup_in_cache(cache_key, repo_slug, merge_cache_directory, True) if result is not None and isinstance(result, dict): return result @@ -74,9 +70,9 @@ def merger( # pylint: disable=too-many-locals merge_tool.name, ) cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []} - # TODO: I suggest abstracting out the body of this loop as a function. + # TODO: I suggest abstracting out the body of this loop as a function. for i in range(N_REPETITIONS): - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( merge_status, merge_fingerprint, @@ -91,7 +87,7 @@ def merger( # pylint: disable=too-many-locals timeout=TIMEOUT_MERGING, ) log_file: Path = ( - cache_prefix + cache_directory / "merge_logs" / slug_repo_name(repo_slug) / merge_data["left"] @@ -132,11 +128,34 @@ def merger( # pylint: disable=too-many-locals else: assert cache_data["left_tree_fingerprint"] == left_fingerprint assert cache_data["right_tree_fingerprint"] == right_fingerprint - del repo - set_in_cache(cache_entry, cache_data, repo_slug, merge_cache_prefix) + set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory) return cache_data +def check_if_two_merges_differ(cache_data: dict) -> bool: + """Returns true if two merge tools differ on the same merge. + Args: + cache_data (dict): A dictionary containing the merge data. + Returns: + bool: True if two merge tools differ on the same merge. 
+    """
+    for merge_tool1 in MERGE_TOOL:
+        for merge_tool2 in MERGE_TOOL:
+            if is_merge_success(
+                cache_data[merge_tool1.name]["results"][0]
+            ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
+                if (
+                    cache_data[merge_tool1.name]["merge_fingerprint"]
+                    != cache_data[merge_tool2.name]["merge_fingerprint"]
+                ):
+                    return True
+    return False
+
+
 if __name__ == "__main__":
     print("merge_tools_comparator: Start")
     parser = argparse.ArgumentParser()
@@ -228,22 +247,8 @@
         repo_slug = merger_arguments[i][0]
         merge_data = merger_arguments[i][1]
        cache_data = merger_results[i]
-        two_merge_tools_differ = False
-        # TODO: I suggest abstracting this loop into a function. That will also make it easier to return early, which can be done just with "return". (Currently it makes more iterations even after discovering that two merge tools do differ.)
-        for merge_tool1 in MERGE_TOOL:
-            for merge_tool2 in MERGE_TOOL:
-                if is_merge_success(
-                    cache_data[merge_tool1.name]["results"][0]
-                ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
-                    two_merge_tools_differ = True
-                if is_merge_success(
-                    cache_data[merge_tool1.name]["results"][0]
-                ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
-                    if (
-                        cache_data[merge_tool1.name]["merge_fingerprint"]
-                        != cache_data[merge_tool2.name]["merge_fingerprint"]
-                    ):
-                        two_merge_tools_differ = True
+        two_merge_tools_differ = check_if_two_merges_differ(cache_data)
+
         merge_data["two merge tools differ"] = two_merge_tools_differ
         merge_data["left_tree_fingerprint"] = cache_data["left_tree_fingerprint"]
         merge_data["right_tree_fingerprint"] = cache_data["right_tree_fingerprint"]
diff --git a/src/python/repo.py b/src/python/repo.py
index 04c6e88c72..c332c564c3 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -76,12 +76,10 @@ def compute_explanation(
 def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]:
     """Returns the result of run_repo_tests.sh on the given working copy.
-    TODO: What is the definition of "the entire test"? How is it different than the test process?
-    If the test process passes then the entire test is marked as passed.
-    If the test process timeouts then the entire test is marked as timeout.
+    If the test process passes then the function returns and marks it as passed.
+    If the test process times out then the function returns and marks it as timed out.
     Args:
-        TODO: Minor: Throughout, you don't need "The path of" at the beginning of comments. Or change it to "The directory of".
-        wcopy_dir (Path): The path of the working copy (the clone).
+        wcopy_dir (Path): The directory of the working copy (the clone).
         timeout (int): Test timeout limit, in seconds.
     Returns:
         TEST_STATE: The result of the test.
@@ -114,23 +112,23 @@ class Repository:
     def __init__(
         self,
         repo_slug: str,
-        cache_prefix: Path = Path(""),
+        cache_directory: Path = Path(""),
     ) -> None:
         """Initializes the repository.
         Args:
             repo_slug (str): The slug of the repository, which is "owner/reponame".
-            cache_prefix (Path): The prefix of the cache.
+            cache_directory (Path): The directory in which the repository's caches are stored.
""" self.repo_slug = repo_slug self.path = REPOS_PATH / repo_slug.split("/")[1] workdir_id = uuid.uuid4().hex - self.workdir = WORKDIR_PREFIX / workdir_id + self.workdir = WORKDIR_DIRECTORY / workdir_id self.workdir.mkdir(parents=True, exist_ok=True) self.repo_path = self.workdir / self.path.name shutil.copytree(self.path, self.repo_path) self.repo = Repo(self.repo_path) - self.test_cache_prefix = cache_prefix / "test_cache" - self.sha_cache_prefix = cache_prefix / "sha_cache_entry" + self.test_cache_directory = cache_directory / "test_cache" + self.sha_cache_directory = cache_directory / "sha_cache_entry" def checkout(self, commit: str) -> Tuple[bool, str]: """Checks out the given commit. @@ -154,13 +152,13 @@ def checkout(self, commit: str) -> Tuple[bool, str]: + str(e) ) cache_entry = {"sha": None, "explanation": explanation} - set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix) + set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory) return False, explanation cache_entry = { "sha": self.compute_tree_fingerprint(), "explanation": explanation, } - set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix) + set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory) return True, explanation def _merge_and_test( # pylint: disable=too-many-arguments @@ -190,11 +188,8 @@ def _merge_and_test( # pylint: disable=too-many-arguments str: The tree fingerprint of the merge result. str: The left fingerprint. str: The right fingerprint. - TODO: Is this time to merge, time to test, or time to merge and test? - TODO: This is a ssingle number, but n_tests tells how many times to - TODO: run the test. Is this an average, median, overall, or something - TODO: else? (I have the same question elsewhere.) - float: The time it took to run the merge, in seconds. + float: The time it took to run the merge, in seconds. Does not include + the test time. """ ( merge_status, @@ -287,7 +282,7 @@ def create_branch( commit, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return None, explanation tree_fingerprint = self.compute_tree_fingerprint() @@ -335,7 +330,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( MERGE_STATE.Git_checkout_failed, @@ -387,7 +382,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, cache_entry, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( merge_status, @@ -421,18 +416,18 @@ def compute_tree_fingerprint(self) -> str: def get_sha_cache_entry( self, commit: str, start_merge: bool = False ) -> Union[None, dict]: - # TODO: is a SHA cache a cache from a SHA to something, or a cache from something to a SHA? - """Gets a SHA cache entry. + """The SHA cache maps a commit to a tree fingerprint. + This function checks if the commit is in the cache. Args: commit (str): The commit to check. start_merge (bool, optional) = False: Whether to indicate that the merge starts if the commit is not in the cache. Returns: - # TODO: What is the structure of the dict this returns? - Union[None,dict]: The cache entry if the commit is in the cache, None otherwise. + Union[None, dict]: The cache entry if the commit is in the cache, None otherwise. + The dict contains the sha entry and an explanation entry. 
""" cache = lookup_in_cache( - commit, self.repo_slug, self.sha_cache_prefix, set_run=start_merge + commit, self.repo_slug, self.sha_cache_directory, set_run=start_merge ) if cache is None: return None @@ -453,7 +448,7 @@ def get_test_cache_entry( None otherwise. """ cache = lookup_in_cache( - sha, self.repo_slug, self.test_cache_prefix, set_run=start_test + sha, self.repo_slug, self.test_cache_directory, set_run=start_test ) if cache is None: return None @@ -511,10 +506,8 @@ def checkout_and_test( def test(self, timeout: int, n_tests: int) -> TEST_STATE: """Tests the repository. The test results of multiple runs is combined into one result. If one of the runs passes then the entire test is marked as passed. - # TODO: Should this line be preceded by "otherwise"? Is it possible that one run passes and another times out? - # TODO: I would think the test should be marked as timeout only if every run is timeout. If one of the runs timeouts then the entire test is marked as timeout. - # TODO: Otherwise the test is marked as failure? + Otherwise all runs must fail for the entire test to be marked as failed. Args: timeout (int): The timeout limit, in seconds. n_tests (int): The number of times to run the test suite. @@ -534,7 +527,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: test_state, test_output = repo_test(self.repo_path, timeout) test_log_file = Path( os.path.join( - self.test_cache_prefix, + self.test_cache_directory, "logs", slug_repo_name(self.repo_slug), sha + "_" + str(i) + ".log", @@ -551,7 +544,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: if test_state in (TEST_STATE.Tests_passed, TEST_STATE.Tests_timedout): break - set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_prefix) + set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_directory) return TEST_STATE[cache_data["test_result"]] def __del__(self) -> None: diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py index 3604942293..714007b2f1 100755 --- a/src/python/test_repo_heads.py +++ b/src/python/test_repo_heads.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -# TODO: What does it mean to "validate it"? -"""Tests the HEAD commits of multiple repos and validates it if the test passes. +"""Tests the HEAD commits of multiple repos and considers them as valid if the test passes. usage: python3 test_repo_heads.py --repos_csv_with_hashes --output_path @@ -43,7 +42,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE: repo_slug = repo_info["repository"] print("test_repo_heads:", repo_slug, ": head_passes_tests : started") - repo = Repository(repo_slug, cache_prefix=cache) + repo = Repository(repo_slug, cache_directory=cache) test_state = repo.checkout_and_test( repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3 ) diff --git a/src/python/variables.py b/src/python/variables.py index 1ce260b1f0..d81c733af9 100644 --- a/src/python/variables.py +++ b/src/python/variables.py @@ -9,8 +9,9 @@ CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds DELETE_WORKDIRS = True REPOS_PATH = Path("repos") -# TODO: What is this used for? Please give it a more descriptive name. -WORKDIR_PREFIX = Path(".workdir") +WORKDIR_DIRECTORY = Path( + ".workdir" +) # Merges and testing will be performed in this directory. 
TIMEOUT_MERGING = 60 * 15 # 15 minutes, in seconds N_REPETITIONS = 3 diff --git a/src/scripts/merge_tools/intellimerge.sh b/src/scripts/merge_tools/intellimerge.sh index c79bd27073..f4fa8d157c 100755 --- a/src/scripts/merge_tools/intellimerge.sh +++ b/src/scripts/merge_tools/intellimerge.sh @@ -21,8 +21,7 @@ intellimerge_absolutepath="${ROOT_DIR}/${intellimerge_relativepath}" clone_dir=$1 branch1=$2 branch2=$3 -# TODO: Will this be in the repository? Would it be safer to put it under /tmp or /scratch? -temp_dir=".workdir/intelli_temp_$$/" +temp_dir="/tmp/intelli_temp_$$/" mkdir $temp_dir # run intellimerge diff --git a/src/scripts/utils/run_multiple_machines.sh b/src/scripts/utils/run_multiple_machines.sh index 2c2b7a5351..3db4743239 100755 --- a/src/scripts/utils/run_multiple_machines.sh +++ b/src/scripts/utils/run_multiple_machines.sh @@ -1,6 +1,5 @@ #!/bin/bash -# TODO: A branch of what? This repository? (Same question in `run_remotely.sh`.) -# Run the code for a certain branch on multiple remote machines. +# Run the code of a specific branch of this repository on multiple remote machines. # Usage: ./run_multiple_machines.sh # is the branch to run # is the path to a file containing the addresses of the machines (one address per line) diff --git a/src/scripts/utils/run_remotely.sh b/src/scripts/utils/run_remotely.sh index 44b60dd87c..86f9751ef8 100755 --- a/src/scripts/utils/run_remotely.sh +++ b/src/scripts/utils/run_remotely.sh @@ -1,5 +1,5 @@ #!/bin/sh -# Run the code for a certain branch on a remote machine. +# Run the code of a specific branch of this repository on a remote machine. # Usage: ./run_remotely.sh # is the branch to run # is the ssh address of the machine
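The intellimerge.sh change above moves the tool's scratch space from the repository working tree to a per-process directory under /tmp. A Python counterpart of the same idea, illustrative only and not part of the patch:

    import shutil
    import tempfile

    # tempfile.mkdtemp creates a uniquely named directory outside the
    # repository, playing the role of /tmp/intelli_temp_$$ without relying
    # on the shell PID for uniqueness.
    temp_dir = tempfile.mkdtemp(prefix="intelli_temp_")
    try:
        pass  # run the merge tool with temp_dir as its scratch directory
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)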