Improvements

benedikt-schesch committed Sep 24, 2023
1 parent 18ed5f6 commit 606b16c
Showing 13 changed files with 175 additions and 243 deletions.
2 changes: 1 addition & 1 deletion run.sh
@@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then
find "$CACHE_DIR" -name "*.lock" -delete
fi

-python3 src/python/clean_cache_placeholders.py \
+python3 src/python/delete_cache_placeholders.py \
--cache_dir "$CACHE_DIR"

python3 src/python/write_head_hashes.py \
211 changes: 83 additions & 128 deletions src/python/cache_utils.py
@@ -19,9 +19,6 @@
CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds
TIMEOUT = 90 * 60 # 90 minutes, in seconds

-# TODO: In this file, please place the externally-visible or exported functions
-# first, and then a comment marker of some kind, and then the "private" ones.
-

def slug_repo_name(repo_slug: str) -> str:
"""Given a GitHub repository slug ("owner/reponame"), returns the reponame.
@@ -33,172 +30,66 @@ def slug_repo_name(repo_slug: str) -> str:
return repo_slug.split("/")[1]

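For reference, the slug convention used throughout this module keeps only the part after the slash:

    # slug_repo_name("octocat/hello-world") returns "hello-world"
    assert slug_repo_name("octocat/hello-world") == "hello-world"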

-# TODO: Throughout, please use "_directory" instead of "_prefix" in
-# variable names, when the variable actually holds a directory. I
-# feel this will be clearer.
-def get_cache_lock(repo_slug: str, cache_prefix: Path):
-"""Returns a lock for the cache of a repository.
-Initially the lock is unlocked; the caller must explictly
-lock and unlock the lock.
-Args:
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-fasteners.InterProcessLock: A lock for the repository.
-"""
-lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock")
-lock_path.parent.mkdir(parents=True, exist_ok=True)
-lock = fasteners.InterProcessLock(lock_path)
-return lock
-
-
-def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
-"""Returns the path to the cache file.
-Args:
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-Path: The path to the cache file.
-"""
-cache_file_name = slug_repo_name(repo_slug) + ".json"
-cache_path = cache_prefix / cache_file_name
-cache_path.parent.mkdir(parents=True, exist_ok=True)
-return cache_path
-
-
-def is_in_cache(
-cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> bool:
-"""Checks if the key is in the cache.
-Args:
-cache_key (Union[Tuple,str]): The key to check.
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path prefix to the cache directory.
-Returns:
-bool: True if the repository is in the cache, False otherwise.
-"""
-cache = load_cache(repo_slug, cache_prefix)
-return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-# TODO: "the cache": which one? Also, does this load a cache or a cache
-# entry? Please define those terms to help the reader. This method returns
-# a cache, which is a dict. `cache_lookup` returns a cache entry, which is
-# also a dict. What are they dicts from and to?
-"""Loads the cache.
-Args:
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-dict: The cache.
-"""
-cache_path = get_cache_path(repo_slug, cache_prefix)
-if not cache_path.exists():
-return {}
-with open(cache_path, "r") as f:
-cache_data = json.load(f)
-return cache_data
-
-
-def cache_lookup(
-cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-"""Loads the cache and returns the value for the given key.
-Args:
-cache_key (Union[Tuple,str]): The key to check.
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-dict: The cache.
-"""
-cache = load_cache(repo_slug, cache_prefix)
-return cache[cache_key]
-
-
-# TODO: The distinction between write_to_cache and set_in_cache is not clear to
-# me. If it is only locking, then please give them the same name, but with one
-# suffixed by "_without_lock". Also please indicate under what circumstances
-# each should be called.
-def write_to_cache(
+def set_in_cache(
cache_key: Union[Tuple, str],
cache_value: Union[str, dict, None],
repo_slug: str,
-cache_prefix: Path,
+cache_directory: Path,
overwrite: bool = True,
+acquire_lock: bool = True,
) -> None:
"""Puts an entry in the cache, then writes the cache to disk.
This function is not thread-safe.
Args:
cache_key (Union[Tuple,str]): The key to check.
cache_value (dict): The value to write.
repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
+cache_directory (Path): The path to the cache directory.
overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
If False, attempting to overwrite an existing cache entry throws an exception.
"""
-cache_path = get_cache_path(repo_slug, cache_prefix)
-cache = load_cache(repo_slug, cache_prefix)
-if cache_prefix != Path("cache-small/sha_cache_entry"):
-print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
+if acquire_lock:
+lock = get_cache_lock(repo_slug, cache_directory)
+lock.acquire()
+cache_path = get_cache_path(repo_slug, cache_directory)
+cache = load_cache(repo_slug, cache_directory)
if cache_key in cache and not overwrite:
raise ValueError("Cache key already exists")
cache[cache_key] = cache_value
output = json.dumps(cache, indent=4)
with open(cache_path, "w") as f:
f.write(output)
f.flush()
+if acquire_lock:
+lock.release()  # type: ignore

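The merged function replaces the old write_to_cache/set_in_cache pair: a caller that already holds the repository lock passes acquire_lock=False, while ordinary callers let the function lock for itself. A minimal usage sketch (the key and value here are hypothetical, not taken from the repository):

    from pathlib import Path

    # Hypothetical standalone call; set_in_cache acquires and releases
    # the inter-process lock around its read-modify-write.
    set_in_cache(
        cache_key="some-key",
        cache_value={"result": "ok"},
        repo_slug="owner/reponame",
        cache_directory=Path("cache"),
    )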

-def set_in_cache(
-cache_key: Union[Tuple, str],
-cache_value: Union[str, dict, None],
-repo_slug: str,
-cache_prefix: Path,
-overwrite: bool = True,
-) -> None:
-"""Puts an entry in the cache, then writes the cache to disk.
-This function is thread-safe.
-Args:
-cache_key (Union[Tuple,str]): The key to check.
-cache_value (dict): The value to write.
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-"""
-lock = get_cache_lock(repo_slug, cache_prefix)
-lock.acquire()
-write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-lock.release()


-# TODO: I'm not clear what is the difference between `cache_lookup` and
-# `lookup_in_cache`. Please clarify, and indicate when each should be called.
def lookup_in_cache(
cache_key: Union[Tuple, str],
repo_slug: str,
-cache_prefix: Path,
+cache_directory: Path,
set_run: bool = False,
) -> Union[str, dict, None]:
"""Checks if the cache is available and loads a specific entry.
Args:
cache_key (Union[Tuple,str]): The key to check.
repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
+cache_directory (Path): The path to the cache directory.
set_run (bool, optional) = False: Whether to insert an empty cache entry
if it does not exist. This is useful for preventing multiple runs from
attempting to insert the same cache entry.
Returns:
Union[dict,None]: The cache entry if it exists, None otherwise.
"""
-lock = get_cache_lock(repo_slug, cache_prefix)
+lock = get_cache_lock(repo_slug, cache_directory)
lock.acquire()
-cache_entry = get_cache_path(repo_slug, cache_prefix)
+cache_entry = get_cache_path(repo_slug, cache_directory)
cache_entry.parent.mkdir(parents=True, exist_ok=True)
-if is_in_cache(cache_key, repo_slug, cache_prefix):
+if is_in_cache(cache_key, repo_slug, cache_directory):
total_time = 0
while True:
-cache_data = cache_lookup(cache_key, repo_slug, cache_prefix)
+cache = load_cache(repo_slug, cache_directory)
+cache_data = cache[cache_key]
if cache_data is not None:
break
lock.release()
@@ -210,6 +101,70 @@ def lookup_in_cache(
lock.release()
return cache_data
if set_run:
-write_to_cache(cache_key, None, repo_slug, cache_prefix)
+set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False)
lock.release()
return None

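The set_run flag makes lookup_in_cache usable as a claim-then-compute primitive: when the key is absent, a None placeholder is written under the lock, so concurrent runs see the entry as already claimed. A sketch of the intended pattern (compute_result is a hypothetical stand-in for the caller's real work):

    entry = lookup_in_cache(key, repo_slug, cache_dir, set_run=True)
    if entry is None:
        entry = compute_result()  # hypothetical expensive work
        set_in_cache(key, entry, repo_slug, cache_dir)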

+# ====================== Internal functions ======================
+
+
+def get_cache_lock(repo_slug: str, cache_directory: Path):
+"""Returns a lock for the cache of a repository.
+Initially the lock is unlocked; the caller must explicitly
+lock and unlock the lock.
+Args:
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+fasteners.InterProcessLock: A lock for the repository.
+"""
+lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock")
+lock_path.parent.mkdir(parents=True, exist_ok=True)
+lock = fasteners.InterProcessLock(lock_path)
+return lock

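Because the returned fasteners.InterProcessLock starts out unlocked, the caller owns the acquire/release pairing; a minimal sketch:

    from pathlib import Path

    lock = get_cache_lock("owner/reponame", Path("cache"))
    lock.acquire()
    try:
        pass  # read or update this repository's cache file
    finally:
        lock.release()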

+def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
+"""Returns the path to the cache file.
+Args:
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+Path: The path to the cache file.
+"""
+cache_file_name = slug_repo_name(repo_slug) + ".json"
+cache_path = cache_directory / cache_file_name
+cache_path.parent.mkdir(parents=True, exist_ok=True)
+return cache_path

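Concretely, each repository's cache file sits directly under the given directory; for example (directory name hypothetical):

    # get_cache_path("owner/reponame", Path("cache/sha_cache_entry"))
    #   -> Path("cache/sha_cache_entry/reponame.json")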

+def is_in_cache(
+cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path
+) -> bool:
+"""Checks if the key is in the cache.
+Args:
+cache_key (Union[Tuple,str]): The key to check.
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+bool: True if the key is in the cache, False otherwise.
+"""
+cache = load_cache(repo_slug, cache_directory)
+return cache_key in cache


+def load_cache(repo_slug: str, cache_directory: Path) -> dict:
+"""Loads the cache associated with the repo_slug found in the cache directory.
+Args:
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+dict: The cache.
+"""
+cache_path = get_cache_path(repo_slug, cache_directory)
+if not cache_path.exists():
+return {}
+with open(cache_path, "r") as f:
+cache_data = json.load(f)
+return cache_data
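For orientation: each repository's cache is a single flat JSON object, and entries whose value is null are the in-progress placeholders that delete_cache_placeholders.py removes. A hypothetical reponame.json (keys and entry shapes are assumptions, not taken from the repository):

    # {
    #     "3f2a9...": {"test result": "Tests_passed"},
    #     "77b1e...": null
    # }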
src/python/{clean_cache_placeholders.py → delete_cache_placeholders.py}
@@ -1,19 +1,15 @@
#!/usr/bin/env python3
-# TODO: This file uses "clean", "remove", and "delete". Please choose one term
-# and stick to it, in both the code and in the file name.
"""
Deletes all placeholders from the cache. Placeholders are created when a
process starts; it indicates that it has started and is still running.
If the process fails, the placeholder is not replaced with the actual
result. This script deletes all placeholders from the cache.
Usage:
-python clean_cache_placeholders.py --cache_path <path_to_cache>
+python delete_cache_placeholders.py --cache_directory <path_to_cache>
Args:
-TODO: Throughout, please don't use "_path" as a variable suffix, because it is
-# not sufficiently precise. Instead, use either "_file" or "_directory".
-cache_path (str): Path to cache directory
+cache_directory (str): Path to cache directory
"""

from argparse import ArgumentParser
@@ -27,10 +23,10 @@
)
args = parser.parse_args()

-cache_path = Path(args.cache_dir)
+cache_directory = Path(args.cache_dir)

n_deleted = 0
-for file in cache_path.glob("**/*.json"):
+for file in cache_directory.glob("**/*.json"):
with open(file, "r") as f:
data = json.load(f)

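The rest of the loop is collapsed in this view. Given the semantics stated in the docstring (a placeholder is an entry whose value is null), a plausible sketch of the elided deletion step (not the verbatim source) would be:

    # Hypothetical continuation of the truncated loop body:
    placeholders = [key for key, value in data.items() if value is None]
    for key in placeholders:
        del data[key]
        n_deleted += 1
    if placeholders:
        with open(file, "w") as f:
            json.dump(data, f, indent=4)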
12 changes: 4 additions & 8 deletions src/python/latex_output.py
@@ -14,10 +14,9 @@
following input files:
- full_repos_csv: csv file containing the full list of repositories
- repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
-- tested_merges_path: path to the folder containing the merge results
-- merges_path: path to the folder containing all found merges.
-TODO: Throughout, be consistent about "directory" vs "folder".
-- output_dir: path to the folder where the LaTeX files will be saved
+- tested_merges_path: path to the directory containing the merge results
+- merges_path: path to the directory containing all found merges.
+- output_dir: path to the directory where the LaTeX files will be saved
"""


@@ -77,7 +76,7 @@ def latex_def(name, value) -> str:


# Dictionary that lists the different subsets of merge tools for which plots
-# and tables are generated. The key is the folder name which will contain all figures
+# and tables are generated. The key is the directory name which will contain all figures
# that will be used and the value is the list of plots to contain.
PLOTS = {
"all": [merge_tool.name for merge_tool in MERGE_TOOL],
@@ -93,9 +92,6 @@ def latex_def(name, value) -> str:
],
}

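The body of latex_def is collapsed in this view; judging from its signature and the fact that the script writes LaTeX files, it presumably renders a name/value pair as a LaTeX macro definition, along these assumed lines:

    # Assumed sketch only; the real implementation is not shown above.
    def latex_def(name, value) -> str:
        return "\\def\\" + name + "{" + str(value) + "}\n"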
-# TODO: I suggest that these should be declared in repo.py, or move the enums to
-# a different file where this can also be defined. Or, define predicates for
-# clients to use rather than exporting these lists.
MERGE_CORRECT_NAMES = [
TEST_STATE.Tests_passed.name,
]
[The diff for the remaining changed files is not shown.]
