Mike's code review comments (#208)
mernst authored Sep 27, 2023
1 parent f18f2ce commit d5a8de8
Showing 20 changed files with 636 additions and 477 deletions.
2 changes: 1 addition & 1 deletion run.sh
@@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then
find "$CACHE_DIR" -name "*.lock" -delete
fi

-python3 src/python/clean_cache_placeholders.py \
+python3 src/python/delete_cache_placeholders.py \
    --cache_dir "$CACHE_DIR"

python3 src/python/write_head_hashes.py \
206 changes: 87 additions & 119 deletions src/python/cache_utils.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
""" Contains all the functions related to the caches.
There will be 4 caches which are stored on disk after running the script:
""" Contains all the functions related to the caches. The functions to interact with each
of the caches are in this file. Each cache is interacted with through the functions
of this file. The caches are all JSON files and are stored in the cache directory.
There will be 4 caches in total which are stored on disk after running the run.sh script:
1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
2) cache/test_cache: A cache that maps a sha256 to test results.
3) cache/merge_results: A cache that maps a merge to the result
@@ -28,159 +30,61 @@ def slug_repo_name(repo_slug: str) -> str:
    return repo_slug.split("/")[1]


-def get_cache_lock(repo_slug: str, cache_prefix: Path):
-    """Returns a lock for the cache of a repository.
-    Initially the lock is unlocked; the caller must explictly
-    lock and unlock the lock.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        fasteners.InterProcessLock: A lock for the repository.
-    """
-    lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock")
-    lock_path.parent.mkdir(parents=True, exist_ok=True)
-    lock = fasteners.InterProcessLock(lock_path)
-    return lock
-
-
-def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
-    """Returns the path to the cache file.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        Path: The path to the cache file.
-    """
-    cache_file_name = slug_repo_name(repo_slug) + ".json"
-    cache_path = cache_prefix / cache_file_name
-    cache_path.parent.mkdir(parents=True, exist_ok=True)
-    return cache_path
-
-
-def is_in_cache(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> bool:
-    """Checks if the key is in the cache.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path prefix to the cache directory.
-    Returns:
-        bool: True if the repository is in the cache, False otherwise.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-    """Loads the cache.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    if not cache_path.exists():
-        return {}
-    with open(cache_path, "r") as f:
-        cache_data = json.load(f)
-    return cache_data
-
-
-def cache_lookup(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-    """Loads the cache and returns the value for the given key.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache[cache_key]


-def write_to_cache(
+def set_in_cache(
    cache_key: Union[Tuple, str],
    cache_value: Union[str, dict, None],
    repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
+    cache_directory: Path,
+    acquire_lock: bool = True,
) -> None:
    """Puts an entry in the cache, then writes the cache to disk.
-    This function is not thread-safe.
    Args:
        cache_key (Union[Tuple,str]): The key to check.
        cache_value (dict): The value to write.
        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
-            If False, attempting to overwrite an existing cache entry throws an exception.
+        cache_directory (Path): The path to the cache directory.
    """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    cache = load_cache(repo_slug, cache_prefix)
-    if cache_prefix != Path("cache-small/sha_cache_entry"):
-        print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
-    if cache_key in cache and not overwrite:
-        raise ValueError("Cache key already exists")
+    if acquire_lock:
+        lock = get_cache_lock(repo_slug, cache_directory)
+        lock.acquire()
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    cache = load_cache(repo_slug, cache_directory)
    cache[cache_key] = cache_value
    output = json.dumps(cache, indent=4)
    with open(cache_path, "w") as f:
        f.write(output)
        f.flush()


-def set_in_cache(
-    cache_key: Union[Tuple, str],
-    cache_value: Union[str, dict, None],
-    repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
-) -> None:
-    """Puts an entry in the cache, then writes the cache to disk.
-    This function is thread-safe.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        cache_value (dict): The value to write.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-    """
-    lock = get_cache_lock(repo_slug, cache_prefix)
-    lock.acquire()
-    write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-    lock.release()
+    if acquire_lock:
+        lock.release() # type: ignore
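As context for readers, here is a minimal usage sketch of the two locking modes, assuming the signatures above; this is not code from the commit, and the slug, keys, and values are hypothetical.

from pathlib import Path

from cache_utils import get_cache_lock, set_in_cache

repo_slug = "owner/reponame"               # hypothetical slug
cache_directory = Path("cache/test_cache")

# Default mode: set_in_cache acquires and releases the repository lock itself.
set_in_cache("some-key", {"result": "ok"}, repo_slug, cache_directory)

# acquire_lock=False is for callers that already hold the lock, as
# lookup_in_cache does when it inserts a placeholder entry.
lock = get_cache_lock(repo_slug, cache_directory)
lock.acquire()
try:
    set_in_cache("other-key", None, repo_slug, cache_directory, acquire_lock=False)
finally:
    lock.release()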


def lookup_in_cache(
    cache_key: Union[Tuple, str],
    repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
    set_run: bool = False,
) -> Union[str, dict, None]:
    """Checks if the cache is available and loads a specific entry.
    Args:
        cache_key (Union[Tuple,str]): The key to check.
        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
        set_run (bool, optional) = False: Whether to insert an empty cache entry
            if it does not exist. This is useful for preventing multiple runs from
            attempting to insert the same cache entry.
    Returns:
        Union[dict,None]: The cache entry if it exists, None otherwise.
    """
-    lock = get_cache_lock(repo_slug, cache_prefix)
+    lock = get_cache_lock(repo_slug, cache_directory)
    lock.acquire()
-    cache_entry = get_cache_path(repo_slug, cache_prefix)
+    cache_entry = get_cache_path(repo_slug, cache_directory)
    cache_entry.parent.mkdir(parents=True, exist_ok=True)
-    if is_in_cache(cache_key, repo_slug, cache_prefix):
+    if is_in_cache(cache_key, repo_slug, cache_directory):
        total_time = 0
        while True:
-            cache_data = cache_lookup(cache_key, repo_slug, cache_prefix)
+            cache = load_cache(repo_slug, cache_directory)
+            cache_data = cache[cache_key]
            if cache_data is not None:
                break
            lock.release()
@@ -192,6 +96,70 @@ def lookup_in_cache(
        lock.release()
        return cache_data
    if set_run:
-        write_to_cache(cache_key, None, repo_slug, cache_prefix)
+        set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False)
    lock.release()
    return None
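As context for readers: the set_run flag supports a reserve-compute-fill pattern. A minimal sketch, assuming the API above (not code from the commit; the key and result value are hypothetical):

from pathlib import Path

from cache_utils import lookup_in_cache, set_in_cache

repo_slug = "owner/reponame"        # hypothetical slug
cache_directory = Path("cache/test_cache")
cache_key = "0a1b2c..."             # hypothetical sha256 key

# If the entry is absent, a None placeholder is written (so concurrent runs
# see the work as in progress) and None is returned; otherwise the cached
# value is returned once it is non-None.
result = lookup_in_cache(cache_key, repo_slug, cache_directory, set_run=True)
if result is None:
    result = {"test_result": "passed"}  # stand-in for the real computation
    set_in_cache(cache_key, result, repo_slug, cache_directory)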


+# ====================== Internal functions ======================


+def get_cache_lock(repo_slug: str, cache_directory: Path):
+    """Returns a lock for the cache of a repository.
+    Initially the lock is unlocked; the caller must explicitly
+    lock and unlock the lock.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        fasteners.InterProcessLock: A lock for the repository.
+    """
+    lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock")
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    lock = fasteners.InterProcessLock(lock_path)
+    return lock


+def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
+    """Returns the path to the cache file.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        Path: The path to the cache file.
+    """
+    cache_file_name = slug_repo_name(repo_slug) + ".json"
+    cache_path = cache_directory / cache_file_name
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    return cache_path


+def is_in_cache(
+    cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path
+) -> bool:
+    """Checks if the key is in the cache.
+    Args:
+        cache_key (Union[Tuple,str]): The key to check.
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        bool: True if the repository is in the cache, False otherwise.
+    """
+    cache = load_cache(repo_slug, cache_directory)
+    return cache_key in cache


+def load_cache(repo_slug: str, cache_directory: Path) -> dict:
+    """Loads the cache associated with the repo_slug found in the cache directory.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        dict: The cache.
+    """
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    if not cache_path.exists():
+        return {}
+    with open(cache_path, "r") as f:
+        cache_data = json.load(f)
+    return cache_data
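Taken together, these helpers imply one JSON file and one lock file per repository under each cache directory. An illustrative layout and file content follow (not from the commit; the repository name, keys, and values are hypothetical):

cache/
    test_cache/
        locks/
            reponame.lock    # fasteners.InterProcessLock file
        reponame.json        # the per-repository cache
    sha_cache_entry/
        locks/
            reponame.lock
        reponame.json

A reponame.json file is a flat JSON object mapping cache keys to values; a null value is the placeholder that delete_cache_placeholders.py later removes:

{
    "0a1b2c...": {"test_result": "passed"},
    "3d4e5f...": null
}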
src/python/{clean_cache_placeholders.py → delete_cache_placeholders.py}
@@ -6,10 +6,10 @@
result. This script deletes all placeholders from the cache.
Usage:
-    python clean_cache_placeholders.py --cache_path <path_to_cache>
+    python delete_cache_placeholders.py --cache_directory <path_to_cache>
Args:
-    cache_path (str): Path to cache directory
+    cache_directory (str): Path to cache directory
"""

from argparse import ArgumentParser
@@ -23,10 +23,10 @@
)
args = parser.parse_args()

-cache_path = Path(args.cache_dir)
+cache_directory = Path(args.cache_dir)

n_deleted = 0
-for file in cache_path.glob("**/*.json"):
+for file in cache_directory.glob("**/*.json"):
    with open(file, "r") as f:
        data = json.load(f)

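The rest of this script is collapsed in the diff above. Given the placeholder convention (entries whose value is JSON null), a plausible completion of the loop might look like the following; this is a sketch, not the commit's actual code, and the default path is an assumption:

import json
from argparse import ArgumentParser
from pathlib import Path

parser = ArgumentParser()
parser.add_argument("--cache_dir", type=str, default="cache/")  # default is assumed
args = parser.parse_args()

cache_directory = Path(args.cache_dir)

n_deleted = 0
for file in cache_directory.glob("**/*.json"):
    with open(file, "r") as f:
        data = json.load(f)
    # Placeholder entries have a null (None) value: a run reserved the key
    # but never wrote a real result.
    cleaned = {key: value for key, value in data.items() if value is not None}
    if len(cleaned) != len(data):
        n_deleted += len(data) - len(cleaned)
        with open(file, "w") as f:
            json.dump(cleaned, f, indent=4)

print(f"Deleted {n_deleted} placeholder entries")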
8 changes: 4 additions & 4 deletions src/python/latex_output.py
@@ -14,9 +14,9 @@
following input files:
    - full_repos_csv: csv file containing the full list of repositories
    - repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
-    - tested_merges_path: path to the folder containing the merge results
-    - merges_path: path to the folder containing all found merges.
-    - output_dir: path to the folder where the LaTeX files will be saved
+    - tested_merges_path: path to the directory containing the merge results
+    - merges_path: path to the directory containing all found merges.
+    - output_dir: path to the directory where the LaTeX files will be saved
"""


@@ -76,7 +76,7 @@ def latex_def(name, value) -> str:


# Dictionary that lists the different subsets of merge tools for which plots
-# and tables are generated. The key is the folder name which will contain all figures
+# and tables are generated. The key is the directory name which will contain all figures
# that will be used and the value is the list of plots to contain.
PLOTS = {
"all": [merge_tool.name for merge_tool in MERGE_TOOL],
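The remaining PLOTS entries are collapsed in the diff above. Per the comment, each key is a directory name and each value is a list of plot names, so an additional subset entry might look like this (the entry name and tool names are hypothetical, mirroring the one visible entry):

PLOTS = {
    "all": [merge_tool.name for merge_tool in MERGE_TOOL],
    "git_only": ["gitmerge_ort", "gitmerge_recursive"],  # hypothetical subset
}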
(The remaining changed files are not shown here.)
