Improvements

benedikt-schesch committed Sep 24, 2023
1 parent 18ed5f6 commit 606b16c
Showing 13 changed files with 175 additions and 243 deletions.
2 changes: 1 addition & 1 deletion run.sh
@@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then
find "$CACHE_DIR" -name "*.lock" -delete
fi

-python3 src/python/clean_cache_placeholders.py \
+python3 src/python/delete_cache_placeholders.py \
--cache_dir "$CACHE_DIR"

python3 src/python/write_head_hashes.py \
211 changes: 83 additions & 128 deletions src/python/cache_utils.py
@@ -19,9 +19,6 @@
CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds
TIMEOUT = 90 * 60 # 90 minutes, in seconds

-# TODO: In this file, please place the externally-visible or exported functions
-# first, and then a comment marker of some kind, and then the "private" ones.
-

def slug_repo_name(repo_slug: str) -> str:
"""Given a GitHub repository slug ("owner/reponame"), returns the reponame.
@@ -33,172 +30,66 @@ def slug_repo_name(repo_slug: str) -> str:
return repo_slug.split("/")[1]

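For reference, the slug convention used throughout this module keeps only the part after the slash:

    # slug_repo_name("octocat/hello-world") returns "hello-world"
    assert slug_repo_name("octocat/hello-world") == "hello-world"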

-# TODO: Throughout, please use "_directory" instead of "_prefix" in
-# variable names, when the variable actually holds a directory. I
-# feel this will be clearer.
-def get_cache_lock(repo_slug: str, cache_prefix: Path):
-"""Returns a lock for the cache of a repository.
-Initially the lock is unlocked; the caller must explictly
-lock and unlock the lock.
-Args:
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-fasteners.InterProcessLock: A lock for the repository.
-"""
-lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock")
-lock_path.parent.mkdir(parents=True, exist_ok=True)
-lock = fasteners.InterProcessLock(lock_path)
-return lock
-
-
-def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
-"""Returns the path to the cache file.
-Args:
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-Path: The path to the cache file.
-"""
-cache_file_name = slug_repo_name(repo_slug) + ".json"
-cache_path = cache_prefix / cache_file_name
-cache_path.parent.mkdir(parents=True, exist_ok=True)
-return cache_path
-
-
-def is_in_cache(
-cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> bool:
-"""Checks if the key is in the cache.
-Args:
-cache_key (Union[Tuple,str]): The key to check.
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path prefix to the cache directory.
-Returns:
-bool: True if the repository is in the cache, False otherwise.
-"""
-cache = load_cache(repo_slug, cache_prefix)
-return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-# TODO: "the cache": which one? Also, does this load a cache or a cache
-# entry? Please define those terms to help the reader. This method returns
-# a cache, which is a dict. `cache_lookup` returns a cache entry, which is
-# also a dict. What are they dicts from and to?
-"""Loads the cache.
-Args:
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-dict: The cache.
-"""
-cache_path = get_cache_path(repo_slug, cache_prefix)
-if not cache_path.exists():
-return {}
-with open(cache_path, "r") as f:
-cache_data = json.load(f)
-return cache_data
-
-
-def cache_lookup(
-cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-"""Loads the cache and returns the value for the given key.
-Args:
-cache_key (Union[Tuple,str]): The key to check.
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-Returns:
-dict: The cache.
-"""
-cache = load_cache(repo_slug, cache_prefix)
-return cache[cache_key]
-
-
-# TODO: The distinction between write_to_cache and set_in_cache is not clear to
-# me. If it is only locking, then please give them the same name, but with one
-# suffixed by "_without_lock". Also please indicate under what circumstances
-# each should be called.
-def write_to_cache(
+def set_in_cache(
cache_key: Union[Tuple, str],
cache_value: Union[str, dict, None],
repo_slug: str,
-cache_prefix: Path,
+cache_directory: Path,
overwrite: bool = True,
+acquire_lock: bool = True,
) -> None:
"""Puts an entry in the cache, then writes the cache to disk.
This function is not thread-safe.
Args:
cache_key (Union[Tuple,str]): The key to check.
cache_value (dict): The value to write.
repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
+cache_directory (Path): The path to the cache directory.
overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
If False, attempting to overwrite an existing cache entry throws an exception.
"""
-cache_path = get_cache_path(repo_slug, cache_prefix)
-cache = load_cache(repo_slug, cache_prefix)
-if cache_prefix != Path("cache-small/sha_cache_entry"):
-print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
+if acquire_lock:
+lock = get_cache_lock(repo_slug, cache_directory)
+lock.acquire()
+cache_path = get_cache_path(repo_slug, cache_directory)
+cache = load_cache(repo_slug, cache_directory)
if cache_key in cache and not overwrite:
raise ValueError("Cache key already exists")
cache[cache_key] = cache_value
output = json.dumps(cache, indent=4)
with open(cache_path, "w") as f:
f.write(output)
f.flush()
+if acquire_lock:
+lock.release()  # type: ignore

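The merged function replaces the old write_to_cache/set_in_cache pair: a caller that already holds the repository lock passes acquire_lock=False, while ordinary callers let the function lock for itself. A minimal usage sketch (the key and value here are hypothetical, not taken from the repository):

    from pathlib import Path

    # Hypothetical standalone call; set_in_cache acquires and releases
    # the inter-process lock around its read-modify-write.
    set_in_cache(
        cache_key="some-key",
        cache_value={"result": "ok"},
        repo_slug="owner/reponame",
        cache_directory=Path("cache"),
    )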

-def set_in_cache(
-cache_key: Union[Tuple, str],
-cache_value: Union[str, dict, None],
-repo_slug: str,
-cache_prefix: Path,
-overwrite: bool = True,
-) -> None:
-"""Puts an entry in the cache, then writes the cache to disk.
-This function is thread-safe.
-Args:
-cache_key (Union[Tuple,str]): The key to check.
-cache_value (dict): The value to write.
-repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
-overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-"""
-lock = get_cache_lock(repo_slug, cache_prefix)
-lock.acquire()
-write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-lock.release()


-# TODO: I'm not clear what is the difference between `cache_lookup` and
-# `lookup_in_cache`. Please clarify, and indicate when each should be called.
def lookup_in_cache(
cache_key: Union[Tuple, str],
repo_slug: str,
-cache_prefix: Path,
+cache_directory: Path,
set_run: bool = False,
) -> Union[str, dict, None]:
"""Checks if the cache is available and loads a specific entry.
Args:
cache_key (Union[Tuple,str]): The key to check.
repo_slug (str): The slug of the repository, which is "owner/reponame".
-cache_prefix (Path): The path to the cache directory.
+cache_directory (Path): The path to the cache directory.
set_run (bool, optional) = False: Whether to insert an empty cache entry
if it does not exist. This is useful for preventing multiple runs from
attempting to insert the same cache entry.
Returns:
Union[dict,None]: The cache entry if it exists, None otherwise.
"""
-lock = get_cache_lock(repo_slug, cache_prefix)
+lock = get_cache_lock(repo_slug, cache_directory)
lock.acquire()
-cache_entry = get_cache_path(repo_slug, cache_prefix)
+cache_entry = get_cache_path(repo_slug, cache_directory)
cache_entry.parent.mkdir(parents=True, exist_ok=True)
-if is_in_cache(cache_key, repo_slug, cache_prefix):
+if is_in_cache(cache_key, repo_slug, cache_directory):
total_time = 0
while True:
-cache_data = cache_lookup(cache_key, repo_slug, cache_prefix)
+cache = load_cache(repo_slug, cache_directory)
+cache_data = cache[cache_key]
if cache_data is not None:
break
lock.release()
@@ -210,6 +101,70 @@ def lookup_in_cache(
lock.release()
return cache_data
if set_run:
-write_to_cache(cache_key, None, repo_slug, cache_prefix)
+set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False)
lock.release()
return None

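The set_run flag makes lookup_in_cache usable as a claim-then-compute primitive: when the key is absent, a None placeholder is written under the lock, so concurrent runs see the entry as already claimed. A sketch of the intended pattern (compute_result is a hypothetical stand-in for the caller's real work):

    entry = lookup_in_cache(key, repo_slug, cache_dir, set_run=True)
    if entry is None:
        entry = compute_result()  # hypothetical expensive work
        set_in_cache(key, entry, repo_slug, cache_dir)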

+# ====================== Internal functions ======================
+
+
+def get_cache_lock(repo_slug: str, cache_directory: Path):
+"""Returns a lock for the cache of a repository.
+Initially the lock is unlocked; the caller must explicitly
+lock and unlock the lock.
+Args:
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+fasteners.InterProcessLock: A lock for the repository.
+"""
+lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock")
+lock_path.parent.mkdir(parents=True, exist_ok=True)
+lock = fasteners.InterProcessLock(lock_path)
+return lock

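Because the returned fasteners.InterProcessLock starts out unlocked, the caller owns the acquire/release pairing; a minimal sketch:

    from pathlib import Path

    lock = get_cache_lock("owner/reponame", Path("cache"))
    lock.acquire()
    try:
        pass  # read or update this repository's cache file
    finally:
        lock.release()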

+def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
+"""Returns the path to the cache file.
+Args:
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+Path: The path to the cache file.
+"""
+cache_file_name = slug_repo_name(repo_slug) + ".json"
+cache_path = cache_directory / cache_file_name
+cache_path.parent.mkdir(parents=True, exist_ok=True)
+return cache_path

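Concretely, each repository's cache file sits directly under the given directory; for example (directory name hypothetical):

    # get_cache_path("owner/reponame", Path("cache/sha_cache_entry"))
    #   -> Path("cache/sha_cache_entry/reponame.json")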

+def is_in_cache(
+cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path
+) -> bool:
+"""Checks if the key is in the cache.
+Args:
+cache_key (Union[Tuple,str]): The key to check.
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+bool: True if the key is in the cache, False otherwise.
+"""
+cache = load_cache(repo_slug, cache_directory)
+return cache_key in cache


+def load_cache(repo_slug: str, cache_directory: Path) -> dict:
+"""Loads the cache associated with the repo_slug found in the cache directory.
+Args:
+repo_slug (str): The slug of the repository, which is "owner/reponame".
+cache_directory (Path): The path to the cache directory.
+Returns:
+dict: The cache.
+"""
+cache_path = get_cache_path(repo_slug, cache_directory)
+if not cache_path.exists():
+return {}
+with open(cache_path, "r") as f:
+cache_data = json.load(f)
+return cache_data
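For orientation: each repository's cache is a single flat JSON object, and entries whose value is null are the in-progress placeholders that delete_cache_placeholders.py removes. A hypothetical reponame.json (keys and entry shapes are assumptions, not taken from the repository):

    # {
    #     "3f2a9...": {"test result": "Tests_passed"},
    #     "77b1e...": null
    # }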
src/python/{clean_cache_placeholders.py → delete_cache_placeholders.py}
@@ -1,19 +1,15 @@
#!/usr/bin/env python3
-# TODO: This file uses "clean", "remove", and "delete". Please choose one term
-# and stick to it, in both the code and in the file name.
"""
Deletes all placeholders from the cache. Placeholders are created when a
process starts; it indicates that it has started and is still running.
If the process fails, the placeholder is not replaced with the actual
result. This script deletes all placeholders from the cache.
Usage:
-python clean_cache_placeholders.py --cache_path <path_to_cache>
+python delete_cache_placeholders.py --cache_directory <path_to_cache>
Args:
-TODO: Throughout, please don't use "_path" as a variable suffix, because it is
-# not sufficiently precise. Instead, use either "_file" or "_directory".
-cache_path (str): Path to cache directory
+cache_directory (str): Path to cache directory
"""

from argparse import ArgumentParser
@@ -27,10 +23,10 @@
)
args = parser.parse_args()

-cache_path = Path(args.cache_dir)
+cache_directory = Path(args.cache_dir)

n_deleted = 0
-for file in cache_path.glob("**/*.json"):
+for file in cache_directory.glob("**/*.json"):
with open(file, "r") as f:
data = json.load(f)

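The rest of the loop is collapsed in this view. Given the semantics stated in the docstring (a placeholder is an entry whose value is null), a plausible sketch of the elided deletion step (not the verbatim source) would be:

    # Hypothetical continuation of the truncated loop body:
    placeholders = [key for key, value in data.items() if value is None]
    for key in placeholders:
        del data[key]
        n_deleted += 1
    if placeholders:
        with open(file, "w") as f:
            json.dump(data, f, indent=4)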
12 changes: 4 additions & 8 deletions src/python/latex_output.py
@@ -14,10 +14,9 @@
following input files:
- full_repos_csv: csv file containing the full list of repositories
- repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
-- tested_merges_path: path to the folder containing the merge results
-- merges_path: path to the folder containing all found merges.
-TODO: Throughout, be consistent about "directory" vs "folder".
-- output_dir: path to the folder where the LaTeX files will be saved
+- tested_merges_path: path to the directory containing the merge results
+- merges_path: path to the directory containing all found merges.
+- output_dir: path to the directory where the LaTeX files will be saved
"""


@@ -77,7 +76,7 @@ def latex_def(name, value) -> str:


# Dictionary that lists the different subsets of merge tools for which plots
-# and tables are generated. The key is the folder name which will contain all figures
+# and tables are generated. The key is the directory name which will contain all figures
# that will be used and the value is the list of plots to contain.
PLOTS = {
"all": [merge_tool.name for merge_tool in MERGE_TOOL],
@@ -93,9 +92,6 @@ def latex_def(name, value) -> str:
],
}

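The body of latex_def is collapsed in this view; judging from its signature and the fact that the script writes LaTeX files, it presumably renders a name/value pair as a LaTeX macro definition, along these assumed lines:

    # Assumed sketch only; the real implementation is not shown above.
    def latex_def(name, value) -> str:
        return "\\def\\" + name + "{" + str(value) + "}\n"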
-# TODO: I suggest that these should be declared in repo.py, or move the enums to
-# a different file where this can also be defined. Or, define predicates for
-# clients to use rather than exporting these lists.
MERGE_CORRECT_NAMES = [
TEST_STATE.Tests_passed.name,
]
[The diff for the remaining changed files is not shown.]
