From 606b16c5572ad5bc7db17ed37a75dcf0150e41cc Mon Sep 17 00:00:00 2001 From: Benedikt schesch Date: Sun, 24 Sep 2023 13:34:56 -0700 Subject: [PATCH] Improvements --- run.sh | 2 +- src/python/cache_utils.py | 211 +++++++----------- ...olders.py => delete_cache_placeholders.py} | 12 +- src/python/latex_output.py | 12 +- src/python/merge_differ.py | 35 ++- src/python/merge_tester.py | 10 +- src/python/merge_tools_comparator.py | 63 +++--- src/python/repo.py | 55 ++--- src/python/test_repo_heads.py | 5 +- src/python/variables.py | 5 +- src/scripts/merge_tools/intellimerge.sh | 3 +- src/scripts/utils/run_multiple_machines.sh | 3 +- src/scripts/utils/run_remotely.sh | 2 +- 13 files changed, 175 insertions(+), 243 deletions(-) rename src/python/{clean_cache_placeholders.py => delete_cache_placeholders.py} (65%) diff --git a/run.sh b/run.sh index 9b1c1594a9..bc592a5872 100755 --- a/run.sh +++ b/run.sh @@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then find "$CACHE_DIR" -name "*.lock" -delete fi -python3 src/python/clean_cache_placeholders.py \ +python3 src/python/delete_cache_placeholders.py \ --cache_dir "$CACHE_DIR" python3 src/python/write_head_hashes.py \ diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py index b7501a9e6a..2ef4864ca6 100755 --- a/src/python/cache_utils.py +++ b/src/python/cache_utils.py @@ -19,9 +19,6 @@ CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds TIMEOUT = 90 * 60 # 90 minutes, in seconds -# TODO: In this file, please place the externally-visible or exported functions -# first, and then a comment marker of some kind, and then the "private" ones. - def slug_repo_name(repo_slug: str) -> str: """Given a GitHub repository slug ("owner/reponame"), returns the reponame. @@ -33,99 +30,13 @@ def slug_repo_name(repo_slug: str) -> str: return repo_slug.split("/")[1] -# TODO: Throughout, please use "_directory" instead of "_prefix" in -# variable names, when the variable actually holds a directory. I -# feel this will be clearer. -def get_cache_lock(repo_slug: str, cache_prefix: Path): - """Returns a lock for the cache of a repository. - Initially the lock is unlocked; the caller must explictly - lock and unlock the lock. - Args: - repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The path to the cache directory. - Returns: - fasteners.InterProcessLock: A lock for the repository. - """ - lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock") - lock_path.parent.mkdir(parents=True, exist_ok=True) - lock = fasteners.InterProcessLock(lock_path) - return lock - - -def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path: - """Returns the path to the cache file. - Args: - repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The path to the cache directory. - Returns: - Path: The path to the cache file. - """ - cache_file_name = slug_repo_name(repo_slug) + ".json" - cache_path = cache_prefix / cache_file_name - cache_path.parent.mkdir(parents=True, exist_ok=True) - return cache_path - - -def is_in_cache( - cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path -) -> bool: - """Checks if the key is in the cache. - Args: - cache_key (Union[Tuple,str]): The key to check. - repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The path prefix to the cache directory. - Returns: - bool: True if the repository is in the cache, False otherwise. 
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-    # TODO: "the cache": which one? Also, does this load a cache or a cache
-    # entry? Please define those terms to help the reader. This method returns
-    # a cache, which is a dict. `cache_lookup` returns a cache entry, which is
-    # also a dict. What are they dicts from and to?
-    """Loads the cache.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    if not cache_path.exists():
-        return {}
-    with open(cache_path, "r") as f:
-        cache_data = json.load(f)
-    return cache_data
-
-
-def cache_lookup(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-    """Loads the cache and returns the value for the given key.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache[cache_key]
-
-
-# TODO: The distinction between write_to_cache and set_in_cache is not clear to
-# me. If it is only locking, then please give them the same name, but with one
-# suffixed by "_without_lock". Also please indicate under what circumstances
-# each should be called.
-def write_to_cache(
+def set_in_cache(
     cache_key: Union[Tuple, str],
     cache_value: Union[str, dict, None],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     overwrite: bool = True,
+    acquire_lock: bool = True,
 ) -> None:
     """Puts an entry in the cache, then writes the cache to disk.
-    This function is not thread-safe.
+    This function is thread-safe when acquire_lock is True (the default).
+    Pass acquire_lock=False only when the caller already holds the cache lock.
@@ -133,14 +44,15 @@
         cache_key (Union[Tuple,str]): The key to check.
         cache_value (dict): The value to write.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
             If False, attempting to overwrite an existing cache entry throws an exception.
+        acquire_lock (bool, optional) = True: Whether to acquire the cache lock before
+            writing. Set this to False when the caller already holds the lock.
     """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    cache = load_cache(repo_slug, cache_prefix)
-    if cache_prefix != Path("cache-small/sha_cache_entry"):
-        print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
+    if acquire_lock:
+        lock = get_cache_lock(repo_slug, cache_directory)
+        lock.acquire()
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    cache = load_cache(repo_slug, cache_directory)
     if cache_key in cache and not overwrite:
         raise ValueError("Cache key already exists")
     cache[cache_key] = cache_value
@@ -148,57 +60,36 @@
     output = json.dumps(cache, indent=4)
     with open(cache_path, "w") as f:
         f.write(output)
         f.flush()
+    if acquire_lock:
+        lock.release()  # type: ignore
 
 
-def set_in_cache(
-    cache_key: Union[Tuple, str],
-    cache_value: Union[str, dict, None],
-    repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
-) -> None:
-    """Puts an entry in the cache, then writes the cache to disk.
-    This function is thread-safe.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        cache_value (dict): The value to write.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-    """
-    lock = get_cache_lock(repo_slug, cache_prefix)
-    lock.acquire()
-    write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-    lock.release()
-
-
-# TODO: I'm not clear what is the difference between `cache_lookup` and
-# `lookup_in_cache`. Please clarify, and indicate when each should be called.
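For reference, the locking discipline behind the new acquire_lock flag, as an illustrative sketch rather than part of the patch; it assumes cache_utils is importable and relies on fasteners.InterProcessLock not being reentrant, so re-acquiring a held lock in the same process would block:

    from pathlib import Path
    from cache_utils import get_cache_lock, set_in_cache

    cache_directory = Path("cache")
    repo_slug = "owner/reponame"  # hypothetical slug

    # Common case: set_in_cache acquires and releases the lock itself.
    set_in_cache("some_key", {"status": "done"}, repo_slug, cache_directory)

    # Compound case: several cache operations under one lock acquisition.
    # The callee must not re-acquire the held lock, hence acquire_lock=False;
    # this is exactly how lookup_in_cache below inserts its placeholder.
    lock = get_cache_lock(repo_slug, cache_directory)
    lock.acquire()
    try:
        set_in_cache("some_key", None, repo_slug, cache_directory, acquire_lock=False)
    finally:
        lock.release()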
 def lookup_in_cache(
     cache_key: Union[Tuple, str],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     set_run: bool = False,
 ) -> Union[str, dict, None]:
     """Checks if the cache is available and loads a specific entry.
     Args:
         cache_key (Union[Tuple,str]): The key to check.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         set_run (bool, optional) = False: Whether to insert an empty cache entry
             if it does not exist. This is useful for preventing multiple runs from
             attempting to insert the same cache entry.
     Returns:
         Union[dict,None]: The cache entry if it exists, None otherwise.
     """
-    lock = get_cache_lock(repo_slug, cache_prefix)
+    lock = get_cache_lock(repo_slug, cache_directory)
     lock.acquire()
-    cache_entry = get_cache_path(repo_slug, cache_prefix)
+    cache_entry = get_cache_path(repo_slug, cache_directory)
     cache_entry.parent.mkdir(parents=True, exist_ok=True)
-    if is_in_cache(cache_key, repo_slug, cache_prefix):
+    if is_in_cache(cache_key, repo_slug, cache_directory):
         total_time = 0
         while True:
-            cache_data = cache_lookup(cache_key, repo_slug, cache_prefix)
+            cache = load_cache(repo_slug, cache_directory)
+            cache_data = cache[cache_key]
             if cache_data is not None:
                 break
             lock.release()
@@ -210,6 +101,70 @@ def lookup_in_cache(
             lock.release()
             return cache_data
     if set_run:
-        write_to_cache(cache_key, None, repo_slug, cache_prefix)
+        set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False)
     lock.release()
     return None
+
+
+# ====================== Internal functions ======================
+
+
+def get_cache_lock(repo_slug: str, cache_directory: Path):
+    """Returns a lock for the cache of a repository.
+    Initially the lock is unlocked; the caller must explicitly
+    lock and unlock the lock.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        fasteners.InterProcessLock: A lock for the repository.
+    """
+    lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock")
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    lock = fasteners.InterProcessLock(lock_path)
+    return lock
+
+
+def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
+    """Returns the path to the cache file.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        Path: The path to the cache file.
+    """
+    cache_file_name = slug_repo_name(repo_slug) + ".json"
+    cache_path = cache_directory / cache_file_name
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    return cache_path
+
+
+def is_in_cache(
+    cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path
+) -> bool:
+    """Checks if the key is in the cache.
+    Args:
+        cache_key (Union[Tuple,str]): The key to check.
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        bool: True if the repository is in the cache, False otherwise.
+    """
+    cache = load_cache(repo_slug, cache_directory)
+    return cache_key in cache
+
+
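The functions above combine into a placeholder protocol: a worker that finds no entry leaves a None placeholder so that concurrent workers back off instead of recomputing. A sketch of the intended usage, illustrative only (the key, value, and helper name are hypothetical), mirroring how merger() in merge_tools_comparator.py uses the API:

    from pathlib import Path
    from cache_utils import lookup_in_cache, set_in_cache

    def compute_or_reuse(cache_key, repo_slug: str, cache_directory: Path) -> dict:
        # set_run=True inserts a None placeholder under the lock when the key
        # is absent, marking the computation as "in progress".
        entry = lookup_in_cache(cache_key, repo_slug, cache_directory, set_run=True)
        if isinstance(entry, dict):
            return entry  # a previous run already computed this entry
        result = {"status": "computed"}  # stand-in for the real computation
        set_in_cache(cache_key, result, repo_slug, cache_directory)
        return result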
+def load_cache(repo_slug: str, cache_directory: Path) -> dict:
+    """Loads the cache associated with the repo_slug found in the cache directory.
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        cache_directory (Path): The path to the cache directory.
+    Returns:
+        dict: The cache.
+    """
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    if not cache_path.exists():
+        return {}
+    with open(cache_path, "r") as f:
+        cache_data = json.load(f)
+    return cache_data
diff --git a/src/python/clean_cache_placeholders.py b/src/python/delete_cache_placeholders.py
similarity index 65%
rename from src/python/clean_cache_placeholders.py
rename to src/python/delete_cache_placeholders.py
index c2d861c7b3..22acd35c11 100755
--- a/src/python/clean_cache_placeholders.py
+++ b/src/python/delete_cache_placeholders.py
@@ -1,6 +1,4 @@
 #!/usr/bin/env python3
-# TODO: This file uses "clean", "remove", and "delete". Please choose one term
-# and stick to it, in both the code and in the file name.
 """
 Deletes all placeholders from the cache. Placeholders are created when a
 process starts; a placeholder indicates that its process has started and is still running.
 result. This script deletes all placeholders from the cache.
 
 Usage:
-    python clean_cache_placeholders.py --cache_path
+    python delete_cache_placeholders.py --cache_directory
 
 Args:
-TODO: Throughout, please don't use "_path" as a variable suffix, because it is
-# not sufficiently precise. Instead, use either "_file" or "_directory".
-    cache_path (str): Path to cache directory
+    cache_directory (str): Path to cache directory
 """
 
 from argparse import ArgumentParser
@@ -27,10 +23,10 @@
     )
     args = parser.parse_args()
 
-    cache_path = Path(args.cache_dir)
+    cache_directory = Path(args.cache_dir)
 
     n_deleted = 0
-    for file in cache_path.glob("**/*.json"):
+    for file in cache_directory.glob("**/*.json"):
         with open(file, "r") as f:
             data = json.load(f)
 
diff --git a/src/python/latex_output.py b/src/python/latex_output.py
index ef75f08243..54b01bedff 100755
--- a/src/python/latex_output.py
+++ b/src/python/latex_output.py
@@ -14,10 +14,9 @@
 following input files:
 - full_repos_csv: csv file containing the full list of repositories
 - repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
-- tested_merges_path: path to the folder containing the merge results
-- merges_path: path to the folder containing all found merges.
-TODO: Throughout, be consistent about "directory" vs "folder".
-- output_dir: path to the folder where the LaTeX files will be saved
+- tested_merges_path: path to the directory containing the merge results
+- merges_path: path to the directory containing all found merges.
+- output_dir: path to the directory where the LaTeX files will be saved
 """
 
 
@@ -77,7 +76,7 @@ def latex_def(name, value) -> str:
 
 
 # Dictionary that lists the different subsets of merge tools for which plots
-# and tables are generated. The key is the folder name which will contain all figures
+# and tables are generated. The key is the directory name which will contain all figures
 # that will be used and the value is the list of plots to contain.
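# For example, a hypothetical entry (the tool names here are invented for
# illustration and are not taken from MERGE_TOOL):
#     "git_tools": ["gitmerge_ort", "gitmerge_recursive"]
# would produce a directory "git_tools" of figures and tables comparing only
# those two merge tools.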
PLOTS = { "all": [merge_tool.name for merge_tool in MERGE_TOOL], @@ -93,9 +92,6 @@ def latex_def(name, value) -> str: ], } -# TODO: I suggest that these should be declared in repo.py, or move the enums to -# a different file where this can also be defined. Or, define predicates for -# clients to use rather than exporting these lists. MERGE_CORRECT_NAMES = [ TEST_STATE.Tests_passed.name, ] diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py index bc36bd7cb6..574eddef00 100755 --- a/src/python/merge_differ.py +++ b/src/python/merge_differ.py @@ -4,10 +4,10 @@ --merges_path --cache_dir This script compares the results of two merge tools on the same merge. -# TODO: Where is the output? Standard out? It outputs the diff of the two merge results if their results differ. -# TODO: Does "ignores" mean there is no output, or is it something deeper? -It ignores merges that have the same result or merges that are not successful. +It does not produce output for merges that have the same result or merges that +are not successful. +The output is written in cache_dir/merge_diffs. """ import os @@ -32,7 +32,7 @@ def get_merge_fingerprint( - merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path + merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_directory: Path ) -> Union[Tuple[None, None], Tuple[Repository, str]]: """Returns the repo and the fingerprint of a merge, or (None, None) if the merge is not successful. @@ -51,7 +51,7 @@ def get_merge_fingerprint( return None, None left = merge_data["left"] right = merge_data["right"] - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( merge_status, merge_fingerprint, @@ -81,39 +81,34 @@ def get_merge_fingerprint( def merge_differ(args: Tuple[pd.Series, Path]) -> None: - # TODO: What does this funtion do with the diff? I think it writes to a file? Or does it populate a cache too? """Diffs the results of every two merge tools on the same merge. - Does not diff merges that have the same result or merges that are not successful. + Writes the diff to a file. Args: args (Tuple[pd.Series,Path]): A tuple containing the merge data and the cache prefix. - Returns: - TODO: How is "The result of the test." related to the diff of two merge tools? I don't see this function returning anything. - dict: The result of the test. """ - merge_data, cache_prefix = args + merge_data, cache_directory = args if not merge_data["parents pass"]: return - # TODO: See other comment about "_prefix" in variable names; change throughout. 
- diff_file_prefix = cache_prefix / "merge_diffs" - diff_file_prefix.mkdir(parents=True, exist_ok=True) + diff_file_directory = cache_directory / "merge_diffs" + diff_file_directory.mkdir(parents=True, exist_ok=True) for merge_tool1 in MERGE_TOOL: repo1, merge_fingerprint1 = get_merge_fingerprint( - merge_data, merge_tool1, cache_prefix + merge_data, merge_tool1, cache_directory ) if repo1 is None or merge_fingerprint1 is None: continue for merge_tool2 in MERGE_TOOL: repo2, merge_fingerprint2 = get_merge_fingerprint( - merge_data, merge_tool2, cache_prefix + merge_data, merge_tool2, cache_directory ) if repo2 is None or merge_fingerprint2 is None: continue - diff_file = diff_file_prefix / diff_file_name( + diff_file = diff_file_directory / diff_file_name( merge_fingerprint1, merge_fingerprint2 ) if diff_file.exists(): @@ -137,9 +132,8 @@ def diff_file_name(sha1: str, sha2: str) -> Path: """ # Use lexicographic order to prevent duplicates if sha1 < sha2: - # TODO: Why does this use ".txt" rather than ".diff" as the file extension? - return Path(sha1 + "_" + sha2 + ".txt") - return Path(sha2 + "_" + sha1 + ".txt") + return Path(sha1 + "_" + sha2 + ".diff") + return Path(sha2 + "_" + sha1 + ".diff") if __name__ == "__main__": @@ -150,7 +144,6 @@ def diff_file_name(sha1: str, sha2: str) -> Path: parser.add_argument("--cache_dir", type=Path, default="cache/") args = parser.parse_args() cache_dir = Path(args.cache_dir) - # TODO: See comment elsewhere about "_path" in variable names. cache_diffs_path = cache_dir / "merge_diffs" cache_diffs_path.mkdir(parents=True, exist_ok=True) diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index bc840976f7..d0c51a8701 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -42,7 +42,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: Returns: pd.Series: The result of the test. """ - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args while psutil.cpu_percent() > 90: print( "merge_tester: Waiting for CPU load to come down", @@ -56,7 +56,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data["parents pass"] = False # TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.) for branch in ["left", "right"]: - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) repo.checkout(merge_data[branch]) tree_fingerprint = repo.compute_tree_fingerprint() if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]: @@ -74,15 +74,12 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data[f"{branch} test result"] = test_result.name if test_result != TEST_STATE.Tests_passed: return merge_data - # TODO: It looks suspicious to me that the repo is only sometimes, not always, deleted. - del repo merge_data["parents pass"] = True for merge_tool in MERGE_TOOL: if is_merge_success(merge_data[merge_tool.name]): - # TODO: I suggest abstracting the body of this loop into a separate function, since it stands on its own logically. 
- repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( result, merge_fingerprint, @@ -111,7 +108,6 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: ) # Update the status from merge success to test result. merge_data[merge_tool.name] = result.name - del repo assert merge_tool.name in merge_data return merge_data diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py index 2fcb574c1e..4b06bb7295 100755 --- a/src/python/merge_tools_comparator.py +++ b/src/python/merge_tools_comparator.py @@ -51,16 +51,12 @@ def merger( # pylint: disable=too-many-locals Returns: dict: A dictionary containing the merge result. """ - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args - # TODO: I think this is a key, not an entry. Please clarify. - # TODO: Why are these two SHAs (if they are SHAs) not sorted as is done elsewhere? - cache_entry = merge_data["left"] + "_" + merge_data["right"] - merge_cache_prefix = cache_prefix / "merge_results" + cache_key = merge_data["left"] + "_" + merge_data["right"] + merge_cache_directory = cache_directory / "merge_results" - result = lookup_in_cache(cache_entry, repo_slug, merge_cache_prefix, True) - # if result is None: - # raise Exception(cache_entry," HERE") + result = lookup_in_cache(cache_key, repo_slug, merge_cache_directory, True) if result is not None and isinstance(result, dict): return result @@ -74,9 +70,9 @@ def merger( # pylint: disable=too-many-locals merge_tool.name, ) cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []} - # TODO: I suggest abstracting out the body of this loop as a function. + # TODO: I suggest abstracting out the body of this loop as a function. for i in range(N_REPETITIONS): - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( merge_status, merge_fingerprint, @@ -91,7 +87,7 @@ def merger( # pylint: disable=too-many-locals timeout=TIMEOUT_MERGING, ) log_file: Path = ( - cache_prefix + cache_directory / "merge_logs" / slug_repo_name(repo_slug) / merge_data["left"] @@ -132,11 +128,34 @@ def merger( # pylint: disable=too-many-locals else: assert cache_data["left_tree_fingerprint"] == left_fingerprint assert cache_data["right_tree_fingerprint"] == right_fingerprint - del repo - set_in_cache(cache_entry, cache_data, repo_slug, merge_cache_prefix) + set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory) return cache_data +def check_if_two_merges_differ(cache_data: dict) -> bool: + """Returns true if two merge tools differ on the same merge. + Args: + cache_data (dict): A dictionary containing the merge data. + Returns: + bool: True if two merge tools differ on the same merge. 
+    """
+    for merge_tool1 in MERGE_TOOL:
+        for merge_tool2 in MERGE_TOOL:
+            if is_merge_success(
+                cache_data[merge_tool1.name]["results"][0]
+            ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
+                if (
+                    cache_data[merge_tool1.name]["merge_fingerprint"]
+                    != cache_data[merge_tool2.name]["merge_fingerprint"]
+                ):
+                    return True
+    return False
+
+
 if __name__ == "__main__":
     print("merge_tools_comparator: Start")
     parser = argparse.ArgumentParser()
@@ -228,22 +247,8 @@
         repo_slug = merger_arguments[i][0]
         merge_data = merger_arguments[i][1]
        cache_data = merger_results[i]
-        two_merge_tools_differ = False
-        # TODO: I suggest abstracting this loop into a function. That will also make it easier to return early, which can be done just with "return". (Currently it makes more iterations even after discovering that two merge tools do differ.)
-        for merge_tool1 in MERGE_TOOL:
-            for merge_tool2 in MERGE_TOOL:
-                if is_merge_success(
-                    cache_data[merge_tool1.name]["results"][0]
-                ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
-                    two_merge_tools_differ = True
-                if is_merge_success(
-                    cache_data[merge_tool1.name]["results"][0]
-                ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
-                    if (
-                        cache_data[merge_tool1.name]["merge_fingerprint"]
-                        != cache_data[merge_tool2.name]["merge_fingerprint"]
-                    ):
-                        two_merge_tools_differ = True
+        two_merge_tools_differ = check_if_two_merges_differ(cache_data)
+
         merge_data["two merge tools differ"] = two_merge_tools_differ
         merge_data["left_tree_fingerprint"] = cache_data["left_tree_fingerprint"]
         merge_data["right_tree_fingerprint"] = cache_data["right_tree_fingerprint"]
diff --git a/src/python/repo.py b/src/python/repo.py
index 04c6e88c72..c332c564c3 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -76,12 +76,10 @@ def compute_explanation(
 def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]:
     """Returns the result of run_repo_tests.sh on the given working copy.
-    TODO: What is the definition of "the entire test"? How is it different than the test process?
-    If the test process passes then the entire test is marked as passed.
-    If the test process timeouts then the entire test is marked as timeout.
+    If the test process passes then the function returns and marks it as passed.
+    If the test process times out then the function returns and marks it as timed out.
     Args:
-        TODO: Minor: Throughout, you don't need "The path of" at the beginning of comments. Or change it to "The directory of".
-        wcopy_dir (Path): The path of the working copy (the clone).
+        wcopy_dir (Path): The directory of the working copy (the clone).
         timeout (int): Test timeout limit, in seconds.
     Returns:
         TEST_STATE: The result of the test.
@@ -114,23 +112,23 @@ class Repository:
     def __init__(
         self,
         repo_slug: str,
-        cache_prefix: Path = Path(""),
+        cache_directory: Path = Path(""),
     ) -> None:
         """Initializes the repository.
         Args:
             repo_slug (str): The slug of the repository, which is "owner/reponame".
-            cache_prefix (Path): The prefix of the cache.
+            cache_directory (Path): The directory in which the repository's caches are stored.
""" self.repo_slug = repo_slug self.path = REPOS_PATH / repo_slug.split("/")[1] workdir_id = uuid.uuid4().hex - self.workdir = WORKDIR_PREFIX / workdir_id + self.workdir = WORKDIR_DIRECTORY / workdir_id self.workdir.mkdir(parents=True, exist_ok=True) self.repo_path = self.workdir / self.path.name shutil.copytree(self.path, self.repo_path) self.repo = Repo(self.repo_path) - self.test_cache_prefix = cache_prefix / "test_cache" - self.sha_cache_prefix = cache_prefix / "sha_cache_entry" + self.test_cache_directory = cache_directory / "test_cache" + self.sha_cache_directory = cache_directory / "sha_cache_entry" def checkout(self, commit: str) -> Tuple[bool, str]: """Checks out the given commit. @@ -154,13 +152,13 @@ def checkout(self, commit: str) -> Tuple[bool, str]: + str(e) ) cache_entry = {"sha": None, "explanation": explanation} - set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix) + set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory) return False, explanation cache_entry = { "sha": self.compute_tree_fingerprint(), "explanation": explanation, } - set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix) + set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory) return True, explanation def _merge_and_test( # pylint: disable=too-many-arguments @@ -190,11 +188,8 @@ def _merge_and_test( # pylint: disable=too-many-arguments str: The tree fingerprint of the merge result. str: The left fingerprint. str: The right fingerprint. - TODO: Is this time to merge, time to test, or time to merge and test? - TODO: This is a ssingle number, but n_tests tells how many times to - TODO: run the test. Is this an average, median, overall, or something - TODO: else? (I have the same question elsewhere.) - float: The time it took to run the merge, in seconds. + float: The time it took to run the merge, in seconds. Does not include + the test time. """ ( merge_status, @@ -287,7 +282,7 @@ def create_branch( commit, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return None, explanation tree_fingerprint = self.compute_tree_fingerprint() @@ -335,7 +330,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( MERGE_STATE.Git_checkout_failed, @@ -387,7 +382,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, cache_entry, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( merge_status, @@ -421,18 +416,18 @@ def compute_tree_fingerprint(self) -> str: def get_sha_cache_entry( self, commit: str, start_merge: bool = False ) -> Union[None, dict]: - # TODO: is a SHA cache a cache from a SHA to something, or a cache from something to a SHA? - """Gets a SHA cache entry. + """The SHA cache maps a commit to a tree fingerprint. + This function checks if the commit is in the cache. Args: commit (str): The commit to check. start_merge (bool, optional) = False: Whether to indicate that the merge starts if the commit is not in the cache. Returns: - # TODO: What is the structure of the dict this returns? - Union[None,dict]: The cache entry if the commit is in the cache, None otherwise. + Union[None, dict]: The cache entry if the commit is in the cache, None otherwise. + The dict contains the sha entry and an explanation entry. 
""" cache = lookup_in_cache( - commit, self.repo_slug, self.sha_cache_prefix, set_run=start_merge + commit, self.repo_slug, self.sha_cache_directory, set_run=start_merge ) if cache is None: return None @@ -453,7 +448,7 @@ def get_test_cache_entry( None otherwise. """ cache = lookup_in_cache( - sha, self.repo_slug, self.test_cache_prefix, set_run=start_test + sha, self.repo_slug, self.test_cache_directory, set_run=start_test ) if cache is None: return None @@ -511,10 +506,8 @@ def checkout_and_test( def test(self, timeout: int, n_tests: int) -> TEST_STATE: """Tests the repository. The test results of multiple runs is combined into one result. If one of the runs passes then the entire test is marked as passed. - # TODO: Should this line be preceded by "otherwise"? Is it possible that one run passes and another times out? - # TODO: I would think the test should be marked as timeout only if every run is timeout. If one of the runs timeouts then the entire test is marked as timeout. - # TODO: Otherwise the test is marked as failure? + Otherwise all runs must fail for the entire test to be marked as failed. Args: timeout (int): The timeout limit, in seconds. n_tests (int): The number of times to run the test suite. @@ -534,7 +527,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: test_state, test_output = repo_test(self.repo_path, timeout) test_log_file = Path( os.path.join( - self.test_cache_prefix, + self.test_cache_directory, "logs", slug_repo_name(self.repo_slug), sha + "_" + str(i) + ".log", @@ -551,7 +544,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: if test_state in (TEST_STATE.Tests_passed, TEST_STATE.Tests_timedout): break - set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_prefix) + set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_directory) return TEST_STATE[cache_data["test_result"]] def __del__(self) -> None: diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py index 3604942293..714007b2f1 100755 --- a/src/python/test_repo_heads.py +++ b/src/python/test_repo_heads.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -# TODO: What does it mean to "validate it"? -"""Tests the HEAD commits of multiple repos and validates it if the test passes. +"""Tests the HEAD commits of multiple repos and considers them as valid if the test passes. usage: python3 test_repo_heads.py --repos_csv_with_hashes --output_path @@ -43,7 +42,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE: repo_slug = repo_info["repository"] print("test_repo_heads:", repo_slug, ": head_passes_tests : started") - repo = Repository(repo_slug, cache_prefix=cache) + repo = Repository(repo_slug, cache_directory=cache) test_state = repo.checkout_and_test( repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3 ) diff --git a/src/python/variables.py b/src/python/variables.py index 1ce260b1f0..d81c733af9 100644 --- a/src/python/variables.py +++ b/src/python/variables.py @@ -9,8 +9,9 @@ CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds DELETE_WORKDIRS = True REPOS_PATH = Path("repos") -# TODO: What is this used for? Please give it a more descriptive name. -WORKDIR_PREFIX = Path(".workdir") +WORKDIR_DIRECTORY = Path( + ".workdir" +) # Merges and testing will be performed in this directory. 
TIMEOUT_MERGING = 60 * 15 # 15 minutes, in seconds N_REPETITIONS = 3 diff --git a/src/scripts/merge_tools/intellimerge.sh b/src/scripts/merge_tools/intellimerge.sh index c79bd27073..f4fa8d157c 100755 --- a/src/scripts/merge_tools/intellimerge.sh +++ b/src/scripts/merge_tools/intellimerge.sh @@ -21,8 +21,7 @@ intellimerge_absolutepath="${ROOT_DIR}/${intellimerge_relativepath}" clone_dir=$1 branch1=$2 branch2=$3 -# TODO: Will this be in the repository? Would it be safer to put it under /tmp or /scratch? -temp_dir=".workdir/intelli_temp_$$/" +temp_dir="/tmp/intelli_temp_$$/" mkdir $temp_dir # run intellimerge diff --git a/src/scripts/utils/run_multiple_machines.sh b/src/scripts/utils/run_multiple_machines.sh index 2c2b7a5351..3db4743239 100755 --- a/src/scripts/utils/run_multiple_machines.sh +++ b/src/scripts/utils/run_multiple_machines.sh @@ -1,6 +1,5 @@ #!/bin/bash -# TODO: A branch of what? This repository? (Same question in `run_remotely.sh`.) -# Run the code for a certain branch on multiple remote machines. +# Run the code of a specific branch of this repository on multiple remote machines. # Usage: ./run_multiple_machines.sh # is the branch to run # is the path to a file containing the addresses of the machines (one address per line) diff --git a/src/scripts/utils/run_remotely.sh b/src/scripts/utils/run_remotely.sh index 44b60dd87c..86f9751ef8 100755 --- a/src/scripts/utils/run_remotely.sh +++ b/src/scripts/utils/run_remotely.sh @@ -1,5 +1,5 @@ #!/bin/sh -# Run the code for a certain branch on a remote machine. +# Run the code of a specific branch of this repository on a remote machine. # Usage: ./run_remotely.sh # is the branch to run # is the ssh address of the machine
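The intellimerge.sh change above moves the tool's scratch space from the repository working tree to a per-process directory under /tmp. A Python counterpart of the same idea, illustrative only and not part of the patch:

    import shutil
    import tempfile

    # tempfile.mkdtemp creates a uniquely named directory outside the
    # repository, playing the role of /tmp/intelli_temp_$$ without relying
    # on the shell PID for uniqueness.
    temp_dir = tempfile.mkdtemp(prefix="intelli_temp_")
    try:
        pass  # run the merge tool with temp_dir as its scratch directory
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)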