diff --git a/run.sh b/run.sh
index 9b1c1594a9..bc592a5872 100755
--- a/run.sh
+++ b/run.sh
@@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then
     find "$CACHE_DIR" -name "*.lock" -delete
 fi
 
-python3 src/python/clean_cache_placeholders.py \
+python3 src/python/delete_cache_placeholders.py \
     --cache_dir "$CACHE_DIR"
 
 python3 src/python/write_head_hashes.py \
diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
index eab41d6f91..4b8cea5855 100755
--- a/src/python/cache_utils.py
+++ b/src/python/cache_utils.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
-""" Contains all the functions related to the caches.
-There will be 4 caches which are stored on disk after running the script:
+""" Contains all the functions related to the caches. Each cache is interacted
+with only through the functions in this file. The caches are all JSON files,
+stored in the cache directory.
+There will be 4 caches in total which are stored on disk after running the run.sh script:
 1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
 2) cache/test_cache: A cache that maps a sha256 to test results.
 3) cache/merge_results: A cache that maps a merge to the result
@@ -28,88 +30,12 @@ def slug_repo_name(repo_slug: str) -> str:
     return repo_slug.split("/")[1]
 
 
-def get_cache_lock(repo_slug: str, cache_prefix: Path):
-    """Returns a lock for the cache of a repository.
-    Initially the lock is unlocked; the caller must explictly
-    lock and unlock the lock.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        fasteners.InterProcessLock: A lock for the repository.
-    """
-    lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock")
-    lock_path.parent.mkdir(parents=True, exist_ok=True)
-    lock = fasteners.InterProcessLock(lock_path)
-    return lock
-
-
-def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
-    """Returns the path to the cache file.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        Path: The path to the cache file.
-    """
-    cache_file_name = slug_repo_name(repo_slug) + ".json"
-    cache_path = cache_prefix / cache_file_name
-    cache_path.parent.mkdir(parents=True, exist_ok=True)
-    return cache_path
-
-
-def is_in_cache(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> bool:
-    """Checks if the key is in the cache.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path prefix to the cache directory.
-    Returns:
-        bool: True if the repository is in the cache, False otherwise.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-    """Loads the cache.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    if not cache_path.exists():
-        return {}
-    with open(cache_path, "r") as f:
-        cache_data = json.load(f)
-    return cache_data
-
-
-def cache_lookup(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-    """Loads the cache and returns the value for the given key.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache[cache_key]
-
-
-def write_to_cache(
+def set_in_cache(
     cache_key: Union[Tuple, str],
     cache_value: Union[str, dict, None],
     repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
+    cache_directory: Path,
+    acquire_lock: bool = True,
 ) -> None:
     """Puts an entry in the cache, then writes the cache to disk.
-    This function is not thread-safe.
+    This function is thread-safe unless acquire_lock is False.
@@ -117,70 +43,48 @@ def write_to_cache(
     Args:
         cache_key (Union[Tuple,str]): The key to check.
         cache_value (dict): The value to write.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
-            If False, attempting to overwrite an existing cache entry throws an exception.
+        cache_directory (Path): The path to the cache directory.
+        acquire_lock (bool, optional) = True: Whether to acquire the cache lock before writing.
     """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    cache = load_cache(repo_slug, cache_prefix)
-    if cache_prefix != Path("cache-small/sha_cache_entry"):
-        print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
-    if cache_key in cache and not overwrite:
-        raise ValueError("Cache key already exists")
+    if acquire_lock:
+        lock = get_cache_lock(repo_slug, cache_directory)
+        lock.acquire()
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    cache = load_cache(repo_slug, cache_directory)
     cache[cache_key] = cache_value
     output = json.dumps(cache, indent=4)
     with open(cache_path, "w") as f:
         f.write(output)
         f.flush()
-
-
-def set_in_cache(
-    cache_key: Union[Tuple, str],
-    cache_value: Union[str, dict, None],
-    repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
-) -> None:
-    """Puts an entry in the cache, then writes the cache to disk.
-    This function is thread-safe.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        cache_value (dict): The value to write.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-    """
-    lock = get_cache_lock(repo_slug, cache_prefix)
-    lock.acquire()
-    write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-    lock.release()
+    if acquire_lock:
+        lock.release()  # type: ignore
 
 
 def lookup_in_cache(
     cache_key: Union[Tuple, str],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     set_run: bool = False,
 ) -> Union[str, dict, None]:
     """Checks if the cache is available and loads a specific entry.
     Args:
         cache_key (Union[Tuple,str]): The key to check.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         set_run (bool, optional) = False: Whether to insert an empty cache entry
             if it does not exist. This is useful for preventing multiple runs from
             attempting to insert the same cache entry.
     Returns:
         Union[dict,None]: The cache entry if it exists, None otherwise.
""" - lock = get_cache_lock(repo_slug, cache_prefix) + lock = get_cache_lock(repo_slug, cache_directory) lock.acquire() - cache_entry = get_cache_path(repo_slug, cache_prefix) + cache_entry = get_cache_path(repo_slug, cache_directory) cache_entry.parent.mkdir(parents=True, exist_ok=True) - if is_in_cache(cache_key, repo_slug, cache_prefix): + if is_in_cache(cache_key, repo_slug, cache_directory): total_time = 0 while True: - cache_data = cache_lookup(cache_key, repo_slug, cache_prefix) + cache = load_cache(repo_slug, cache_directory) + cache_data = cache[cache_key] if cache_data is not None: break lock.release() @@ -192,6 +96,70 @@ def lookup_in_cache( lock.release() return cache_data if set_run: - write_to_cache(cache_key, None, repo_slug, cache_prefix) + set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False) lock.release() return None + + +# ====================== Internal functions ====================== + + +def get_cache_lock(repo_slug: str, cache_directory: Path): + """Returns a lock for the cache of a repository. + Initially the lock is unlocked; the caller must explictly + lock and unlock the lock. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path to the cache directory. + Returns: + fasteners.InterProcessLock: A lock for the repository. + """ + lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock") + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock = fasteners.InterProcessLock(lock_path) + return lock + + +def get_cache_path(repo_slug: str, cache_directory: Path) -> Path: + """Returns the path to the cache file. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path to the cache directory. + Returns: + Path: The path to the cache file. + """ + cache_file_name = slug_repo_name(repo_slug) + ".json" + cache_path = cache_directory / cache_file_name + cache_path.parent.mkdir(parents=True, exist_ok=True) + return cache_path + + +def is_in_cache( + cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path +) -> bool: + """Checks if the key is in the cache. + Args: + cache_key (Union[Tuple,str]): The key to check. + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path prefix to the cache directory. + Returns: + bool: True if the repository is in the cache, False otherwise. + """ + cache = load_cache(repo_slug, cache_directory) + return cache_key in cache + + +def load_cache(repo_slug: str, cache_directory: Path) -> dict: + """Loads the cache associated to the repo_slug found in the cache directory. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path to the cache directory. + Returns: + dict: The cache. + """ + cache_path = get_cache_path(repo_slug, cache_directory) + if not cache_path.exists(): + return {} + with open(cache_path, "r") as f: + cache_data = json.load(f) + return cache_data diff --git a/src/python/clean_cache_placeholders.py b/src/python/delete_cache_placeholders.py similarity index 81% rename from src/python/clean_cache_placeholders.py rename to src/python/delete_cache_placeholders.py index bf0f4b5cd1..22acd35c11 100755 --- a/src/python/clean_cache_placeholders.py +++ b/src/python/delete_cache_placeholders.py @@ -6,10 +6,10 @@ result. This script deletes all placeholders from the cache. 
 Usage:
-    python clean_cache_placeholders.py --cache_path
+    python delete_cache_placeholders.py --cache_directory
 
 Args:
-    cache_path (str): Path to cache directory
+    cache_directory (str): Path to cache directory
 """
 
 from argparse import ArgumentParser
@@ -23,10 +23,10 @@
     )
     args = parser.parse_args()
 
-    cache_path = Path(args.cache_dir)
+    cache_directory = Path(args.cache_dir)
 
     n_deleted = 0
-    for file in cache_path.glob("**/*.json"):
+    for file in cache_directory.glob("**/*.json"):
         with open(file, "r") as f:
             data = json.load(f)
diff --git a/src/python/latex_output.py b/src/python/latex_output.py
index 6dee4cf18f..54b01bedff 100755
--- a/src/python/latex_output.py
+++ b/src/python/latex_output.py
@@ -14,9 +14,9 @@ following input files:
 - full_repos_csv: csv file containing the full list of repositories
 - repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
-- tested_merges_path: path to the folder containing the merge results
-- merges_path: path to the folder containing all found merges.
-- output_dir: path to the folder where the LaTeX files will be saved
+- tested_merges_path: path to the directory containing the merge results
+- merges_path: path to the directory containing all found merges.
+- output_dir: path to the directory where the LaTeX files will be saved
 """
 
 
@@ -76,7 +76,7 @@ def latex_def(name, value) -> str:
 
 
 # Dictionary that lists the different subsets of merge tools for which plots
-# and tables are generated. The key is the folder name which will contain all figures
+# and tables are generated. The key is the directory name which will contain all figures
 # that will be used and the value is the list of plots to contain.
 PLOTS = {
     "all": [merge_tool.name for merge_tool in MERGE_TOOL],
diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py
index bc7abeef03..cd8e3a60fa 100755
--- a/src/python/merge_differ.py
+++ b/src/python/merge_differ.py
@@ -4,8 +4,10 @@
     --merges_path
     --cache_dir
 This script compares the results of two merge tools on the same merge.
-It outputs the diff any two merge results if their results differ.
-It ignores merges that have the same result or merges that are not successful.
+It outputs the diff of the two merge results if their results differ.
+It does not produce output for merges that have the same result or merges that
+are not successful.
+The output is written to cache_dir/merge_diffs.
 """
 
 import os
@@ -30,7 +32,7 @@
 
 
 def get_merge_fingerprint(
-    merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path
+    merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_directory: Path
 ) -> Union[Tuple[None, None], Tuple[Repository, str]]:
     """Returns the repo and the fingerprint of a merge,
     or (None, None) if the merge is not successful.
@@ -49,7 +51,7 @@ def get_merge_fingerprint(
         return None, None
     left = merge_data["left"]
     right = merge_data["right"]
-    repo = Repository(repo_slug, cache_prefix=cache_prefix)
+    repo = Repository(repo_slug, cache_directory=cache_directory)
     (
         merge_status,
         merge_fingerprint,
@@ -79,36 +81,34 @@ def get_merge_fingerprint(
 
 
 def merge_differ(args: Tuple[pd.Series, Path]) -> None:
-    """Diffs the results of any two merge tools on the same merge.
-    Does not diff merges that have the same result or merges that are not successful.
+    """Diffs the results of each pair of merge tools on the same merge.
+    Writes the diff to a file.
     Args:
         args (Tuple[pd.Series,Path]): A tuple containing the merge data and the cache prefix.
-    Returns:
-        dict: The result of the test.
""" - merge_data, cache_prefix = args + merge_data, cache_directory = args if not merge_data["parents pass"]: return - diff_file_prefix = cache_prefix / "merge_diffs" - diff_file_prefix.mkdir(parents=True, exist_ok=True) + diff_file_directory = cache_directory / "merge_diffs" + diff_file_directory.mkdir(parents=True, exist_ok=True) for merge_tool1 in MERGE_TOOL: repo1, merge_fingerprint1 = get_merge_fingerprint( - merge_data, merge_tool1, cache_prefix + merge_data, merge_tool1, cache_directory ) if repo1 is None or merge_fingerprint1 is None: continue for merge_tool2 in MERGE_TOOL: repo2, merge_fingerprint2 = get_merge_fingerprint( - merge_data, merge_tool2, cache_prefix + merge_data, merge_tool2, cache_directory ) if repo2 is None or merge_fingerprint2 is None: continue - diff_file = diff_file_prefix / diff_file_name( + diff_file = diff_file_directory / diff_file_name( merge_fingerprint1, merge_fingerprint2 ) if diff_file.exists(): @@ -118,8 +118,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None: command.append(str(repo2.repo_path)) with open(diff_file, "w") as f: subprocess.run(command, stdout=f, stderr=f) - del repo2 - del repo1 def diff_file_name(sha1: str, sha2: str) -> Path: @@ -132,8 +130,8 @@ def diff_file_name(sha1: str, sha2: str) -> Path: """ # Use lexicographic order to prevent duplicates if sha1 < sha2: - return Path(sha1 + "_" + sha2 + ".txt") - return Path(sha2 + "_" + sha1 + ".txt") + return Path(sha1 + "_" + sha2 + ".diff") + return Path(sha2 + "_" + sha1 + ".diff") if __name__ == "__main__": diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index c880493ab4..cf2a396148 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -42,7 +42,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: Returns: pd.Series: The result of the test. """ - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args while psutil.cpu_percent() > 90: print( "merge_tester: Waiting for CPU load to come down", @@ -55,8 +55,9 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data["parents pass"] = False for branch in ["left", "right"]: - repo = Repository(repo_slug, cache_prefix=cache_prefix) - repo.checkout(merge_data[branch]) + commit_sha = merge_data[branch] + repo = Repository(repo_slug, cache_directory=cache_directory) + repo.checkout(commit_sha) tree_fingerprint = repo.compute_tree_fingerprint() if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]: raise Exception( @@ -73,13 +74,12 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data[f"{branch} test result"] = test_result.name if test_result != TEST_STATE.Tests_passed: return merge_data - del repo merge_data["parents pass"] = True for merge_tool in MERGE_TOOL: if is_merge_success(merge_data[merge_tool.name]): - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( result, merge_fingerprint, @@ -108,11 +108,58 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: ) # Update the status from merge success to test result. merge_data[merge_tool.name] = result.name - del repo assert merge_tool.name in merge_data return merge_data +def build_arguments( + args: argparse.Namespace, + repo_slug: str, +) -> list: + """Builds the arguments for the merge_tester function. + Args: + args (argparse.Namespace): The arguments of the script. + repo_slug (str): The slug of the repository. 
+    Returns:
+        list: The arguments for the merge_tester function.
+    """
+    merge_list_file = Path(
+        os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
+    )
+    output_file = Path(
+        os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
+    )
+    if not merge_list_file.exists():
+        print(
+            "merge_tester:",
+            repo_slug,
+            "does not have a list of merges. Missing file: ",
+            merge_list_file,
+        )
+        return []
+
+    if output_file.exists():
+        print(
+            "merge_tester: Skipping",
+            repo_slug,
+            "because it is already computed.",
+        )
+        return []
+    try:
+        merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
+    except pd.errors.EmptyDataError:
+        print(
+            "merge_tester: Skipping",
+            repo_slug,
+            "because it does not contain any merges.",
+        )
+        return []
+    return [
+        (repo_slug, merge_data, Path(args.cache_dir))
+        for _, merge_data in merges.iterrows()
+    ]
+
+
 def main():  # pylint: disable=too-many-locals,too-many-statements
     """Main function"""
     print("merge_tester: Start")
@@ -131,41 +178,7 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
     merge_tester_arguments = []
     for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
         repo_slug = repository_data["repository"]
-        merge_list_file = Path(
-            os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
-        )
-        output_file = Path(
-            os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
-        )
-        if not merge_list_file.exists():
-            print(
-                "merge_tester:",
-                repo_slug,
-                "does not have a list of merges. Missing file: ",
-                merge_list_file,
-            )
-            continue
-
-        if output_file.exists():
-            print(
-                "merge_tester: Skipping",
-                repo_slug,
-                "because it is already computed.",
-            )
-            continue
-        try:
-            merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
-        except pd.errors.EmptyDataError:
-            print(
-                "merge_tester: Skipping",
-                repo_slug,
-                "because it does not contain any merges.",
-            )
-            continue
-        merge_tester_arguments += [
-            (repo_slug, merge_data, Path(args.cache_dir))
-            for _, merge_data in merges.iterrows()
-        ]
+        merge_tester_arguments += build_arguments(args, repo_slug)
 
     # Shuffle input to reduce cache contention
     random.seed(42)
diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py
index 1818263a59..d149730d9d 100755
--- a/src/python/merge_tools_comparator.py
+++ b/src/python/merge_tools_comparator.py
@@ -51,14 +51,12 @@ def merger(  # pylint: disable=too-many-locals
     Returns:
         dict: A dictionary containing the merge result.
""" - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args - cache_entry = merge_data["left"] + "_" + merge_data["right"] - merge_cache_prefix = cache_prefix / "merge_results" + cache_key = merge_data["left"] + "_" + merge_data["right"] + merge_cache_directory = cache_directory / "merge_results" - result = lookup_in_cache(cache_entry, repo_slug, merge_cache_prefix, True) - # if result is None: - # raise Exception(cache_entry," HERE") + result = lookup_in_cache(cache_key, repo_slug, merge_cache_directory, True) if result is not None and isinstance(result, dict): return result @@ -71,9 +69,20 @@ def merger( # pylint: disable=too-many-locals merge_data["right"], merge_tool.name, ) - cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []} + cache_data[merge_tool.name] = {"results": [], "run_time": []} + log_file: Path = ( + cache_directory + / "merge_logs" + / slug_repo_name(repo_slug) + / merge_data["left"] + / merge_data["right"] + / (merge_tool.name + f".log") + ) + log_file.parent.mkdir(parents=True, exist_ok=True) + log_file.unlink(missing_ok=True) + for i in range(N_REPETITIONS): - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( merge_status, merge_fingerprint, @@ -87,53 +96,111 @@ def merger( # pylint: disable=too-many-locals right_commit=merge_data["right"], timeout=TIMEOUT_MERGING, ) - log_file: Path = ( - cache_prefix - / "merge_logs" - / slug_repo_name(repo_slug) - / merge_data["left"] - / merge_data["right"] - / (merge_tool.name + f".log") - ) - log_file.parent.mkdir(parents=True, exist_ok=True) - if log_file.exists(): - log_file.unlink() - with open(log_file, "w") as f: - f.write(explanation) + if i == 0: # Only write the log file once. + with open(log_file, "w") as f: + f.write(explanation) + cache_data[merge_tool.name]["results"].append(merge_status.name) cache_data[merge_tool.name]["log_file"] = str(log_file) cache_data[merge_tool.name]["run_time"].append(run_time) if "merge_fingerprint" not in cache_data[merge_tool.name]: cache_data[merge_tool.name]["merge_fingerprint"] = merge_fingerprint - else: - if ( - cache_data[merge_tool.name]["merge_fingerprint"] - != merge_fingerprint - ): - raise Exception( - "merge_tools_comparator: Merge fingerprint mismatch", - i, - repo_slug, - merge_data["left"], - merge_data["right"], - merge_tool.name, - merge_status, - cache_data[merge_tool.name]["merge_fingerprint"], - merge_fingerprint, - cache_data, - ) - - if "left_tree_fingerprint" not in cache_data: cache_data["left_tree_fingerprint"] = left_fingerprint cache_data["right_tree_fingerprint"] = right_fingerprint else: + assert ( + cache_data[merge_tool.name]["merge_fingerprint"] + == merge_fingerprint + ) assert cache_data["left_tree_fingerprint"] == left_fingerprint assert cache_data["right_tree_fingerprint"] == right_fingerprint - del repo - set_in_cache(cache_entry, cache_data, repo_slug, merge_cache_prefix) + + set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory) return cache_data +def check_if_two_merges_differ(cache_data: dict) -> bool: + """Returns true if two merge tools differ on the same merge. + Args: + cache_data (dict): A dictionary containing the merge data. + Returns: + bool: True if two merge tools differ on the same merge. 
+ """ + for merge_tool1 in MERGE_TOOL: + for merge_tool2 in MERGE_TOOL: + if is_merge_success( + cache_data[merge_tool1.name]["results"][0] + ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]): + return True + if is_merge_success( + cache_data[merge_tool1.name]["results"][0] + ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]): + if ( + cache_data[merge_tool1.name]["merge_fingerprint"] + != cache_data[merge_tool2.name]["merge_fingerprint"] + ): + return True + return False + + +def build_merge_arguments(args: argparse.Namespace, repo_slug: str): + """ + Creates the arguments for the merger function. + Args: + args (argparse.Namespace): The arguments to the script. + repo_slug (str): The repository slug. + Returns: + list: A list of arguments for the merger function. + """ + merge_list_file = Path( + os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") + ) + output_file = Path( + os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") + ) + if not merge_list_file.exists(): + raise Exception( + "merge_tools_comparator:", + repo_slug, + "does not have a list of merges. Missing file: ", + merge_list_file, + ) + + if output_file.exists(): + print( + "merge_tools_comparator: Skipping", + repo_slug, + "because it is already computed.", + ) + return [] + + merges = pd.read_csv( + merge_list_file, + names=["idx", "branch_name", "merge", "left", "right", "notes"], + dtype={ + "idx": int, + "branch_name": str, + "merge": str, + "left": str, + "right": str, + "notes": str, + }, + header=0, + index_col="idx", + ) + merges["notes"].replace(np.nan, "", inplace=True) + + if args.only_trivial_merges: + merges = merges[merges["notes"].str.contains("a parent is the base")] + elif not args.include_trivial_merges: + merges = merges[~merges["notes"].str.contains("a parent is the base")] + arguments = [ + (repo_slug, merge_data, Path(args.cache_dir)) + for _, merge_data in merges.iterrows() + ] + return arguments + + if __name__ == "__main__": print("merge_tools_comparator: Start") parser = argparse.ArgumentParser() @@ -152,55 +219,8 @@ def merger( # pylint: disable=too-many-locals print("merge_tools_comparator: Constructing Inputs") merger_arguments = [] for _, repository_data in tqdm(repos.iterrows(), total=len(repos)): - merges_repo = [] repo_slug = repository_data["repository"] - merge_list_file = Path( - os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") - ) - output_file = Path( - os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") - ) - if not merge_list_file.exists(): - raise Exception( - "merge_tools_comparator:", - repo_slug, - "does not have a list of merges. 
Missing file: ", - merge_list_file, - ) - - if output_file.exists(): - print( - "merge_tools_comparator: Skipping", - repo_slug, - "because it is already computed.", - ) - continue - - merges = pd.read_csv( - merge_list_file, - names=["idx", "branch_name", "merge", "left", "right", "notes"], - dtype={ - "idx": int, - "branch_name": str, - "merge": str, - "left": str, - "right": str, - "notes": str, - }, - header=0, - index_col="idx", - ) - merges["notes"].replace(np.nan, "", inplace=True) - - if args.only_trivial_merges: - merges = merges[merges["notes"].str.contains("a parent is the base")] - elif not args.include_trivial_merges: - merges = merges[~merges["notes"].str.contains("a parent is the base")] - - merger_arguments += [ - (repo_slug, merge_data, Path(args.cache_dir)) - for _, merge_data in merges.iterrows() - ] + merger_arguments += build_merge_arguments(args, repo_slug) # Shuffle input to reduce cache contention random.seed(42) @@ -224,21 +244,8 @@ def merger( # pylint: disable=too-many-locals repo_slug = merger_arguments[i][0] merge_data = merger_arguments[i][1] cache_data = merger_results[i] - two_merge_tools_differ = False - for merge_tool1 in MERGE_TOOL: - for merge_tool2 in MERGE_TOOL: - if is_merge_success( - cache_data[merge_tool1.name]["results"][0] - ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]): - two_merge_tools_differ = True - if is_merge_success( - cache_data[merge_tool1.name]["results"][0] - ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]): - if ( - cache_data[merge_tool1.name]["merge_fingerprint"] - != cache_data[merge_tool2.name]["merge_fingerprint"] - ): - two_merge_tools_differ = True + two_merge_tools_differ = check_if_two_merges_differ(cache_data) + merge_data["two merge tools differ"] = two_merge_tools_differ merge_data["left_tree_fingerprint"] = cache_data["left_tree_fingerprint"] merge_data["right_tree_fingerprint"] = cache_data["right_tree_fingerprint"] diff --git a/src/python/repo.py b/src/python/repo.py index c886456224..c332c564c3 100755 --- a/src/python/repo.py +++ b/src/python/repo.py @@ -76,10 +76,10 @@ def compute_explanation( def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]: """Returns the result of run_repo_tests.sh on the given working copy. - If the test process passes then the entire test is marked as passed. - If the test process timeouts then the entire test is marked as timeout. + If the test process passes then the function returns and marks it as passed. + If the test process timeouts then the function returns and marks it as timedout. Args: - wcopy_dir (Path): The path of the working copy (the clone). + wcopy_dir (Path): The directory of the working copy (the clone). timeout (int): Test timeout limit, in seconds. Returns: TEST_STATE: The result of the test. @@ -112,23 +112,23 @@ class Repository: def __init__( self, repo_slug: str, - cache_prefix: Path = Path(""), + cache_directory: Path = Path(""), ) -> None: """Initializes the repository. Args: repo_slug (str): The slug of the repository, which is "owner/reponame". - cache_prefix (Path): The prefix of the cache. + cache_directory (Path): The prefix of the cache. 
""" self.repo_slug = repo_slug self.path = REPOS_PATH / repo_slug.split("/")[1] workdir_id = uuid.uuid4().hex - self.workdir = WORKDIR_PREFIX / workdir_id + self.workdir = WORKDIR_DIRECTORY / workdir_id self.workdir.mkdir(parents=True, exist_ok=True) self.repo_path = self.workdir / self.path.name shutil.copytree(self.path, self.repo_path) self.repo = Repo(self.repo_path) - self.test_cache_prefix = cache_prefix / "test_cache" - self.sha_cache_prefix = cache_prefix / "sha_cache_entry" + self.test_cache_directory = cache_directory / "test_cache" + self.sha_cache_directory = cache_directory / "sha_cache_entry" def checkout(self, commit: str) -> Tuple[bool, str]: """Checks out the given commit. @@ -152,13 +152,13 @@ def checkout(self, commit: str) -> Tuple[bool, str]: + str(e) ) cache_entry = {"sha": None, "explanation": explanation} - set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix) + set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory) return False, explanation cache_entry = { "sha": self.compute_tree_fingerprint(), "explanation": explanation, } - set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix) + set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory) return True, explanation def _merge_and_test( # pylint: disable=too-many-arguments @@ -188,7 +188,8 @@ def _merge_and_test( # pylint: disable=too-many-arguments str: The tree fingerprint of the merge result. str: The left fingerprint. str: The right fingerprint. - float: The time it took to run the merge, in seconds. + float: The time it took to run the merge, in seconds. Does not include + the test time. """ ( merge_status, @@ -281,7 +282,7 @@ def create_branch( commit, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return None, explanation tree_fingerprint = self.compute_tree_fingerprint() @@ -329,7 +330,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( MERGE_STATE.Git_checkout_failed, @@ -381,7 +382,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, cache_entry, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( merge_status, @@ -415,16 +416,18 @@ def compute_tree_fingerprint(self) -> str: def get_sha_cache_entry( self, commit: str, start_merge: bool = False ) -> Union[None, dict]: - """Gets a SHA cache entry. + """The SHA cache maps a commit to a tree fingerprint. + This function checks if the commit is in the cache. Args: commit (str): The commit to check. start_merge (bool, optional) = False: Whether to indicate that the merge starts if the commit is not in the cache. Returns: - Union[None,dict]: The cache entry if the commit is in the cache, None otherwise. + Union[None, dict]: The cache entry if the commit is in the cache, None otherwise. + The dict contains the sha entry and an explanation entry. """ cache = lookup_in_cache( - commit, self.repo_slug, self.sha_cache_prefix, set_run=start_merge + commit, self.repo_slug, self.sha_cache_directory, set_run=start_merge ) if cache is None: return None @@ -445,7 +448,7 @@ def get_test_cache_entry( None otherwise. 
""" cache = lookup_in_cache( - sha, self.repo_slug, self.test_cache_prefix, set_run=start_test + sha, self.repo_slug, self.test_cache_directory, set_run=start_test ) if cache is None: return None @@ -504,6 +507,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: """Tests the repository. The test results of multiple runs is combined into one result. If one of the runs passes then the entire test is marked as passed. If one of the runs timeouts then the entire test is marked as timeout. + Otherwise all runs must fail for the entire test to be marked as failed. Args: timeout (int): The timeout limit, in seconds. n_tests (int): The number of times to run the test suite. @@ -523,7 +527,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: test_state, test_output = repo_test(self.repo_path, timeout) test_log_file = Path( os.path.join( - self.test_cache_prefix, + self.test_cache_directory, "logs", slug_repo_name(self.repo_slug), sha + "_" + str(i) + ".log", @@ -540,7 +544,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE: if test_state in (TEST_STATE.Tests_passed, TEST_STATE.Tests_timedout): break - set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_prefix) + set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_directory) return TEST_STATE[cache_data["test_result"]] def __del__(self) -> None: diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py index bd668230ac..714007b2f1 100755 --- a/src/python/test_repo_heads.py +++ b/src/python/test_repo_heads.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Tests the HEAD commits of multiple repos and validates it if the test passes. +"""Tests the HEAD commits of multiple repos and considers them as valid if the test passes. usage: python3 test_repo_heads.py --repos_csv_with_hashes --output_path @@ -42,7 +42,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE: repo_slug = repo_info["repository"] print("test_repo_heads:", repo_slug, ": head_passes_tests : started") - repo = Repository(repo_slug, cache_prefix=cache) + repo = Repository(repo_slug, cache_directory=cache) test_state = repo.checkout_and_test( repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3 ) diff --git a/src/python/variables.py b/src/python/variables.py index 519c0fb9f9..d81c733af9 100644 --- a/src/python/variables.py +++ b/src/python/variables.py @@ -9,7 +9,9 @@ CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds DELETE_WORKDIRS = True REPOS_PATH = Path("repos") -WORKDIR_PREFIX = Path(".workdir") +WORKDIR_DIRECTORY = Path( + ".workdir" +) # Merges and testing will be performed in this directory. 
 TIMEOUT_MERGING = 60 * 15  # 15 minutes, in seconds
 N_REPETITIONS = 3
diff --git a/src/scripts/merge_tools/git_hires_merge.sh b/src/scripts/merge_tools/git_hires_merge.sh
index b1912eb703..a463e936b0 100755
--- a/src/scripts/merge_tools/git_hires_merge.sh
+++ b/src/scripts/merge_tools/git_hires_merge.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./git-hires-merge.sh
+# usage: ./git_hires_merge.sh
 
 clone_dir=$1
 branch1=$2
diff --git a/src/scripts/merge_tools/gitmerge_ort.sh b/src/scripts/merge_tools/gitmerge_ort.sh
index 51947da10e..d8a695734b 100755
--- a/src/scripts/merge_tools/gitmerge_ort.sh
+++ b/src/scripts/merge_tools/gitmerge_ort.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-ort.sh
+# usage: ./gitmerge_ort.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh b/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh
index 2a40212493..ef1a5df5f8 100755
--- a/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh
+++ b/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-ort-ignorespace.sh
+# usage: ./gitmerge_ort_ignorespace.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_ort_imports.sh b/src/scripts/merge_tools/gitmerge_ort_imports.sh
index 221280a6e0..eb83295136 100755
--- a/src/scripts/merge_tools/gitmerge_ort_imports.sh
+++ b/src/scripts/merge_tools/gitmerge_ort_imports.sh
@@ -1,12 +1,21 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-ort.sh
+# usage: ./gitmerge_ort_imports.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
 branch1=$2
 branch2=$3
 strategy="-s ort"
-"$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"
+if "$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"; then
+    exit 0
+fi
 
-(cd "$clone_dir" && "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts)
+cd "$clone_dir" || exit 1
+if ! "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts; then
+    echo "Conflict"
+    git merge --abort
+    exit 1
+fi
+
+exit 0
diff --git a/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh b/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh
index 03770b22f7..41ee46a2e1 100755
--- a/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh
+++ b/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh
@@ -1,12 +1,21 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-ort-ignorespace.sh
+# usage: ./gitmerge_ort_imports_ignorespace.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
 branch1=$2
 branch2=$3
 strategy="-s ort -Xignore-space-change"
-"$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"
+if "$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"; then
+    exit 0
+fi
 
-(cd "$clone_dir" && "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts)
+cd "$clone_dir" || exit 1
+if ! "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts; then
+    echo "Conflict"
+    git merge --abort
+    exit 1
+fi
+
+exit 0
diff --git a/src/scripts/merge_tools/gitmerge_recursive_histogram.sh b/src/scripts/merge_tools/gitmerge_recursive_histogram.sh
index 8e239dd933..6453c6e00e 100755
--- a/src/scripts/merge_tools/gitmerge_recursive_histogram.sh
+++ b/src/scripts/merge_tools/gitmerge_recursive_histogram.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-recursive-histogram.sh
+# usage: ./gitmerge_recursive_histogram.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh b/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh
index 3535ace430..812d2dcd18 100755
--- a/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh
+++ b/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-recursive-histogram.sh
+# usage: ./gitmerge_recursive_ignorespace.sh
 
 MERGE_DIR="$(dirname "$0")"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_recursive_minimal.sh b/src/scripts/merge_tools/gitmerge_recursive_minimal.sh
index 4c2aefb8ef..e7d916539a 100755
--- a/src/scripts/merge_tools/gitmerge_recursive_minimal.sh
+++ b/src/scripts/merge_tools/gitmerge_recursive_minimal.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-recursive-minimal.sh
+# usage: ./gitmerge_recursive_minimal.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_recursive_myers.sh b/src/scripts/merge_tools/gitmerge_recursive_myers.sh
index df12c4306c..f95f093ba7 100755
--- a/src/scripts/merge_tools/gitmerge_recursive_myers.sh
+++ b/src/scripts/merge_tools/gitmerge_recursive_myers.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-recursive-myers.sh
+# usage: ./gitmerge_recursive_myers.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_recursive_patience.sh b/src/scripts/merge_tools/gitmerge_recursive_patience.sh
index 6130ebbb4c..77e5a8e449 100755
--- a/src/scripts/merge_tools/gitmerge_recursive_patience.sh
+++ b/src/scripts/merge_tools/gitmerge_recursive_patience.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh
 
-# usage: ./gitmerge-recursive-patience.sh
+# usage: ./gitmerge_recursive_patience.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_resolve.sh b/src/scripts/merge_tools/gitmerge_resolve.sh
index 928a97960a..859faf70e2 100755
--- a/src/scripts/merge_tools/gitmerge_resolve.sh
+++ b/src/scripts/merge_tools/gitmerge_resolve.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# usage: ./gitmerge-resolve.sh
+# usage: ./gitmerge_resolve.sh
 
 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/intellimerge.sh b/src/scripts/merge_tools/intellimerge.sh
index dfb5358036..f4fa8d157c 100755
--- a/src/scripts/merge_tools/intellimerge.sh
+++ b/src/scripts/merge_tools/intellimerge.sh
@@ -21,7 +21,7 @@ intellimerge_absolutepath="${ROOT_DIR}/${intellimerge_relativepath}"
 clone_dir=$1
 branch1=$2
 branch2=$3
-temp_dir=".workdir/intelli_temp_$$/"
+temp_dir="/tmp/intelli_temp_$$/"
 mkdir $temp_dir
 
 # run intellimerge
diff --git a/src/scripts/merge_tools/resolve-conflicts.py b/src/scripts/merge_tools/resolve-conflicts.py
new file mode 100755
index 0000000000..0266230039
--- /dev/null
+++ b/src/scripts/merge_tools/resolve-conflicts.py
@@ -0,0 +1,285 @@
+#! /usr/bin/env python
+
+# This is a helper script for `resolve-adjacent-conflicts` and
+# `resolve-import-conflicts`.
+
+"""Edits a file in place to remove certain conflict markers.
+
+Usage: resolve-conflicts.py [options]
+
+--java_imports: Resolves conflicts related to Java import statements.
+The output includes every `import` statement that is in either of the parents.
+
+--adjacent_lines: Resolves conflicts on adjacent lines, by accepting both edits.
+
+Exit status is 0 (success) if no conflicts remain.
+Exit status is 1 (failure) if conflicts remain.
+"""
+
+from argparse import ArgumentParser
+import itertools
+import os
+import shutil
+import sys
+import tempfile
+from typing import List, Union, Tuple
+
+
+def main():  # pylint: disable=too-many-locals
+    """The main entry point."""
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument("filename")
+    arg_parser.add_argument(
+        "--java_imports",
+        action="store_true",
+        help="If set, resolve conflicts related to Java import statements",
+    )
+    arg_parser.add_argument(
+        "--adjacent_lines",
+        action="store_true",
+        help="If set, resolve conflicts on adjacent lines",
+    )
+    args = arg_parser.parse_args()
+    filename = args.filename
+
+    with open(filename) as file:
+        lines = file.readlines()
+
+    # Exit status 0 means no conflicts remain, 1 means some merge conflict remains.
+    conflicts_remain = False
+    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp:
+        file_len = len(lines)
+        i = 0
+        while i < file_len:
+            conflict = looking_at_conflict(filename, i, lines)
+            if conflict is None:
+                tmp.write(lines[i])
+                i = i + 1
+            else:
+                (base, parent1, parent2, num_lines) = conflict
+                merged = merge(
+                    base, parent1, parent2, args.java_imports, args.adjacent_lines
+                )
+                if merged is None:
+                    tmp.write(lines[i])
+                    i = i + 1
+                    conflicts_remain = True
+                else:
+                    for line in merged:
+                        tmp.write(line)
+                    i = i + num_lines
+
+    tmp.close()
+    shutil.copy(tmp.name, filename)
+    os.unlink(tmp.name)
+
+    if conflicts_remain:
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+
+def looking_at_conflict(  # pylint: disable=too-many-return-statements
+    filename: str, start_index: int, lines: List[str]
+) -> Union[None, Tuple[List[str], List[str], List[str], int]]:
+    """Tests whether the following text starts a conflict.
+    If not, returns None.
+    If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict)
+    where the first 3 elements of the tuple are lists of lines.
+ """ + + if not lines[start_index].startswith("<<<<<<<"): + return None + + base = [] + parent1 = [] + parent2 = [] + + num_lines = len(lines) + index = start_index + 1 + if index == num_lines: + return None + while not ( + lines[index].startswith("|||||||") or lines[index].startswith("=======") + ): + parent1.append(lines[index]) + index = index + 1 + if index == num_lines: + print( + "Starting at line " + + str(start_index) + + ", did not find ||||||| or ======= in " + + filename + ) + return None + if lines[index].startswith("|||||||"): + index = index + 1 + if index == num_lines: + print("File ends with |||||||: " + filename) + return None + while not lines[index].startswith("======="): + base.append(lines[index]) + index = index + 1 + if index == num_lines: + print( + "Starting at line " + + str(start_index) + + ", did not find ======= in " + + filename + ) + return None + assert lines[index].startswith("=======") + index = index + 1 # skip over "=======" line + if index == num_lines: + print("File ends with =======: " + filename) + return None + while not lines[index].startswith(">>>>>>>"): + parent2.append(lines[index]) + index = index + 1 + if index == num_lines: + print( + "Starting at line " + + str(start_index) + + ", did not find >>>>>>> in " + + filename + ) + return None + index = index + 1 + + return (base, parent1, parent2, index - start_index) + + +def merge( + base: List[str], + parent1: List[str], + parent2: List[str], + java_imports: bool, + adjacent_lines: bool, +) -> Union[List[str], None]: + """Given text for the base and two parents, return merged text. + + Args: + base: a list of lines + parent1: a list of lines + parent2: a list of lines + + Returns: + a list of lines, or None if it cannot do merging. + """ + + print(base, parent1, parent2) + + if java_imports: + if ( + all_import_lines(base) + and all_import_lines(parent1) + and all_import_lines(parent2) + ): + # A simplistic merge that retains all import lines in either parent. + return list(set(parent1 + parent2)).sort() + + if adjacent_lines: + adjacent_line_merge = merge_edits_on_different_lines(base, parent1, parent2) + if adjacent_line_merge is not None: + return adjacent_line_merge + + return None + + +def all_import_lines(lines: List[str]) -> bool: + """Return true if every given line is a Java import line or is blank.""" + return all(line.startswith("import ") or line.strip() == "" for line in lines) + + +def merge_edits_on_different_lines( + base, parent1: List[str], parent2: List[str] +) -> Union[List[str], None]: + """Return a merged version, if at most parent1 or parent2 edits each line. + Otherwise, return None. + """ + + print("Entered merge_edits_on_different_lines") + + ### No lines are added or removed, only modified. + base_len = len(base) + result = None + if base_len == len(parent1) and base_len == len(parent2): + result = [] + for base_line, parent1_line, parent2_line in itertools.zip_longest( + base, parent1, parent2 + ): + print("Considering line:", base_line, parent1_line, parent2_line) + if parent1_line == parent2_line: + result.append(parent1_line) + elif base_line == parent1_line: + result.append(parent2_line) + elif base_line == parent2_line: + result.append(parent1_line) + else: + result = None + break + print("merge_edits_on_different_lines =>", result) + if result is not None: + print("merge_edits_on_different_lines =>", result) + return result + + ### Deletions at the beginning or end. 
+    if base_len != 0:
+        result = merge_base_is_prefix_or_suffix(base, parent1, parent2)
+        if result is None:
+            result = merge_base_is_prefix_or_suffix(base, parent2, parent1)
+        if result is not None:
+            return result
+
+    ### Interleaved deletions, with an empty merge outcome.
+    if base_len != 0:
+        if issubsequence(parent1, base) and issubsequence(parent2, base):
+            return []
+
+    return result
+
+
+def merge_base_is_prefix_or_suffix(
+    base: List[str], parent1: List[str], parent2: List[str]
+) -> Union[List[str], None]:
+    """Special cases when the base is a prefix or suffix of parent1.
+    That is, parent1 is pure additions at the beginning or end of base. Parent2
+    deleted all the lines, possibly replacing them by something else. (We know
+    this because there is no common line in base and parent2. If there were, it
+    would also be in parent1, and the hunk would have been split into two at the
+    common line that's in all three texts.)
+    We know the relative position of the additions in parent1.
+    """
+    base_len = len(base)
+    parent1_len = len(parent1)
+    parent2_len = len(parent2)
+    if base_len < parent1_len:
+        if parent1[:base_len] == base:
+            return parent2 + parent1[base_len:]
+        if parent1[-base_len:] == base:
+            return parent1[:-base_len] + parent2
+    return None
+
+
+def issubsequence(s1: List[str], s2: List[str]) -> bool:
+    """Returns true if s1 is a subsequence of s2."""
+
+    # Iterative implementation.
+
+    n, m = len(s1), len(s2)
+    i, j = 0, 0
+    while i < n and j < m:
+        if s1[i] == s2[j]:
+            i += 1
+        j += 1
+
+    # If i reaches end of s1, we found all characters of s1 in s2,
+    # so s1 is a subsequence of s2.
+    return i == n
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/scripts/merge_tools/resolve-import-conflicts b/src/scripts/merge_tools/resolve-import-conflicts
index feaee8b83c..73db46473d 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts
+++ b/src/scripts/merge_tools/resolve-import-conflicts
@@ -1,12 +1,24 @@
 #!/bin/bash
+# bash, not POSIX sh, because of "readarray".
 
-# This script edits files to remove conflict markers related to Java imports.
-# It works on all files given on the command line;
-# if none are given, it works on all files in or under the current directory.
+# This script edits files in place to remove conflict markers related to Java
+# imports. For a given conflict that involves only `import` statements and
+# blank lines, the output includes every `import` statement that is in either
+# of the parents. This script leaves other conflicts untouched.
+
+# Usage:
+#   resolve-import-conflicts [file ...]
+#
+# The script works on all files given on the command line.
+# If none are given, the script works on all files in or under the current directory.
+#
+# The exit status code is 0 (success) if all conflicts are resolved in all the files.
+# The exit status code is 1 (failure) if any conflict remains.
 
 # This script is not a git mergetool. A git mergetool is given the base, parent 1, and
 # parent 2 files, all without conflict markers.
-# However, this script can be run instead of a git mergetool, or after a git mergetool.
+# However, this can be run after a git mergetool that leaves conflict markers
+# in files, as the default git mergetool does.
 
 if [ "$#" -eq 0 ] ; then
   readarray -t files < <(grep -l -r '^<<<<<<< HEAD' .)
@@ -19,7 +31,7 @@ SCRIPTDIR="$(cd "$(dirname "$0")" && pwd -P)"
 
 status=0
 for file in "${files[@]}" ; do
-  if ! "${SCRIPTDIR}"/resolve-import-conflicts-in-file.py "$file" ; then
+  if ! "${SCRIPTDIR}"/resolve-conflicts.py --java_imports "$file" ; then
     status=1
   fi
 done
diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
deleted file mode 100755
index 9c4f7878c0..0000000000
--- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
+++ /dev/null
@@ -1,149 +0,0 @@
-#! /usr/bin/env python
-
-"""Edits a file in place to remove conflict markers related to Java imports.
-It simplistically leaves all the imports that appear in either parent.
-"""
-
-
-# TODO: merge both scripts into one.
-
-import os
-import shutil
-import sys
-import tempfile
-
-if len(sys.argv) != 2:
-    print(
-        "resolve-import-conflicts-in-file: Provide exactly one command-line argument."
-    )
-    sys.exit(1)
-
-filename = sys.argv[1]
-with open(filename) as file:
-    lines = file.readlines()
-
-
-def all_import_lines(lines):
-    """Return true if every line is a Java import line."""
-    return all(line.startswith("import ") for line in lines)
-
-
-def merge(base, parent1, parent2):
-    """Given text for the base and two parents, return merged text.
-
-    This currently does a simplistic merge that retains all lines in either parent.
-
-    Args:
-        base: a list of lines
-        parent1: a list of lines
-        parent2: a list of lines
-
-    Returns:
-        a list of lines, or None if it cannot do merging.
-    """
-
-    if (
-        all_import_lines(base)
-        and all_import_lines(parent1)
-        and all_import_lines(parent2)
-    ):
-        return parent1 + parent2
-
-    return None
-
-
-def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
-    """Tests whether the text starting at line `start_index` is the beginning of a conflict.
-    If not, returns None.
-    If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict)
-    where the first 3 elements of the tuple are lists of lines.
-    Args:
-        start_index: an index into `lines`.
-        lines: all the lines of the file with name `filename`.
-    """
-
-    if not lines[start_index].startswith("<<<<<<<"):
-        return None
-
-    base = []
-    parent1 = []
-    parent2 = []
-
-    num_lines = len(lines)
-    index = start_index + 1
-    if index == num_lines:
-        return None
-    while not (
-        lines[index].startswith("|||||||") or lines[index].startswith("=======")
-    ):
-        parent1.append(lines[index])
-        index = index + 1
-        if index == num_lines:
-            print(
-                "Starting at line "
-                + start_index
-                + ", did not find ||||||| or ======= in "
-                + filename
-            )
-            return None
-    if lines[index].startswith("|||||||"):
-        index = index + 1
-        if index == num_lines:
-            print("File ends with |||||||: " + filename)
-            return None
-        while not lines[index].startswith("======="):
-            base.append(lines[index])
-            index = index + 1
-            if index == num_lines:
-                print(
-                    "Starting at line "
-                    + start_index
-                    + ", did not find ======= in "
-                    + filename
-                )
-                return None
-    assert lines[index].startswith("=======")
-    index = index + 1  # skip over "=======" line
-    if index == num_lines:
-        print("File ends with =======: " + filename)
-        return None
-    while not lines[index].startswith(">>>>>>>"):
-        parent2.append(lines[index])
-        index = index + 1
-        if index == num_lines:
-            print(
-                "Starting at line "
-                + start_index
-                + ", did not find >>>>>>> in "
-                + filename
-            )
-            return None
-    index = index + 1
-
-    return (base, parent1, parent2, index - start_index)
-
-
-## Main starts here.
-
-with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp:
-    file_len = len(lines)
-    i = 0
-    while i < file_len:
-        conflict = looking_at_conflict(i, lines)
-        if conflict is None:
-            tmp.write(lines[i])
-            i = i + 1
-        else:
-            (base, parent1, parent2, num_lines) = conflict
-            merged = merge(base, parent1, parent2)
-            if merged is None:
-                tmp.write(lines[i])
-                i = i + 1
-            else:
-                for line in merged:
-                    tmp.write(line)
-                i = i + num_lines
-
-tmp.close()
-shutil.copy(tmp.name, filename)
-os.unlink(tmp.name)
diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh
index 0f3a695f81..7748d5f2e2 100755
--- a/src/scripts/merge_tools/spork.sh
+++ b/src/scripts/merge_tools/spork.sh
@@ -14,6 +14,7 @@ if [ "$#" -ne 3 ]; then
 fi
 
 # Kill all Java processes that are running for over an hour (to avoid memory leaks).
+# Spork tends to create Java processes that don't terminate even when the parent process is killed.
 killall -9 java --older-than 1h
 
 SCRIPT_PATH="$(dirname "$0")"; SCRIPT_PATH="$(eval "cd \"$SCRIPT_PATH\" && pwd")"
diff --git a/src/scripts/run_repo_tests.sh b/src/scripts/run_repo_tests.sh
index 81cc0ab604..1d647d9b27 100755
--- a/src/scripts/run_repo_tests.sh
+++ b/src/scripts/run_repo_tests.sh
@@ -7,7 +7,8 @@ set -o nounset
 
-# Test side effects can be seen in the /tmp directory.
+# Test side effects, created by the testing infrastructure, can be seen in the
+# /tmp directory. We delete them to avoid filling up the disk.
 # We delete all the files older than 2h and owned by the current user.
 find /tmp -maxdepth 1 -user "$(whoami)" -mmin +120 -exec rm -rf {} \;
diff --git a/src/scripts/utils/run_multiple_machines.sh b/src/scripts/utils/run_multiple_machines.sh
index afe42f1724..3db4743239 100755
--- a/src/scripts/utils/run_multiple_machines.sh
+++ b/src/scripts/utils/run_multiple_machines.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Run the code for a certain branch on multiple remote machines.
+# Run the code of a specific branch of this repository on multiple remote machines.
 # Usage: ./run_multiple_machines.sh <branch> <machines_file>
 # <branch> is the branch to run
 # <machines_file> is the path to a file containing the addresses of the machines (one address per line)
diff --git a/src/scripts/utils/run_remotely.sh b/src/scripts/utils/run_remotely.sh
index 44b60dd87c..86f9751ef8 100755
--- a/src/scripts/utils/run_remotely.sh
+++ b/src/scripts/utils/run_remotely.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-# Run the code for a certain branch on a remote machine.
+# Run the code of a specific branch of this repository on a remote machine.
 # Usage: ./run_remotely.sh <branch> <machine_address>
 # <branch> is the branch to run
 # <machine_address> is the ssh address of the machine
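
Reviewer note: the minimal sketch below shows how the revised cache API in src/python/cache_utils.py is intended to be used by a worker process. The cache directory, repository slug, and the compute step are illustrative placeholders, not code from this PR.

    from pathlib import Path

    from cache_utils import lookup_in_cache, set_in_cache

    CACHE_DIR = Path("cache/merge_results")  # illustrative; run.sh passes --cache_dir
    REPO_SLUG = "owner/reponame"             # illustrative repository slug

    def cached_compute(cache_key: str) -> dict:
        """Return the cached entry for cache_key, computing and storing it if absent."""
        # lookup_in_cache(set_run=True) writes a None placeholder under the
        # per-repository lock when the key is missing, so concurrent workers
        # that look up the same key poll (with backoff) instead of recomputing.
        # delete_cache_placeholders.py removes placeholders left behind by an
        # interrupted run.
        entry = lookup_in_cache(cache_key, REPO_SLUG, CACHE_DIR, set_run=True)
        if isinstance(entry, dict):
            return entry
        result = {"status": "computed"}  # stand-in for the real computation
        set_in_cache(cache_key, result, REPO_SLUG, CACHE_DIR)
        return result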
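Reviewer note: the adjacent-lines heuristic added in resolve-conflicts.py can be hard to follow in diff form. The self-contained sketch below restates its equal-length case with a worked example; it is an illustration under simplified assumptions, not the script's exact code.

    from typing import List, Optional

    def merge_line_edits(
        base: List[str], parent1: List[str], parent2: List[str]
    ) -> Optional[List[str]]:
        """Merge equal-length hunks line by line: keep each line from the
        parent that changed it; give up when both parents changed the same
        line differently (a genuine conflict)."""
        if not len(base) == len(parent1) == len(parent2):
            return None  # the full script also handles prefix/suffix deletions
        merged = []
        for b, p1, p2 in zip(base, parent1, parent2):
            if p1 == p2:
                merged.append(p1)  # both parents agree
            elif b == p1:
                merged.append(p2)  # only parent 2 edited this line
            elif b == p2:
                merged.append(p1)  # only parent 1 edited this line
            else:
                return None  # both parents edited the same line
        return merged

    # Parent 1 edits the first line, parent 2 edits the last: both edits are kept.
    assert merge_line_edits(
        ["a\n", "b\n", "c\n"],
        ["a1\n", "b\n", "c\n"],
        ["a\n", "b\n", "c2\n"],
    ) == ["a1\n", "b\n", "c2\n"]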