From 61cc17fae069c52d8a90809724a41acb61e9d8ae Mon Sep 17 00:00:00 2001
From: Michael Ernst
Date: Tue, 12 Sep 2023 11:25:06 -0700
Subject: [PATCH 1/7] Mike's code review comments

---
 src/python/cache_utils.py                    | 21 +++++++++++++++++++
 src/python/clean_cache_placeholders.py       |  4 ++++
 src/python/latex_output.py                   |  4 ++++
 src/python/merge_differ.py                   | 11 ++++++++--
 src/python/merge_tester.py                   |  4 ++++
 src/python/merge_tools_comparator.py         |  5 +++++
 src/python/repo.py                           | 11 ++++++++++
 src/python/test_repo_head.py                 |  1 +
 src/python/variables.py                      |  1 +
 src/scripts/merge_tools/intellimerge.sh      |  1 +
 .../resolve-import-conflicts-in-file.py      |  1 +
 src/scripts/merge_tools/spork.sh             |  3 ++-
 src/scripts/run_repo_tests.sh                |  2 ++
 src/scripts/utils/run_multiple_machines.sh   |  1 +
 14 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
index 29548dfdde..2c3f469634 100755
--- a/src/python/cache_utils.py
+++ b/src/python/cache_utils.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
 """ Contains all the functions related to the caches.
+TODO: Elsewhere, the code speaks of "the cache" (example: get_cache_lock), but here it speaks of 4 caches. I'm confused about the relationship.
+TODO: Are there 4 caches overall, or 4 caches per repository, or something else? Please clarify.
+TODO: "after running the script": Which script?
 There will be 4 caches which are stored on disk after running the script:
+TODO: Is the cache directory literally named "cache"? Where is it on disk?
+TODO: Are all the caches JSON files?
 1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
 2) cache/test_cache: A cache that maps a sha256 to test results.
 3) cache/merge_results:A cache that maps a merge to the result
@@ -17,6 +22,9 @@
 CACHE_BACKOFF_TIME = 2 * 60  # 2 minutes, in seconds
 TIMEOUT = 90 * 60  # 90 minutes, in seconds
 
+# TODO: In this file, please place the externally-visible or exported functions
+# first, and then a comment marker of some kind, and then the "private" ones.
+
 
 def slug_repo_name(repo_slug: str) -> str:
     """Given a GitHub repository slug (owner/reponame), returns the reponame.
@@ -28,6 +36,9 @@ def slug_repo_name(repo_slug: str) -> str:
     return repo_slug.split("/")[1]
 
 
+# TODO: Throughout, please use "_directory" instead of "_prefix" in
+# variable names, when the variable actually holds a directory. I
+# feel this will be clearer.
 def get_cache_lock(repo_slug: str, cache_prefix: Path):
     """Returns a lock for the cache of a repository.
     Initially the lock is unlocked; the caller must explictly
     lock and unlock the lock.
@@ -74,6 +85,10 @@ def is_in_cache(
     cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
 ) -> bool:
 
 
 def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
+    # TODO: "the cache": which one? Also, does this load a cache or a cache
+    # entry? Please define those terms to help the reader. This method returns
+    # a cache, which is a dict. `cache_lookup` returns a cache entry, which is
+    # also a dict. What are they dicts from and to?
     """Loads the cache.
     Args:
         repo_slug (str): The slug of the repository, which is "owner/reponame".
@@ -104,6 +119,10 @@ def cache_lookup(
     return cache[cache_key]
 
 
+# TODO: The distinction between write_to_cache and set_in_cache is not clear to
+# me. If it is only locking, then please give them the same name, but with one
+# suffixed by "_without_lock". Also please indicate under what circumstances
+# each should be called.
 def write_to_cache(
     cache_key: Union[Tuple, str],
     cache_value: Union[str, dict, None],
     repo_slug: str,
@@ -156,6 +175,8 @@ def set_in_cache(
     lock.release()
 
 
+# TODO: I'm not clear what is the difference between `cache_lookup` and
+# `lookup_in_cache`. Please clarify, and indicate when each should be called.
 def lookup_in_cache(
     cache_key: Union[Tuple, str],
     repo_slug: str,
diff --git a/src/python/clean_cache_placeholders.py b/src/python/clean_cache_placeholders.py
index bf0f4b5cd1..c2d861c7b3 100755
--- a/src/python/clean_cache_placeholders.py
+++ b/src/python/clean_cache_placeholders.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# TODO: This file uses "clean", "remove", and "delete". Please choose one term
+# and stick to it, in both the code and in the file name.
 """
 Deletes all placeholders from the cache.
 Placeholders are created when a a process starts; it indicates that is has started and is still running.
@@ -9,6 +11,8 @@
     python clean_cache_placeholders.py --cache_path
 
 Args:
+TODO: Throughout, please don't use "_path" as a variable suffix, because it is
+# not sufficiently precise. Instead, use either "_file" or "_directory".
     cache_path (str): Path to cache directory
 """
 
diff --git a/src/python/latex_output.py b/src/python/latex_output.py
index c28d827a03..200a07cdd6 100755
--- a/src/python/latex_output.py
+++ b/src/python/latex_output.py
@@ -16,6 +16,7 @@
 - repos_head_passes_csv: csv file containing the list of valid repositories
 - tested_merges_path: path to the folder containing the merge results
 - merges_path: path to the folder containing all found merges.
+TODO: Throughout, be consistent about "directory" vs "folder".
 - output_dir: path to the folder where the LaTeX files will be saved
 """
 
@@ -92,6 +93,9 @@ def latex_def(name, value) -> str:
     ],
 }
 
+# TODO: I suggest that these should be declared in repo.py, or move the enums to
+# a different file where this can also be defined. Or, define predicates for
+# clients to use rather than exporting these lists.
 MERGE_CORRECT_NAMES = [
     TEST_STATE.Tests_passed.name,
 ]
diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py
index bd2de8db4a..b85ad2c9ee 100755
--- a/src/python/merge_differ.py
+++ b/src/python/merge_differ.py
@@ -4,7 +4,9 @@
 --merges_path
 --cache_dir
 This script compares the results of two merge tools on the same merge.
-It outputs the diff any two merge results if their results differ.
+# TODO: Where is the output? Standard out?
+It outputs the diff of the two merge results if their results differ.
+# TODO: Does "ignores" mean there is no output, or is it something deeper?
 It ignores merges that have the same result or merges that are not successful.
 """
 
@@ -78,11 +80,13 @@ def get_merge_fingerprint(
 
 
 def merge_differ(args: Tuple[pd.Series, Path]) -> None:
-    """Diffs the results of any two merge tools on the same merge.
+    # TODO: What does this funtion do with the diff? I think it writes to a file? Or does it populate a cache too?
+    """Diffs the results of every two merge tools on the same merge.
     Does not diff merges that have the same result or merges that are not successful.
     Args:
         args (Tuple[pd.Series,Path]): A tuple containing the merge data and the cache prefix.
     Returns:
+        TODO: How is "The result of the test." related to the diff of two merge tools? I don't see this function returning anything.
         dict: The result of the test.
""" merge_data, cache_prefix = args @@ -90,6 +94,7 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None: if not merge_data["parents pass"]: return + # TODO: See other comment about "_prefix" in variable names; change throughout. diff_file_prefix = cache_prefix / "merge_diffs" diff_file_prefix.mkdir(parents=True, exist_ok=True) @@ -131,6 +136,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path: Path: The name of the diff file. """ if sha1 < sha2: + # TODO: Why does this use ".txt" rather than ".diff" as the file extension? return Path(sha1 + "_" + sha2 + ".txt") return Path(sha2 + "_" + sha1 + ".txt") @@ -143,6 +149,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path: parser.add_argument("--cache_dir", type=Path, default="cache/") args = parser.parse_args() cache_dir = Path(args.cache_dir) + # TODO: See comment elsewhere about "_path" in variable names. cache_diffs_path = cache_dir / "merge_diffs" cache_diffs_path.mkdir(parents=True, exist_ok=True) diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index 4f239c367d..984bbd32a0 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -54,6 +54,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: print("merge_tester: Started ", repo_slug, merge_data["left"], merge_data["right"]) merge_data["parents pass"] = False + # TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.) for branch in ["left", "right"]: repo = Repository(repo_slug, cache_prefix=cache_prefix) repo.checkout(merge_data[branch]) @@ -73,12 +74,14 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data[f"{branch} test result"] = test_result.name if test_result != TEST_STATE.Tests_passed: return merge_data + # TODO: It looks suspicious to me that the repo is only sometimes, not always, deleted. del repo merge_data["parents pass"] = True for merge_tool in MERGE_TOOL: if is_merge_sucess(merge_data[merge_tool.name]): + # TODO: I suggest abstracting the body of this loop into a separate function, since it stands on its own logically. repo = Repository(repo_slug, cache_prefix=cache_prefix) ( result, @@ -126,6 +129,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx") + # TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test"). print("merge_tester: Started listing merges to test") merge_tester_arguments = [] for _, repository_data in tqdm(repos.iterrows(), total=len(repos)): diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py index 77d32a75fe..5bc13072ae 100755 --- a/src/python/merge_tools_comparator.py +++ b/src/python/merge_tools_comparator.py @@ -53,6 +53,8 @@ def merger( # pylint: disable=too-many-locals """ repo_slug, merge_data, cache_prefix = args + # TODO: I think this is a key, not an entry. Please clarify. + # TODO: Why are these two SHAs (if they are SHAs) not sorted as is done elsewhere? cache_entry = merge_data["left"] + "_" + merge_data["right"] merge_cache_prefix = cache_prefix / "merge_results" @@ -73,6 +75,7 @@ def merger( # pylint: disable=too-many-locals ) cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []} for i in range(N_MERGES): + # TODO: I suggest abstracting out the body of this loop as a function. 
             repo = Repository(repo_slug, cache_prefix=cache_prefix)
             (
                 merge_status,
@@ -152,6 +155,7 @@ def merger(  # pylint: disable=too-many-locals
     print("merge_tools_comparator: Constructing Inputs")
     merger_arguments = []
     for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
+        # TODO: I suggest abstracting out the body of this loop as a function.
         merges_repo = []
         repo_slug = repository_data["repository"]
         merge_list_file = Path(
@@ -225,6 +229,7 @@ def merger(  # pylint: disable=too-many-locals
         merge_data = merger_arguments[i][1]
         cache_data = merger_results[i]
         two_merge_tools_differ = False
+        # TODO: I suggest abstracting this loop into a function. That will also make it easier to return early, which can be done just with "return". (Currently it makes more iterations even after discovering that two merge tools do differ.)
         for merge_tool1 in MERGE_TOOL:
             for merge_tool2 in MERGE_TOOL:
                 if is_merge_sucess(
diff --git a/src/python/repo.py b/src/python/repo.py
index 41d9265f58..97b9392ec3 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -78,9 +78,11 @@ def compute_explanation(
 
 def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]:
     """Returns the result of run_repo_tests.sh on the given working copy.
+    TODO: What is the definition of "the entire test"? How is it different than the test process?
     If the test process passes then the entire test is marked as passed.
     If the test process timeouts then the entire test is marked as timeout.
     Args:
+        TODO: Minor: Throughout, you don't need "The path of" at the beginning of comments. Or change it to "The directory of".
         wcopy_dir (Path): The path of the working copy (the clone).
         timeout (int): Test timeout limit, in seconds.
     Returns:
@@ -190,6 +192,10 @@ def _merge_and_test(  # pylint: disable=too-many-arguments
         str: The tree fingerprint of the merge result.
         str: The left fingerprint.
         str: The right fingerprint.
+        TODO: Is this time to merge, time to test, or time to merge and test?
+        TODO: This is a ssingle number, but n_tests tells how many times to
+        TODO: run the test. Is this an average, median, overall, or something
+        TODO: else? (I have the same question elsewhere.)
         float: The time it took to run the merge, in seconds.
     """
     (
         merge_status,
@@ -415,12 +421,14 @@ def compute_tree_fingerprint(self) -> str:
     def get_sha_cache_entry(
         self, commit: str, start_merge: bool = False
     ) -> Union[None, dict]:
+        # TODO: is a SHA cache a cache from a SHA to something, or a cache from something to a SHA?
         """Gets a SHA cache entry.
         Args:
             commit (str): The commit to check.
             start_merge (bool, optional) = False: Whether to indicate that the merge starts
                 if the commit is not in the cache.
         Returns:
+            # TODO: What is the structure of the dict this returns?
             Union[None,dict]: The cache entry if the commit is in the cache, None otherwise.
         """
         cache = lookup_in_cache(
@@ -501,7 +509,10 @@ def checkout_and_test(
     def test(self, timeout: int, n_tests: int) -> TEST_STATE:
         """Tests the repository. The test results of multiple runs is combined into one result.
         If one of the runs passes then the entire test is marked as passed.
+        # TODO: Should this line be preceded by "otherwise"? Is it possible that one run passes and another times out?
+        # TODO: I would think the test should be marked as timeout only if every run is timeout.
         If one of the runs timeouts then the entire test is marked as timeout.
+        # TODO: Otherwise the test is marked as failure?
         Args:
             timeout (int): The timeout limit, in seconds.
             n_tests (int): The number of times to run the test suite.
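The TODOs above concern the rule for combining multiple test runs into one result. As PATCH 3/7 below clarifies, the intended rule is: one passing run marks the whole test as passed, one timed-out run marks it as timed out, and the test is marked as failed only when every run fails. A minimal sketch of that rule, separate from the patch itself (it assumes only the TEST_STATE names used above and is not the repository's actual implementation):

from enum import Enum
from typing import List

class TEST_STATE(Enum):
    # Assumed subset of the repository's TEST_STATE enum.
    Tests_passed = "Tests_passed"
    Tests_timedout = "Tests_timedout"
    Tests_failed = "Tests_failed"

def combine_test_results(runs: List[TEST_STATE]) -> TEST_STATE:
    """Combines several runs into one result: a pass wins over a timeout,
    and failure is reported only when every run failed."""
    if TEST_STATE.Tests_passed in runs:
        return TEST_STATE.Tests_passed
    if TEST_STATE.Tests_timedout in runs:
        return TEST_STATE.Tests_timedout
    return TEST_STATE.Tests_failed

# Example: one pass among three runs yields an overall pass.
assert combine_test_results(
    [TEST_STATE.Tests_failed, TEST_STATE.Tests_passed, TEST_STATE.Tests_failed]
) == TEST_STATE.Tests_passed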
diff --git a/src/python/test_repo_head.py b/src/python/test_repo_head.py
index a8a9851b97..61183ca016 100755
--- a/src/python/test_repo_head.py
+++ b/src/python/test_repo_head.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# TODO: What does it mean to "validate it"?
 """Tests the HEAD of a repo and validates it if the test passes.
 
 usage: python3 test_repo_head.py --repos_csv_with_hashes
diff --git a/src/python/variables.py b/src/python/variables.py
index c01dce0889..302548dc0a 100644
--- a/src/python/variables.py
+++ b/src/python/variables.py
@@ -9,6 +9,7 @@
 CACHE_BACKOFF_TIME = 2 * 60  # 2 minutes, in seconds
 DELETE_WORKDIRS = True
 REPOS_PATH = Path("repos")
+# TODO: What is this used for? Please give it a more descriptive name.
 WORKDIR_PREFIX = Path(".workdir")
 TIMEOUT_MERGING = 60 * 15  # 15 minutes, in seconds
 
diff --git a/src/scripts/merge_tools/intellimerge.sh b/src/scripts/merge_tools/intellimerge.sh
index adc8370865..374a8e87c7 100755
--- a/src/scripts/merge_tools/intellimerge.sh
+++ b/src/scripts/merge_tools/intellimerge.sh
@@ -21,6 +21,7 @@ intellimerge_absolutepath="${ROOT_DIR}/${intellimerge_relativepath}"
 clone_dir=$1
 branch1=$2
 branch2=$3
+# TODO: Will this be in the repository? Would it be safer to put it under /tmp or /scratch?
 temp_dir=".workdir/intelli_temp_$$/"
 mkdir $temp_dir
 
diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
index 539b461f28..61f2b59681 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
+++ b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
@@ -1,4 +1,5 @@
 #! /usr/bin/env python
+# TODO: Benedikt, could you review and improve this file? I'm sure it is not perfect Python style.
 """Edits a file in place to remove conflict markers related to Java imports."""
 
diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh
index 0fbe3f1a85..337fafb083 100755
--- a/src/scripts/merge_tools/spork.sh
+++ b/src/scripts/merge_tools/spork.sh
@@ -13,7 +13,8 @@ if [ "$#" -ne 3 ]; then
   exit 1
 fi
 
-# Kill all java processes that are running for over an hour (to avoid memory leaks)
+# TODO: This script is an odd place to put this `killall` command. Should it be elsewhere in the experimental infrastructure?
+# Kill all Java processes that are running for over an hour (to avoid memory leaks).
 killall -9 java --older-than 1h
 
 SCRIPT_PATH="$(dirname "$0")"; SCRIPT_PATH="$(eval "cd \"$SCRIPT_PATH\" && pwd")"
diff --git a/src/scripts/run_repo_tests.sh b/src/scripts/run_repo_tests.sh
index 8948f987a1..8c289cf77b 100755
--- a/src/scripts/run_repo_tests.sh
+++ b/src/scripts/run_repo_tests.sh
@@ -7,6 +7,8 @@
 set -o nounset
 
+# TODO: This is too agressive and will interfere with the user's other work.
+# TODO: Instead, create a directory under /tmp (or, better, under /tmp/$USER) for the AST merging experiments, and clean it.
 # Test side effects can be seen in the /tmp directory.
 # We delete all the files older than 2h and owned by the current user.
 find /tmp -maxdepth 1 -user "$(whoami)" -mmin +120 -exec rm -rf {} \;
 
diff --git a/src/scripts/utils/run_multiple_machines.sh b/src/scripts/utils/run_multiple_machines.sh
index afe42f1724..2c2b7a5351 100755
--- a/src/scripts/utils/run_multiple_machines.sh
+++ b/src/scripts/utils/run_multiple_machines.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# TODO: A branch of what? This repository? (Same question in `run_remotely.sh`.)
 # Run the code for a certain branch on multiple remote machines.
 # Usage: ./run_multiple_machines.sh
 # is the branch to run

From 0ecfc371ce722b96400f3f0c8da3473ee1132ce0 Mon Sep 17 00:00:00 2001
From: Benedikt schesch
Date: Sun, 24 Sep 2023 12:06:26 -0700
Subject: [PATCH 2/7] Comment update

---
 src/python/cache_utils.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
index 2c3f469634..0868d89cab 100755
--- a/src/python/cache_utils.py
+++ b/src/python/cache_utils.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
-""" Contains all the functions related to the caches.
-TODO: Elsewhere, the code speaks of "the cache" (example: get_cache_lock), but here it speaks of 4 caches. I'm confused about the relationship.
-TODO: Are there 4 caches overall, or 4 caches per repository, or something else? Please clarify.
-TODO: "after running the script": Which script?
-There will be 4 caches which are stored on disk after running the script:
-TODO: Is the cache directory literally named "cache"? Where is it on disk?
-TODO: Are all the caches JSON files?
+""" Contains all the functions related to the caches. The functions to interact with each
+of the caches are in this file. Each cache is interacted with through the functions
+of this file. The caches are all JSON files and are stored in the cache directory.
+There will be 4 caches in total which are stored on disk after running the run.sh script:
 1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
 2) cache/test_cache: A cache that maps a sha256 to test results.
 3) cache/merge_results:A cache that maps a merge to the result

From 606b16c5572ad5bc7db17ed37a75dcf0150e41cc Mon Sep 17 00:00:00 2001
From: Benedikt schesch
Date: Sun, 24 Sep 2023 13:34:56 -0700
Subject: [PATCH 3/7] Improvements

---
 run.sh                                        |   2 +-
 src/python/cache_utils.py                     | 211 +++++++-----------
 ...olders.py => delete_cache_placeholders.py} |  12 +-
 src/python/latex_output.py                    |  12 +-
 src/python/merge_differ.py                    |  35 ++-
 src/python/merge_tester.py                    |  10 +-
 src/python/merge_tools_comparator.py          |  63 +++---
 src/python/repo.py                            |  55 ++---
 src/python/test_repo_heads.py                 |   5 +-
 src/python/variables.py                       |   5 +-
 src/scripts/merge_tools/intellimerge.sh       |   3 +-
 src/scripts/utils/run_multiple_machines.sh    |   3 +-
 src/scripts/utils/run_remotely.sh             |   2 +-
 13 files changed, 175 insertions(+), 243 deletions(-)
 rename src/python/{clean_cache_placeholders.py => delete_cache_placeholders.py} (65%)

diff --git a/run.sh b/run.sh
index 9b1c1594a9..bc592a5872 100755
--- a/run.sh
+++ b/run.sh
@@ -73,7 +73,7 @@ if [ -d "$CACHE_DIR" ]; then
   find "$CACHE_DIR" -name "*.lock" -delete
 fi
 
-python3 src/python/clean_cache_placeholders.py \
+python3 src/python/delete_cache_placeholders.py \
     --cache_dir "$CACHE_DIR"
 
 python3 src/python/write_head_hashes.py \
diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
index b7501a9e6a..2ef4864ca6 100755
--- a/src/python/cache_utils.py
+++ b/src/python/cache_utils.py
@@ -19,9 +19,6 @@
 CACHE_BACKOFF_TIME = 2 * 60  # 2 minutes, in seconds
 TIMEOUT = 90 * 60  # 90 minutes, in seconds
 
-# TODO: In this file, please place the externally-visible or exported functions
-# first, and then a comment marker of some kind, and then the "private" ones.
-
 
 def slug_repo_name(repo_slug: str) -> str:
     """Given a GitHub repository slug ("owner/reponame"), returns the reponame.
@@ -33,99 +30,13 @@ def slug_repo_name(repo_slug: str) -> str:
     return repo_slug.split("/")[1]
 
 
-# TODO: Throughout, please use "_directory" instead of "_prefix" in
-# variable names, when the variable actually holds a directory. I
-# feel this will be clearer.
-def get_cache_lock(repo_slug: str, cache_prefix: Path):
-    """Returns a lock for the cache of a repository.
-    Initially the lock is unlocked; the caller must explictly
-    lock and unlock the lock.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        fasteners.InterProcessLock: A lock for the repository.
-    """
-    lock_path = cache_prefix / "locks" / (slug_repo_name(repo_slug) + ".lock")
-    lock_path.parent.mkdir(parents=True, exist_ok=True)
-    lock = fasteners.InterProcessLock(lock_path)
-    return lock
-
-
-def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
-    """Returns the path to the cache file.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        Path: The path to the cache file.
-    """
-    cache_file_name = slug_repo_name(repo_slug) + ".json"
-    cache_path = cache_prefix / cache_file_name
-    cache_path.parent.mkdir(parents=True, exist_ok=True)
-    return cache_path
-
-
-def is_in_cache(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> bool:
-    """Checks if the key is in the cache.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path prefix to the cache directory.
-    Returns:
-        bool: True if the repository is in the cache, False otherwise.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache_key in cache
-
-
-def load_cache(repo_slug: str, cache_prefix: Path) -> dict:
-    # TODO: "the cache": which one? Also, does this load a cache or a cache
-    # entry? Please define those terms to help the reader. This method returns
-    # a cache, which is a dict. `cache_lookup` returns a cache entry, which is
-    # also a dict. What are they dicts from and to?
-    """Loads the cache.
-    Args:
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    if not cache_path.exists():
-        return {}
-    with open(cache_path, "r") as f:
-        cache_data = json.load(f)
-        return cache_data
-
-
-def cache_lookup(
-    cache_key: Union[Tuple, str], repo_slug: str, cache_prefix: Path
-) -> dict:
-    """Loads the cache and returns the value for the given key.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-    Returns:
-        dict: The cache.
-    """
-    cache = load_cache(repo_slug, cache_prefix)
-    return cache[cache_key]
-
-
-# TODO: The distinction between write_to_cache and set_in_cache is not clear to
-# me. If it is only locking, then please give them the same name, but with one
-# suffixed by "_without_lock". Also please indicate under what circumstances
-# each should be called.
-def write_to_cache(
+def set_in_cache(
     cache_key: Union[Tuple, str],
     cache_value: Union[str, dict, None],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     overwrite: bool = True,
+    acquire_lock: bool = True,
 ) -> None:
     """Puts an entry in the cache, then writes the cache to disk.
     This function is not thread-safe.
     Args:
         cache_key (Union[Tuple,str]): The key to check.
         cache_value (dict): The value to write.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
             If False, attempting to overwrite an existing cache entry throws an exception.
     """
-    cache_path = get_cache_path(repo_slug, cache_prefix)
-    cache = load_cache(repo_slug, cache_prefix)
-    if cache_prefix != Path("cache-small/sha_cache_entry"):
-        print("write_to_cache:", cache_key, cache_value, repo_slug, cache_prefix)
+    if acquire_lock:
+        lock = get_cache_lock(repo_slug, cache_directory)
+        lock.acquire()
+    cache_path = get_cache_path(repo_slug, cache_directory)
+    cache = load_cache(repo_slug, cache_directory)
     if cache_key in cache and not overwrite:
         raise ValueError("Cache key already exists")
     cache[cache_key] = cache_value
     output = json.dumps(cache, indent=4)
     with open(cache_path, "w") as f:
         f.write(output)
         f.flush()
+    if acquire_lock:
+        lock.release()  # type: ignore
 
 
-def set_in_cache(
-    cache_key: Union[Tuple, str],
-    cache_value: Union[str, dict, None],
-    repo_slug: str,
-    cache_prefix: Path,
-    overwrite: bool = True,
-) -> None:
-    """Puts an entry in the cache, then writes the cache to disk.
-    This function is thread-safe.
-    Args:
-        cache_key (Union[Tuple,str]): The key to check.
-        cache_value (dict): The value to write.
-        repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite the cache if it already exists.
-    """
-    lock = get_cache_lock(repo_slug, cache_prefix)
-    lock.acquire()
-    write_to_cache(cache_key, cache_value, repo_slug, cache_prefix, overwrite)
-    lock.release()
-
-
-# TODO: I'm not clear what is the difference between `cache_lookup` and
-# `lookup_in_cache`. Please clarify, and indicate when each should be called.
 def lookup_in_cache(
     cache_key: Union[Tuple, str],
     repo_slug: str,
-    cache_prefix: Path,
+    cache_directory: Path,
     set_run: bool = False,
 ) -> Union[str, dict, None]:
     """Checks if the cache is available and loads a specific entry.
     Args:
         cache_key (Union[Tuple,str]): The key to check.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
-        cache_prefix (Path): The path to the cache directory.
+        cache_directory (Path): The path to the cache directory.
         set_run (bool, optional) = False: Wheter to insert an empty cache entry
             if it does not exist. This is useful for preventing multiple runs from
             attempting to insert the same cache entry.
     Returns:
         Union[dict,None]: The cache entry if it exists, None otherwise.
""" - lock = get_cache_lock(repo_slug, cache_prefix) + lock = get_cache_lock(repo_slug, cache_directory) lock.acquire() - cache_entry = get_cache_path(repo_slug, cache_prefix) + cache_entry = get_cache_path(repo_slug, cache_directory) cache_entry.parent.mkdir(parents=True, exist_ok=True) - if is_in_cache(cache_key, repo_slug, cache_prefix): + if is_in_cache(cache_key, repo_slug, cache_directory): total_time = 0 while True: - cache_data = cache_lookup(cache_key, repo_slug, cache_prefix) + cache = load_cache(repo_slug, cache_directory) + cache_data = cache[cache_key] if cache_data is not None: break lock.release() @@ -210,6 +101,70 @@ def lookup_in_cache( lock.release() return cache_data if set_run: - write_to_cache(cache_key, None, repo_slug, cache_prefix) + set_in_cache(cache_key, None, repo_slug, cache_directory, acquire_lock=False) lock.release() return None + + +# ====================== Internal functions ====================== + + +def get_cache_lock(repo_slug: str, cache_directory: Path): + """Returns a lock for the cache of a repository. + Initially the lock is unlocked; the caller must explictly + lock and unlock the lock. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path to the cache directory. + Returns: + fasteners.InterProcessLock: A lock for the repository. + """ + lock_path = cache_directory / "locks" / (slug_repo_name(repo_slug) + ".lock") + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock = fasteners.InterProcessLock(lock_path) + return lock + + +def get_cache_path(repo_slug: str, cache_directory: Path) -> Path: + """Returns the path to the cache file. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path to the cache directory. + Returns: + Path: The path to the cache file. + """ + cache_file_name = slug_repo_name(repo_slug) + ".json" + cache_path = cache_directory / cache_file_name + cache_path.parent.mkdir(parents=True, exist_ok=True) + return cache_path + + +def is_in_cache( + cache_key: Union[Tuple, str], repo_slug: str, cache_directory: Path +) -> bool: + """Checks if the key is in the cache. + Args: + cache_key (Union[Tuple,str]): The key to check. + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path prefix to the cache directory. + Returns: + bool: True if the repository is in the cache, False otherwise. + """ + cache = load_cache(repo_slug, cache_directory) + return cache_key in cache + + +def load_cache(repo_slug: str, cache_directory: Path) -> dict: + """Loads the cache associated to the repo_slug found in the cache directory. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + cache_directory (Path): The path to the cache directory. + Returns: + dict: The cache. + """ + cache_path = get_cache_path(repo_slug, cache_directory) + if not cache_path.exists(): + return {} + with open(cache_path, "r") as f: + cache_data = json.load(f) + return cache_data diff --git a/src/python/clean_cache_placeholders.py b/src/python/delete_cache_placeholders.py similarity index 65% rename from src/python/clean_cache_placeholders.py rename to src/python/delete_cache_placeholders.py index c2d861c7b3..22acd35c11 100755 --- a/src/python/clean_cache_placeholders.py +++ b/src/python/delete_cache_placeholders.py @@ -1,6 +1,4 @@ #!/usr/bin/env python3 -# TODO: This file uses "clean", "remove", and "delete". 
Please choose one term -# and stick to it, in both the code and in the file name. """ Deletes all placeholders from the cache. Placeholders are created when a a process starts; it indicates that is has started and is still running. @@ -8,12 +6,10 @@ result. This script deletes all placeholders from the cache. Usage: - python clean_cache_placeholders.py --cache_path + python delete_cache_placeholders.py --cache_directory Args: -TODO: Throughout, please don't use "_path" as a variable suffix, because it is -# not sufficiently precise. Instead, use either "_file" or "_directory". - cache_path (str): Path to cache directory + cache_directory (str): Path to cache directory """ from argparse import ArgumentParser @@ -27,10 +23,10 @@ ) args = parser.parse_args() - cache_path = Path(args.cache_dir) + cache_directory = Path(args.cache_dir) n_deleted = 0 - for file in cache_path.glob("**/*.json"): + for file in cache_directory.glob("**/*.json"): with open(file, "r") as f: data = json.load(f) diff --git a/src/python/latex_output.py b/src/python/latex_output.py index ef75f08243..54b01bedff 100755 --- a/src/python/latex_output.py +++ b/src/python/latex_output.py @@ -14,10 +14,9 @@ following input files: - full_repos_csv: csv file containing the full list of repositories - repos_head_passes_csv: csv file containing the list of repositories whose head passes tests -- tested_merges_path: path to the folder containing the merge results -- merges_path: path to the folder containing all found merges. -TODO: Throughout, be consistent about "directory" vs "folder". -- output_dir: path to the folder where the LaTeX files will be saved +- tested_merges_path: path to the directory containing the merge results +- merges_path: path to the directory containing all found merges. +- output_dir: path to the directory where the LaTeX files will be saved """ @@ -77,7 +76,7 @@ def latex_def(name, value) -> str: # Dictonary that lists the different subsets of merge tools for which plots -# and tables are generated. The key is the folder name which will contain all figures +# and tables are generated. The key is the directory name which will contain all figures # that will be used and the value is the list of plots to contain. PLOTS = { "all": [merge_tool.name for merge_tool in MERGE_TOOL], @@ -93,9 +92,6 @@ def latex_def(name, value) -> str: ], } -# TODO: I suggest that these should be declared in repo.py, or move the enums to -# a different file where this can also be defined. Or, define predicates for -# clients to use rather than exporting these lists. MERGE_CORRECT_NAMES = [ TEST_STATE.Tests_passed.name, ] diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py index bc36bd7cb6..574eddef00 100755 --- a/src/python/merge_differ.py +++ b/src/python/merge_differ.py @@ -4,10 +4,10 @@ --merges_path --cache_dir This script compares the results of two merge tools on the same merge. -# TODO: Where is the output? Standard out? It outputs the diff of the two merge results if their results differ. -# TODO: Does "ignores" mean there is no output, or is it something deeper? -It ignores merges that have the same result or merges that are not successful. +It does not produce output for merges that have the same result or merges that +are not successful. +The output is written in cache_dir/merge_diffs. 
""" import os @@ -32,7 +32,7 @@ def get_merge_fingerprint( - merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path + merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_directory: Path ) -> Union[Tuple[None, None], Tuple[Repository, str]]: """Returns the repo and the fingerprint of a merge, or (None, None) if the merge is not successful. @@ -51,7 +51,7 @@ def get_merge_fingerprint( return None, None left = merge_data["left"] right = merge_data["right"] - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( merge_status, merge_fingerprint, @@ -81,39 +81,34 @@ def get_merge_fingerprint( def merge_differ(args: Tuple[pd.Series, Path]) -> None: - # TODO: What does this funtion do with the diff? I think it writes to a file? Or does it populate a cache too? """Diffs the results of every two merge tools on the same merge. - Does not diff merges that have the same result or merges that are not successful. + Writes the diff to a file. Args: args (Tuple[pd.Series,Path]): A tuple containing the merge data and the cache prefix. - Returns: - TODO: How is "The result of the test." related to the diff of two merge tools? I don't see this function returning anything. - dict: The result of the test. """ - merge_data, cache_prefix = args + merge_data, cache_directory = args if not merge_data["parents pass"]: return - # TODO: See other comment about "_prefix" in variable names; change throughout. - diff_file_prefix = cache_prefix / "merge_diffs" - diff_file_prefix.mkdir(parents=True, exist_ok=True) + diff_file_directory = cache_directory / "merge_diffs" + diff_file_directory.mkdir(parents=True, exist_ok=True) for merge_tool1 in MERGE_TOOL: repo1, merge_fingerprint1 = get_merge_fingerprint( - merge_data, merge_tool1, cache_prefix + merge_data, merge_tool1, cache_directory ) if repo1 is None or merge_fingerprint1 is None: continue for merge_tool2 in MERGE_TOOL: repo2, merge_fingerprint2 = get_merge_fingerprint( - merge_data, merge_tool2, cache_prefix + merge_data, merge_tool2, cache_directory ) if repo2 is None or merge_fingerprint2 is None: continue - diff_file = diff_file_prefix / diff_file_name( + diff_file = diff_file_directory / diff_file_name( merge_fingerprint1, merge_fingerprint2 ) if diff_file.exists(): @@ -137,9 +132,8 @@ def diff_file_name(sha1: str, sha2: str) -> Path: """ # Use lexicographic order to prevent duplicates if sha1 < sha2: - # TODO: Why does this use ".txt" rather than ".diff" as the file extension? - return Path(sha1 + "_" + sha2 + ".txt") - return Path(sha2 + "_" + sha1 + ".txt") + return Path(sha1 + "_" + sha2 + ".diff") + return Path(sha2 + "_" + sha1 + ".diff") if __name__ == "__main__": @@ -150,7 +144,6 @@ def diff_file_name(sha1: str, sha2: str) -> Path: parser.add_argument("--cache_dir", type=Path, default="cache/") args = parser.parse_args() cache_dir = Path(args.cache_dir) - # TODO: See comment elsewhere about "_path" in variable names. cache_diffs_path = cache_dir / "merge_diffs" cache_diffs_path.mkdir(parents=True, exist_ok=True) diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py index bc840976f7..d0c51a8701 100755 --- a/src/python/merge_tester.py +++ b/src/python/merge_tester.py @@ -42,7 +42,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: Returns: pd.Series: The result of the test. 
""" - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args while psutil.cpu_percent() > 90: print( "merge_tester: Waiting for CPU load to come down", @@ -56,7 +56,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data["parents pass"] = False # TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.) for branch in ["left", "right"]: - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) repo.checkout(merge_data[branch]) tree_fingerprint = repo.compute_tree_fingerprint() if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]: @@ -74,15 +74,12 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: merge_data[f"{branch} test result"] = test_result.name if test_result != TEST_STATE.Tests_passed: return merge_data - # TODO: It looks suspicious to me that the repo is only sometimes, not always, deleted. - del repo merge_data["parents pass"] = True for merge_tool in MERGE_TOOL: if is_merge_success(merge_data[merge_tool.name]): - # TODO: I suggest abstracting the body of this loop into a separate function, since it stands on its own logically. - repo = Repository(repo_slug, cache_prefix=cache_prefix) + repo = Repository(repo_slug, cache_directory=cache_directory) ( result, merge_fingerprint, @@ -111,7 +108,6 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series: ) # Update the status from merge success to test result. merge_data[merge_tool.name] = result.name - del repo assert merge_tool.name in merge_data return merge_data diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py index 2fcb574c1e..4b06bb7295 100755 --- a/src/python/merge_tools_comparator.py +++ b/src/python/merge_tools_comparator.py @@ -51,16 +51,12 @@ def merger( # pylint: disable=too-many-locals Returns: dict: A dictionary containing the merge result. """ - repo_slug, merge_data, cache_prefix = args + repo_slug, merge_data, cache_directory = args - # TODO: I think this is a key, not an entry. Please clarify. - # TODO: Why are these two SHAs (if they are SHAs) not sorted as is done elsewhere? - cache_entry = merge_data["left"] + "_" + merge_data["right"] - merge_cache_prefix = cache_prefix / "merge_results" + cache_key = merge_data["left"] + "_" + merge_data["right"] + merge_cache_directory = cache_directory / "merge_results" - result = lookup_in_cache(cache_entry, repo_slug, merge_cache_prefix, True) - # if result is None: - # raise Exception(cache_entry," HERE") + result = lookup_in_cache(cache_key, repo_slug, merge_cache_directory, True) if result is not None and isinstance(result, dict): return result @@ -74,9 +70,9 @@ def merger( # pylint: disable=too-many-locals merge_tool.name, ) cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []} - # TODO: I suggest abstracting out the body of this loop as a function. + # TODO: I suggest abstracting out the body of this loop as a function. 
         for i in range(N_REPETITIONS):
             repo = Repository(repo_slug, cache_directory=cache_directory)
             (
                 merge_status,
                 merge_fingerprint,
@@ -91,7 +87,7 @@ def merger(  # pylint: disable=too-many-locals
                 timeout=TIMEOUT_MERGING,
             )
             log_file: Path = (
-                cache_prefix
+                cache_directory
                 / "merge_logs"
                 / slug_repo_name(repo_slug)
                 / merge_data["left"]
@@ -132,11 +128,34 @@ def merger(  # pylint: disable=too-many-locals
         else:
             assert cache_data["left_tree_fingerprint"] == left_fingerprint
             assert cache_data["right_tree_fingerprint"] == right_fingerprint
-    del repo
-    set_in_cache(cache_entry, cache_data, repo_slug, merge_cache_prefix)
+    set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory)
     return cache_data
 
 
+def check_if_two_merges_differ(cache_data: dict) -> bool:
+    """Returns true if two merge tools differ on the same merge.
+    Args:
+        cache_data (dict): A dictionary containing the merge data.
+    Returns:
+        bool: True if two merge tools differ on the same merge.
+    """
+    for merge_tool1 in MERGE_TOOL:
+        for merge_tool2 in MERGE_TOOL:
+            if is_merge_success(
+                cache_data[merge_tool1.name]["results"][0]
+            ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
+                return True
+            if is_merge_success(
+                cache_data[merge_tool1.name]["results"][0]
+            ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
+                if (
+                    cache_data[merge_tool1.name]["merge_fingerprint"]
+                    != cache_data[merge_tool2.name]["merge_fingerprint"]
+                ):
+                    return True
+    return False
+
+
 if __name__ == "__main__":
     print("merge_tools_comparator: Start")
     parser = argparse.ArgumentParser()
@@ -228,22 +247,8 @@
         repo_slug = merger_arguments[i][0]
         merge_data = merger_arguments[i][1]
         cache_data = merger_results[i]
-        two_merge_tools_differ = False
-        # TODO: I suggest abstracting this loop into a function. That will also make it easier to return early, which can be done just with "return". (Currently it makes more iterations even after discovering that two merge tools do differ.)
-        for merge_tool1 in MERGE_TOOL:
-            for merge_tool2 in MERGE_TOOL:
-                if is_merge_success(
-                    cache_data[merge_tool1.name]["results"][0]
-                ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
-                    two_merge_tools_differ = True
-                if is_merge_success(
-                    cache_data[merge_tool1.name]["results"][0]
-                ) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
-                    if (
-                        cache_data[merge_tool1.name]["merge_fingerprint"]
-                        != cache_data[merge_tool2.name]["merge_fingerprint"]
-                    ):
-                        two_merge_tools_differ = True
+        two_merge_tools_differ = check_if_two_merges_differ(cache_data)
+
         merge_data["two merge tools differ"] = two_merge_tools_differ
         merge_data["left_tree_fingerprint"] = cache_data["left_tree_fingerprint"]
         merge_data["right_tree_fingerprint"] = cache_data["right_tree_fingerprint"]
diff --git a/src/python/repo.py b/src/python/repo.py
index 04c6e88c72..c332c564c3 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -76,12 +76,10 @@ def compute_explanation(
 
 def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]:
     """Returns the result of run_repo_tests.sh on the given working copy.
-    TODO: What is the definition of "the entire test"? How is it different than the test process?
-    If the test process passes then the entire test is marked as passed.
-    If the test process timeouts then the entire test is marked as timeout.
+    If the test process passes then the function returns and marks it as passed.
+    If the test process timeouts then the function returns and marks it as timedout.
     Args:
-        TODO: Minor: Throughout, you don't need "The path of" at the beginning of comments. Or change it to "The directory of".
-        wcopy_dir (Path): The path of the working copy (the clone).
+        wcopy_dir (Path): The directory of the working copy (the clone).
         timeout (int): Test timeout limit, in seconds.
     Returns:
         TEST_STATE: The result of the test.
@@ -114,23 +112,23 @@ class Repository:
     def __init__(
         self,
         repo_slug: str,
-        cache_prefix: Path = Path(""),
+        cache_directory: Path = Path(""),
     ) -> None:
         """Initializes the repository.
         Args:
             repo_slug (str): The slug of the repository, which is "owner/reponame".
-            cache_prefix (Path): The prefix of the cache.
+            cache_directory (Path): The prefix of the cache.
         """
         self.repo_slug = repo_slug
         self.path = REPOS_PATH / repo_slug.split("/")[1]
         workdir_id = uuid.uuid4().hex
-        self.workdir = WORKDIR_PREFIX / workdir_id
+        self.workdir = WORKDIR_DIRECTORY / workdir_id
         self.workdir.mkdir(parents=True, exist_ok=True)
         self.repo_path = self.workdir / self.path.name
         shutil.copytree(self.path, self.repo_path)
         self.repo = Repo(self.repo_path)
-        self.test_cache_prefix = cache_prefix / "test_cache"
-        self.sha_cache_prefix = cache_prefix / "sha_cache_entry"
+        self.test_cache_directory = cache_directory / "test_cache"
+        self.sha_cache_directory = cache_directory / "sha_cache_entry"
 
     def checkout(self, commit: str) -> Tuple[bool, str]:
         """Checks out the given commit.
@@ -154,13 +152,13 @@ def checkout(self, commit: str) -> Tuple[bool, str]:
                 + str(e)
             )
             cache_entry = {"sha": None, "explanation": explanation}
-            set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix)
+            set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory)
             return False, explanation
         cache_entry = {
             "sha": self.compute_tree_fingerprint(),
             "explanation": explanation,
         }
-        set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_prefix)
+        set_in_cache(commit, cache_entry, self.repo_slug, self.sha_cache_directory)
         return True, explanation
 
     def _merge_and_test(  # pylint: disable=too-many-arguments
@@ -190,11 +188,8 @@ def _merge_and_test(  # pylint: disable=too-many-arguments
         str: The tree fingerprint of the merge result.
         str: The left fingerprint.
         str: The right fingerprint.
-        TODO: Is this time to merge, time to test, or time to merge and test?
-        TODO: This is a ssingle number, but n_tests tells how many times to
-        TODO: run the test. Is this an average, median, overall, or something
-        TODO: else? (I have the same question elsewhere.)
-        float: The time it took to run the merge, in seconds.
+        float: The time it took to run the merge, in seconds. Does not include
+            the test time.
""" ( merge_status, @@ -287,7 +282,7 @@ def create_branch( commit, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return None, explanation tree_fingerprint = self.compute_tree_fingerprint() @@ -335,7 +330,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, {"sha": None, "explanation": explanation}, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( MERGE_STATE.Git_checkout_failed, @@ -387,7 +382,7 @@ def merge( # pylint: disable=too-many-locals cache_entry_name, cache_entry, self.repo_slug, - self.sha_cache_prefix, + self.sha_cache_directory, ) return ( merge_status, @@ -421,18 +416,18 @@ def compute_tree_fingerprint(self) -> str: def get_sha_cache_entry( self, commit: str, start_merge: bool = False ) -> Union[None, dict]: - # TODO: is a SHA cache a cache from a SHA to something, or a cache from something to a SHA? - """Gets a SHA cache entry. + """The SHA cache maps a commit to a tree fingerprint. + This function checks if the commit is in the cache. Args: commit (str): The commit to check. start_merge (bool, optional) = False: Whether to indicate that the merge starts if the commit is not in the cache. Returns: - # TODO: What is the structure of the dict this returns? - Union[None,dict]: The cache entry if the commit is in the cache, None otherwise. + Union[None, dict]: The cache entry if the commit is in the cache, None otherwise. + The dict contains the sha entry and an explanation entry. """ cache = lookup_in_cache( - commit, self.repo_slug, self.sha_cache_prefix, set_run=start_merge + commit, self.repo_slug, self.sha_cache_directory, set_run=start_merge ) if cache is None: return None @@ -453,7 +448,7 @@ def get_test_cache_entry( None otherwise. """ cache = lookup_in_cache( - sha, self.repo_slug, self.test_cache_prefix, set_run=start_test + sha, self.repo_slug, self.test_cache_directory, set_run=start_test ) if cache is None: return None @@ -511,10 +506,8 @@ def checkout_and_test( def test(self, timeout: int, n_tests: int) -> TEST_STATE: """Tests the repository. The test results of multiple runs is combined into one result. If one of the runs passes then the entire test is marked as passed. - # TODO: Should this line be preceded by "otherwise"? Is it possible that one run passes and another times out? - # TODO: I would think the test should be marked as timeout only if every run is timeout. If one of the runs timeouts then the entire test is marked as timeout. - # TODO: Otherwise the test is marked as failure? + Otherwise all runs must fail for the entire test to be marked as failed. Args: timeout (int): The timeout limit, in seconds. n_tests (int): The number of times to run the test suite. 
@@ -534,7 +527,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE:
             test_state, test_output = repo_test(self.repo_path, timeout)
             test_log_file = Path(
                 os.path.join(
-                    self.test_cache_prefix,
+                    self.test_cache_directory,
                     "logs",
                     slug_repo_name(self.repo_slug),
                     sha + "_" + str(i) + ".log",
@@ -551,7 +544,7 @@ def test(self, timeout: int, n_tests: int) -> TEST_STATE:
             if test_state in (TEST_STATE.Tests_passed, TEST_STATE.Tests_timedout):
                 break
 
-        set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_prefix)
+        set_in_cache(sha, cache_data, self.repo_slug, self.test_cache_directory)
         return TEST_STATE[cache_data["test_result"]]
 
     def __del__(self) -> None:
diff --git a/src/python/test_repo_heads.py b/src/python/test_repo_heads.py
index 3604942293..714007b2f1 100755
--- a/src/python/test_repo_heads.py
+++ b/src/python/test_repo_heads.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
-# TODO: What does it mean to "validate it"?
-"""Tests the HEAD commits of multiple repos and validates it if the test passes.
+"""Tests the HEAD commits of multiple repos and considers them as valid if the test passes.
 
 usage: python3 test_repo_heads.py --repos_csv_with_hashes
                                   --output_path
@@ -43,7 +42,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
     repo_slug = repo_info["repository"]
     print("test_repo_heads:", repo_slug, ": head_passes_tests : started")
 
-    repo = Repository(repo_slug, cache_prefix=cache)
+    repo = Repository(repo_slug, cache_directory=cache)
     test_state = repo.checkout_and_test(
         repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3
     )
diff --git a/src/python/variables.py b/src/python/variables.py
index 1ce260b1f0..d81c733af9 100644
--- a/src/python/variables.py
+++ b/src/python/variables.py
@@ -9,8 +9,9 @@
 CACHE_BACKOFF_TIME = 2 * 60  # 2 minutes, in seconds
 DELETE_WORKDIRS = True
 REPOS_PATH = Path("repos")
-# TODO: What is this used for? Please give it a more descriptive name.
-WORKDIR_PREFIX = Path(".workdir")
+WORKDIR_DIRECTORY = Path(
+    ".workdir"
+)  # Merges and testing will be performed in this directory.
 TIMEOUT_MERGING = 60 * 15  # 15 minutes, in seconds
 
 N_REPETITIONS = 3
diff --git a/src/scripts/merge_tools/intellimerge.sh b/src/scripts/merge_tools/intellimerge.sh
index c79bd27073..f4fa8d157c 100755
--- a/src/scripts/merge_tools/intellimerge.sh
+++ b/src/scripts/merge_tools/intellimerge.sh
@@ -21,8 +21,7 @@ intellimerge_absolutepath="${ROOT_DIR}/${intellimerge_relativepath}"
 clone_dir=$1
 branch1=$2
 branch2=$3
-# TODO: Will this be in the repository? Would it be safer to put it under /tmp or /scratch?
-temp_dir=".workdir/intelli_temp_$$/"
+temp_dir="/tmp/intelli_temp_$$/"
 mkdir $temp_dir
 
 # run intellimerge
diff --git a/src/scripts/utils/run_multiple_machines.sh b/src/scripts/utils/run_multiple_machines.sh
index 2c2b7a5351..3db4743239 100755
--- a/src/scripts/utils/run_multiple_machines.sh
+++ b/src/scripts/utils/run_multiple_machines.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
-# TODO: A branch of what? This repository? (Same question in `run_remotely.sh`.)
-# Run the code for a certain branch on multiple remote machines.
+# Run the code of a specific branch of this repository on multiple remote machines.
 # Usage: ./run_multiple_machines.sh
 # is the branch to run
 # is the path to a file containing the addresses of the machines (one address per line)
diff --git a/src/scripts/utils/run_remotely.sh b/src/scripts/utils/run_remotely.sh
index 44b60dd87c..86f9751ef8 100755
--- a/src/scripts/utils/run_remotely.sh
+++ b/src/scripts/utils/run_remotely.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-# Run the code for a certain branch on a remote machine.
+# Run the code of a specific branch of this repository on a remote machine.
 # Usage: ./run_remotely.sh
 # is the branch to run
 # is the ssh address of the machine

From 40e90af3b06bede4e170220307a42f84c602dc1f Mon Sep 17 00:00:00 2001
From: Benedikt schesch
Date: Mon, 25 Sep 2023 15:34:39 -0700
Subject: [PATCH 4/7] Further changes

---
 src/python/cache_utils.py                     |   5 -
 src/python/merge_differ.py                    |   2 -
 src/python/merge_tester.py                    |  89 ++++++----
 src/python/merge_tools_comparator.py          | 163 +++++++++---------
 .../resolve-import-conflicts-in-file.py       |  19 +-
 src/scripts/merge_tools/spork.sh              |   2 +-
 src/scripts/run_repo_tests.sh                 |   5 +-
 7 files changed, 145 insertions(+), 140 deletions(-)

diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
index 2ef4864ca6..4b8cea5855 100755
--- a/src/python/cache_utils.py
+++ b/src/python/cache_utils.py
@@ -35,7 +35,6 @@ def set_in_cache(
     cache_value: Union[str, dict, None],
     repo_slug: str,
     cache_directory: Path,
-    overwrite: bool = True,
     acquire_lock: bool = True,
 ) -> None:
     """Puts an entry in the cache, then writes the cache to disk.
@@ -45,16 +44,12 @@ def set_in_cache(
         cache_value (dict): The value to write.
         repo_slug (str): The slug of the repository, which is "owner/reponame".
         cache_directory (Path): The path to the cache directory.
-        overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
-            If False, attempting to overwrite an existing cache entry throws an exception.
     """
     if acquire_lock:
         lock = get_cache_lock(repo_slug, cache_directory)
         lock.acquire()
     cache_path = get_cache_path(repo_slug, cache_directory)
     cache = load_cache(repo_slug, cache_directory)
-    if cache_key in cache and not overwrite:
-        raise ValueError("Cache key already exists")
     cache[cache_key] = cache_value
     output = json.dumps(cache, indent=4)
     with open(cache_path, "w") as f:
diff --git a/src/python/merge_differ.py b/src/python/merge_differ.py
index 574eddef00..cd8e3a60fa 100755
--- a/src/python/merge_differ.py
+++ b/src/python/merge_differ.py
@@ -118,8 +118,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None:
             command.append(str(repo2.repo_path))
             with open(diff_file, "w") as f:
                 subprocess.run(command, stdout=f, stderr=f)
-    del repo2
-    del repo1
 
 
 def diff_file_name(sha1: str, sha2: str) -> Path:
diff --git a/src/python/merge_tester.py b/src/python/merge_tester.py
index d0c51a8701..cf2a396148 100755
--- a/src/python/merge_tester.py
+++ b/src/python/merge_tester.py
@@ -54,10 +54,10 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
     print("merge_tester: Started ", repo_slug, merge_data["left"], merge_data["right"])
 
     merge_data["parents pass"] = False
-    # TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.)
     for branch in ["left", "right"]:
+        commit_sha = merge_data[branch]
         repo = Repository(repo_slug, cache_directory=cache_directory)
-        repo.checkout(merge_data[branch])
+        repo.checkout(commit_sha)
         tree_fingerprint = repo.compute_tree_fingerprint()
         if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]:
             raise Exception(
@@ -112,6 +112,54 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
     return merge_data
 
 
+def build_arguments(
+    args: argparse.Namespace,
+    repo_slug: str,
+) -> list:
+    """Builds the arguments for the merge_tester function.
+    Args:
+        args (argparse.Namespace): The arguments of the script.
+        repo_slug (str): The slug of the repository.
+    Returns:
+        list: The arguments for the merge_tester function.
+    """
+    merge_list_file = Path(
+        os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
+    )
+    output_file = Path(
+        os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
+    )
+    if not merge_list_file.exists():
+        print(
+            "merge_tester:",
+            repo_slug,
+            "does not have a list of merges. Missing file: ",
+            merge_list_file,
+        )
+        return []
+
+    if output_file.exists():
+        print(
+            "merge_tester: Skipping",
+            repo_slug,
+            "because it is already computed.",
+        )
+        return []
+    try:
+        merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
+    except pd.errors.EmptyDataError:
+        print(
+            "merge_tester: Skipping",
+            repo_slug,
+            "because it does not contain any merges.",
+        )
+        return []
+    return [
+        (repo_slug, merge_data, Path(args.cache_dir))
+        for _, merge_data in merges.iterrows()
+    ]
+
+
 def main():  # pylint: disable=too-many-locals,too-many-statements
     """Main function"""
     print("merge_tester: Start")
@@ -126,46 +174,11 @@ def main():  # pylint: disable=too-many-locals,too-many-statements
 
     repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")
 
-    # TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test").
     print("merge_tester: Started collecting merges to test")
     merge_tester_arguments = []
     for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
         repo_slug = repository_data["repository"]
-        merge_list_file = Path(
-            os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
-        )
-        output_file = Path(
-            os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
-        )
-        if not merge_list_file.exists():
-            print(
-                "merge_tester:",
-                repo_slug,
-                "does not have a list of merges. Missing file: ",
-                merge_list_file,
-            )
-            continue
-
-        if output_file.exists():
-            print(
-                "merge_tester: Skipping",
-                repo_slug,
-                "because it is already computed.",
-            )
-            continue
-        try:
-            merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
-        except pd.errors.EmptyDataError:
-            print(
-                "merge_tester: Skipping",
-                repo_slug,
-                "because it does not contain any merges.",
-            )
-            continue
-        merge_tester_arguments += [
-            (repo_slug, merge_data, Path(args.cache_dir))
-            for _, merge_data in merges.iterrows()
-        ]
+        merge_tester_arguments += build_arguments(args, repo_slug)
 
     # Shuffle input to reduce cache contention
     random.seed(42)
diff --git a/src/python/merge_tools_comparator.py b/src/python/merge_tools_comparator.py
index 4b06bb7295..d149730d9d 100755
--- a/src/python/merge_tools_comparator.py
+++ b/src/python/merge_tools_comparator.py
@@ -69,8 +69,18 @@ def merger(  # pylint: disable=too-many-locals
             merge_data["right"],
             merge_tool.name,
         )
-        cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []}
+        cache_data[merge_tool.name] = {"results": [], "run_time": []}
+        log_file: Path = (
+            cache_directory
+            / "merge_logs"
+            / slug_repo_name(repo_slug)
+            / merge_data["left"]
+            / merge_data["right"]
+            / (merge_tool.name + f".log")
+        )
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+        log_file.unlink(missing_ok=True)
+
         for i in range(N_REPETITIONS):
             repo = Repository(repo_slug, cache_directory=cache_directory)
             (
@@ -86,48 +96,25 @@ def merger(  # pylint: disable=too-many-locals
                 right_commit=merge_data["right"],
                 timeout=TIMEOUT_MERGING,
             )
-            log_file: Path = (
-                cache_directory
-                / "merge_logs"
-                / slug_repo_name(repo_slug)
-                / merge_data["left"]
-                / merge_data["right"]
-                / (merge_tool.name + f".log")
-            )
-            log_file.parent.mkdir(parents=True, exist_ok=True)
-            if log_file.exists():
-                log_file.unlink()
-            with open(log_file, "w") as f:
-                f.write(explanation)
+            if i == 0:  # Only write the log file once.
+                with open(log_file, "w") as f:
+                    f.write(explanation)
+
             cache_data[merge_tool.name]["results"].append(merge_status.name)
             cache_data[merge_tool.name]["log_file"] = str(log_file)
             cache_data[merge_tool.name]["run_time"].append(run_time)
             if "merge_fingerprint" not in cache_data[merge_tool.name]:
                 cache_data[merge_tool.name]["merge_fingerprint"] = merge_fingerprint
-            else:
-                if (
-                    cache_data[merge_tool.name]["merge_fingerprint"]
-                    != merge_fingerprint
-                ):
-                    raise Exception(
-                        "merge_tools_comparator: Merge fingerprint mismatch",
-                        i,
-                        repo_slug,
-                        merge_data["left"],
-                        merge_data["right"],
-                        merge_tool.name,
-                        merge_status,
-                        cache_data[merge_tool.name]["merge_fingerprint"],
-                        merge_fingerprint,
-                        cache_data,
-                    )
-
-            if "left_tree_fingerprint" not in cache_data:
                 cache_data["left_tree_fingerprint"] = left_fingerprint
                 cache_data["right_tree_fingerprint"] = right_fingerprint
             else:
+                assert (
+                    cache_data[merge_tool.name]["merge_fingerprint"]
+                    == merge_fingerprint
+                )
                 assert cache_data["left_tree_fingerprint"] == left_fingerprint
                 assert cache_data["right_tree_fingerprint"] == right_fingerprint
+
     set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory)
     return cache_data
 
 
+def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
+    """
+    Creates the arguments for the merger function.
+    Args:
+        args (argparse.Namespace): The arguments to the script.
+        repo_slug (str): The repository slug.
+ Returns: + list: A list of arguments for the merger function. + """ + merge_list_file = Path( + os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") + ) + output_file = Path( + os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") + ) + if not merge_list_file.exists(): + raise Exception( + "merge_tools_comparator:", + repo_slug, + "does not have a list of merges. Missing file: ", + merge_list_file, + ) + + if output_file.exists(): + print( + "merge_tools_comparator: Skipping", + repo_slug, + "because it is already computed.", + ) + return [] + + merges = pd.read_csv( + merge_list_file, + names=["idx", "branch_name", "merge", "left", "right", "notes"], + dtype={ + "idx": int, + "branch_name": str, + "merge": str, + "left": str, + "right": str, + "notes": str, + }, + header=0, + index_col="idx", + ) + merges["notes"].replace(np.nan, "", inplace=True) + + if args.only_trivial_merges: + merges = merges[merges["notes"].str.contains("a parent is the base")] + elif not args.include_trivial_merges: + merges = merges[~merges["notes"].str.contains("a parent is the base")] + arguments = [ + (repo_slug, merge_data, Path(args.cache_dir)) + for _, merge_data in merges.iterrows() + ] + return arguments + + if __name__ == "__main__": print("merge_tools_comparator: Start") parser = argparse.ArgumentParser() @@ -174,56 +219,8 @@ def check_if_two_merges_differ(cache_data: dict) -> bool: print("merge_tools_comparator: Constructing Inputs") merger_arguments = [] for _, repository_data in tqdm(repos.iterrows(), total=len(repos)): - # TODO: I suggest abstracting out the body of this loop as a function. - merges_repo = [] repo_slug = repository_data["repository"] - merge_list_file = Path( - os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv") - ) - output_file = Path( - os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv") - ) - if not merge_list_file.exists(): - raise Exception( - "merge_tools_comparator:", - repo_slug, - "does not have a list of merges. Missing file: ", - merge_list_file, - ) - - if output_file.exists(): - print( - "merge_tools_comparator: Skipping", - repo_slug, - "because it is already computed.", - ) - continue - - merges = pd.read_csv( - merge_list_file, - names=["idx", "branch_name", "merge", "left", "right", "notes"], - dtype={ - "idx": int, - "branch_name": str, - "merge": str, - "left": str, - "right": str, - "notes": str, - }, - header=0, - index_col="idx", - ) - merges["notes"].replace(np.nan, "", inplace=True) - - if args.only_trivial_merges: - merges = merges[merges["notes"].str.contains("a parent is the base")] - elif not args.include_trivial_merges: - merges = merges[~merges["notes"].str.contains("a parent is the base")] - - merger_arguments += [ - (repo_slug, merge_data, Path(args.cache_dir)) - for _, merge_data in merges.iterrows() - ] + merger_arguments += build_merge_arguments(args, repo_slug) # Shuffle input to reduce cache contention random.seed(42) diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py index 864a4a8bbc..36f2750419 100755 --- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py +++ b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 # TODO: Benedikt, could you review and improve this file? I'm sure it is not perfect Python style. """Edits a file in place to remove conflict markers related to Java imports. 
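# Review illustration (not part of the patch): when the base and both parents of
# a conflict consist solely of `import` lines, the merge() helper below returns
# the concatenation of the two parents' lines. A minimal sketch:
#
#     base = []
#     parent1 = ["import java.util.List;\n"]
#     parent2 = ["import java.util.Map;\n"]
#     merge(base, parent1, parent2)
#     # => ["import java.util.List;\n", "import java.util.Map;\n"]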
@@ -12,6 +12,7 @@
 import shutil
 import sys
 import tempfile
+from typing import List, Union, Tuple

 if len(sys.argv) != 2:
     print(
@@ -24,12 +25,12 @@
 lines = file.readlines()


-def all_import_lines(lines):
+def all_import_lines(lines: List[str]) -> bool:
     """Return true if every line is a Java import line."""
     return all(line.startswith("import ") for line in lines)


-def merge(base, parent1, parent2):
+def merge(base, parent1: List[str], parent2: List[str]) -> Union[List[str], None]:
     """Given text for the base and two parents, return merged text.

     This currently does a simplistic merge that retains all lines in either parent.
@@ -40,7 +41,7 @@ def merge(base, parent1, parent2):
         parent2: a list of lines

     Returns:
-        a list of lines, or None if it cannot do merging.
+        Union[List[str], None]: If the merge is successful, returns a list of lines; otherwise, returns None.
     """

     if (
@@ -53,7 +54,9 @@ def merge(base, parent1, parent2):
     return None


-def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
+def looking_at_conflict(  # pylint: disable=too-many-return-statements
+    start_index: int, lines: List[str]
+) -> Union[None, Tuple[List[str], List[str], List[str], int]]:
     """Tests whether the text starting at line `start_index` is the beginning of a conflict.
     If not, returns None.
     If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict)
@@ -82,7 +85,7 @@ def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
         if index == num_lines:
             print(
                 "Starting at line "
-                + start_index
+                + str(start_index)
                 + ", did not find ||||||| or ======= in "
                 + filename
             )
@@ -98,7 +101,7 @@ def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
         if index == num_lines:
             print(
                 "Starting at line "
-                + start_index
+                + str(start_index)
                 + ", did not find ======= in "
                 + filename
             )
@@ -114,7 +117,7 @@ def looking_at_conflict(start_index, lines):  # pylint: disable=R0911
         if index == num_lines:
             print(
                 "Starting at line "
-                + start_index
+                + str(start_index)
                 + ", did not find >>>>>>> in "
                 + filename
             )
diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh
index 3e38375de0..7748d5f2e2 100755
--- a/src/scripts/merge_tools/spork.sh
+++ b/src/scripts/merge_tools/spork.sh
@@ -13,8 +13,8 @@ if [ "$#" -ne 3 ]; then
     exit 1
 fi

-# TODO: This script is an odd place to put this `killall` command. Should it be elsewhere in the experimental infrastructure?
 # Kill all Java processes that are running for over an hour (to avoid memory leaks).
+# Spork tends to create Java processes that don't terminate even when the parent process is killed.
 killall -9 java --older-than 1h

 SCRIPT_PATH="$(dirname "$0")"; SCRIPT_PATH="$(eval "cd \"$SCRIPT_PATH\" && pwd")"
diff --git a/src/scripts/run_repo_tests.sh b/src/scripts/run_repo_tests.sh
index fc032b38b2..1d647d9b27 100755
--- a/src/scripts/run_repo_tests.sh
+++ b/src/scripts/run_repo_tests.sh
@@ -7,9 +7,8 @@ set -o nounset

-# TODO: This is too agressive and will interfere with the user's other work.
-# TODO: Instead, create a directory under /tmp (or, better, under /tmp/$USER) for the AST merging experiments, and clean it.
-# Test side effects can be seen in the /tmp directory.
+# The testing infrastructure creates temporary files in /tmp as a side effect.
+# We delete them to avoid filling up the disk.
 # We delete all the files older than 2h and owned by the current user.
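+# (-maxdepth 1 limits the cleanup to top-level entries of /tmp, and
+# -mmin +120 matches entries last modified more than 120 minutes ago.)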
 find /tmp -maxdepth 1 -user "$(whoami)" -mmin +120 -exec rm -rf {} \;

From 1d1b8474bf73178135012b596a39fe760308d143 Mon Sep 17 00:00:00 2001
From: Benedikt schesch
Date: Mon, 25 Sep 2023 17:53:17 -0700
Subject: [PATCH 5/7] Reformatting change

---
 .../merge_tools/resolve-import-conflicts |  2 +-
 .../resolve-import-conflicts-in-file.py  | 76 +++++++++----------
 2 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/src/scripts/merge_tools/resolve-import-conflicts b/src/scripts/merge_tools/resolve-import-conflicts
index feaee8b83c..aa78cd5231 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts
+++ b/src/scripts/merge_tools/resolve-import-conflicts
@@ -19,7 +19,7 @@ SCRIPTDIR="$(cd "$(dirname "$0")" && pwd -P)"

 status=0
 for file in "${files[@]}" ; do
-  if ! "${SCRIPTDIR}"/resolve-import-conflicts-in-file.py "$file" ; then
+  if ! "${SCRIPTDIR}"/resolve-import-conflicts-in-file.py --file "$file" ; then
     status=1
   fi
 done
diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
index 36f2750419..319e4fb35f 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
+++ b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py
@@ -8,23 +8,12 @@

 # TODO: merge both scripts into one.

-import os
 import shutil
-import sys
+import argparse
+from pathlib import Path
 import tempfile
 from typing import List, Union, Tuple

-if len(sys.argv) != 2:
-    print(
-        "resolve-import-conflicts-in-file: Provide exactly one command-line argument."
-    )
-    sys.exit(1)
-
-filename = sys.argv[1]
-with open(filename) as file:
-    lines = file.readlines()
-
-
 def all_import_lines(lines: List[str]) -> bool:
     """Return true if every line is a Java import line."""
     return all(line.startswith("import ") for line in lines)
@@ -55,7 +44,7 @@ def merge(base, parent1: List[str], parent2: List[str]) -> Union[List[str], None]


 def looking_at_conflict(  # pylint: disable=too-many-return-statements
-    start_index: int, lines: List[str]
+    file: Path, start_index: int, lines: List[str]
 ) -> Union[None, Tuple[List[str], List[str], List[str], int]]:
     """Tests whether the text starting at line `start_index` is the beginning of a conflict.
     If not, returns None.
     If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict)
     where the first 3 elements of the tuple are lists of lines.
     Args:
         start_index: an index into `lines`.
-        lines: all the lines of the file with name `filename`.
+        lines: all the lines of the file with name `file`.
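+        file: the path of the file being processed; it is used only in diagnostic messages.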
""" if not lines[start_index].startswith("<<<<<<<"): @@ -87,13 +76,13 @@ def looking_at_conflict( # pylint: disable=too-many-return-statements "Starting at line " + str(start_index) + ", did not find ||||||| or ======= in " - + filename + + str(file) ) return None if lines[index].startswith("|||||||"): index = index + 1 if index == num_lines: - print("File ends with |||||||: " + filename) + print("File ends with |||||||: " + str(file)) return None while not lines[index].startswith("======="): base.append(lines[index]) @@ -103,13 +92,13 @@ def looking_at_conflict( # pylint: disable=too-many-return-statements "Starting at line " + str(start_index) + ", did not find ======= in " - + filename + + str(file) ) return None assert lines[index].startswith("=======") index = index + 1 # skip over "=======" line if index == num_lines: - print("File ends with =======: " + filename) + print("File ends with =======: " + str(file)) return None while not lines[index].startswith(">>>>>>>"): parent2.append(lines[index]) @@ -119,7 +108,7 @@ def looking_at_conflict( # pylint: disable=too-many-return-statements "Starting at line " + str(start_index) + ", did not find >>>>>>> in " - + filename + + str(file) ) return None index = index + 1 @@ -127,27 +116,32 @@ def looking_at_conflict( # pylint: disable=too-many-return-statements return (base, parent1, parent2, index - start_index) -## Main starts here. - -with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp: - file_len = len(lines) - i = 0 - while i < file_len: - conflict = looking_at_conflict(i, lines) - if conflict is None: - tmp.write(lines[i]) - i = i + 1 - else: - (base, parent1, parent2, num_lines) = conflict - merged = merge(base, parent1, parent2) - if merged is None: +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--file", type=Path) + args = parser.parse_args() + + with open(args.file) as file: + lines = file.readlines() + + with tempfile.NamedTemporaryFile(mode="w", delete=True) as tmp: + file_len = len(lines) + i = 0 + while i < file_len: + conflict = looking_at_conflict(Path(args.file), i, lines) + if conflict is None: tmp.write(lines[i]) i = i + 1 else: - for line in merged: - tmp.write(line) - i = i + num_lines - - tmp.close() - shutil.copy(tmp.name, filename) - os.unlink(tmp.name) + (base, parent1, parent2, num_lines) = conflict + merged = merge(base, parent1, parent2) + if merged is None: + tmp.write(lines[i]) + i = i + 1 + else: + for line in merged: + tmp.write(line) + i = i + num_lines + + # Copying the file back to the original location + shutil.copyfile(tmp.name, args.file) From b8378f2d96c98df2087f4d8af9b35c49f408adec Mon Sep 17 00:00:00 2001 From: Benedikt schesch Date: Mon, 25 Sep 2023 18:06:05 -0700 Subject: [PATCH 6/7] Style Fix --- src/scripts/merge_tools/resolve-import-conflicts-in-file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py index 319e4fb35f..adef39b067 100755 --- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py +++ b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py @@ -14,6 +14,7 @@ import tempfile from typing import List, Union, Tuple + def all_import_lines(lines: List[str]) -> bool: """Return true if every line is a Java import line.""" return all(line.startswith("import ") for line in lines) From ac5fd880b7267f57afe2acd21dea13e4fe0c3f4d Mon Sep 17 00:00:00 2001 From: Michael Ernst Date: Wed, 27 Sep 2023 11:54:51 -0700 
Subject: [PATCH 7/7] Make resolve-import-conflicts return a status code (#218)

---
 src/scripts/merge_tools/git_hires_merge.sh    |   2 +-
 src/scripts/merge_tools/gitmerge_ort.sh       |   2 +-
 .../merge_tools/gitmerge_ort_ignorespace.sh   |   2 +-
 .../merge_tools/gitmerge_ort_imports.sh       |  15 +-
 .../gitmerge_ort_imports_ignorespace.sh       |  15 +-
 .../gitmerge_recursive_histogram.sh           |   2 +-
 .../gitmerge_recursive_ignorespace.sh         |   2 +-
 .../merge_tools/gitmerge_recursive_minimal.sh |   2 +-
 .../merge_tools/gitmerge_recursive_myers.sh   |   2 +-
 .../gitmerge_recursive_patience.sh            |   2 +-
 src/scripts/merge_tools/gitmerge_resolve.sh   |   2 +-
 src/scripts/merge_tools/resolve-conflicts.py  | 285 ++++++++++++++++++
 .../merge_tools/resolve-import-conflicts      |  22 +-
 .../resolve-import-conflicts-in-file.py       | 148 ---------
 14 files changed, 335 insertions(+), 168 deletions(-)
 create mode 100755 src/scripts/merge_tools/resolve-conflicts.py
 delete mode 100755 src/scripts/merge_tools/resolve-import-conflicts-in-file.py

diff --git a/src/scripts/merge_tools/git_hires_merge.sh b/src/scripts/merge_tools/git_hires_merge.sh
index b1912eb703..a463e936b0 100755
--- a/src/scripts/merge_tools/git_hires_merge.sh
+++ b/src/scripts/merge_tools/git_hires_merge.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh

-# usage: ./git-hires-merge.sh <clone_dir> <branch1> <branch2>
+# usage: ./git_hires_merge.sh <clone_dir> <branch1> <branch2>

 clone_dir=$1
 branch1=$2
diff --git a/src/scripts/merge_tools/gitmerge_ort.sh b/src/scripts/merge_tools/gitmerge_ort.sh
index 51947da10e..d8a695734b 100755
--- a/src/scripts/merge_tools/gitmerge_ort.sh
+++ b/src/scripts/merge_tools/gitmerge_ort.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh

-# usage: ./gitmerge-ort.sh <clone_dir> <branch1> <branch2>
+# usage: ./gitmerge_ort.sh <clone_dir> <branch1> <branch2>

 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh b/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh
index 2a40212493..ef1a5df5f8 100755
--- a/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh
+++ b/src/scripts/merge_tools/gitmerge_ort_ignorespace.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env sh

-# usage: ./gitmerge-ort-ignorespace.sh <clone_dir> <branch1> <branch2>
+# usage: ./gitmerge_ort_ignorespace.sh <clone_dir> <branch1> <branch2>

 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/gitmerge_ort_imports.sh b/src/scripts/merge_tools/gitmerge_ort_imports.sh
index 221280a6e0..eb83295136 100755
--- a/src/scripts/merge_tools/gitmerge_ort_imports.sh
+++ b/src/scripts/merge_tools/gitmerge_ort_imports.sh
@@ -1,12 +1,21 @@
 #!/usr/bin/env sh

-# usage: ./gitmerge-ort.sh <clone_dir> <branch1> <branch2>
+# usage: ./gitmerge_ort_imports.sh <clone_dir> <branch1> <branch2>

 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
 branch1=$2
 branch2=$3
 strategy="-s ort"
-"$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"
+if "$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"; then
+    exit 0
+fi

-(cd "$clone_dir" && "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts)
+cd "$clone_dir" || exit 1
+if ! "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts; then
"$MERGE_SCRIPTS_DIR"/resolve-import-conflicts; then + echo "Conflict" + git merge --abort + exit 1 +fi + +exit 0 diff --git a/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh b/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh index 03770b22f7..41ee46a2e1 100755 --- a/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh +++ b/src/scripts/merge_tools/gitmerge_ort_imports_ignorespace.sh @@ -1,12 +1,21 @@ #!/usr/bin/env sh -# usage: ./gitmerge-ort-ignorespace.sh +# usage: ./gitmerge_ort_imports_ignorespace.sh MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)" clone_dir=$1 branch1=$2 branch2=$3 strategy="-s ort -Xignore-space-change" -"$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy" +if "$MERGE_SCRIPTS_DIR"/gitmerge.sh "$clone_dir" "$branch1" "$branch2" "$strategy"; then + exit 0 +fi -(cd "$clone_dir" && "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts) +cd "$clone_dir" || exit 1 +if ! "$MERGE_SCRIPTS_DIR"/resolve-import-conflicts; then + echo "Conflict" + git merge --abort + exit 1 +fi + +exit 0 diff --git a/src/scripts/merge_tools/gitmerge_recursive_histogram.sh b/src/scripts/merge_tools/gitmerge_recursive_histogram.sh index 8e239dd933..6453c6e00e 100755 --- a/src/scripts/merge_tools/gitmerge_recursive_histogram.sh +++ b/src/scripts/merge_tools/gitmerge_recursive_histogram.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -# usage: ./gitmerge-recursive-histogram.sh +# usage: ./gitmerge_recursive_histogram.sh MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)" clone_dir=$1 diff --git a/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh b/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh index 3535ace430..812d2dcd18 100755 --- a/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh +++ b/src/scripts/merge_tools/gitmerge_recursive_ignorespace.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -# usage: ./gitmerge-recursive-histogram.sh +# usage: ./gitmerge_recursive_ignorespace.sh MERGE_DIR="$(dirname "$0")" clone_dir=$1 diff --git a/src/scripts/merge_tools/gitmerge_recursive_minimal.sh b/src/scripts/merge_tools/gitmerge_recursive_minimal.sh index 4c2aefb8ef..e7d916539a 100755 --- a/src/scripts/merge_tools/gitmerge_recursive_minimal.sh +++ b/src/scripts/merge_tools/gitmerge_recursive_minimal.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -# usage: ./gitmerge-recursive-minimal.sh +# usage: ./gitmerge_recursive_minimal.sh MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)" clone_dir=$1 diff --git a/src/scripts/merge_tools/gitmerge_recursive_myers.sh b/src/scripts/merge_tools/gitmerge_recursive_myers.sh index df12c4306c..f95f093ba7 100755 --- a/src/scripts/merge_tools/gitmerge_recursive_myers.sh +++ b/src/scripts/merge_tools/gitmerge_recursive_myers.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -# usage: ./gitmerge-recursive-myers.sh +# usage: ./gitmerge_recursive_myers.sh MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)" clone_dir=$1 diff --git a/src/scripts/merge_tools/gitmerge_recursive_patience.sh b/src/scripts/merge_tools/gitmerge_recursive_patience.sh index 6130ebbb4c..77e5a8e449 100755 --- a/src/scripts/merge_tools/gitmerge_recursive_patience.sh +++ b/src/scripts/merge_tools/gitmerge_recursive_patience.sh @@ -1,6 +1,6 @@ #!/usr/bin/env sh -# usage: ./gitmerge-recursive-patience.sh +# usage: ./gitmerge_recursive_patience.sh MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)" clone_dir=$1 diff --git a/src/scripts/merge_tools/gitmerge_resolve.sh b/src/scripts/merge_tools/gitmerge_resolve.sh index 928a97960a..859faf70e2 100755 --- 
+++ b/src/scripts/merge_tools/gitmerge_resolve.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash

-# usage: ./gitmerge-resolve.sh <clone_dir> <branch1> <branch2>
+# usage: ./gitmerge_resolve.sh <clone_dir> <branch1> <branch2>

 MERGE_SCRIPTS_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 clone_dir=$1
diff --git a/src/scripts/merge_tools/resolve-conflicts.py b/src/scripts/merge_tools/resolve-conflicts.py
new file mode 100755
index 0000000000..0266230039
--- /dev/null
+++ b/src/scripts/merge_tools/resolve-conflicts.py
@@ -0,0 +1,285 @@
+#! /usr/bin/env python3
+
+# This is a helper script for `resolve-adjacent-conflicts` and
+# `resolve-import-conflicts`.
+
+"""Edits a file in place to remove certain conflict markers.
+
+Usage: resolve-conflicts.py [options] <filename>
+
+--java_imports: Resolves conflicts related to Java import statements.
+The output includes every `import` statement that is in either of the parents.
+
+--adjacent_lines: Resolves conflicts on adjacent lines, by accepting both edits.
+
+Exit status is 0 (success) if no conflicts remain.
+Exit status is 1 (failure) if conflicts remain.
+"""
+
+from argparse import ArgumentParser
+import itertools
+import os
+import shutil
+import sys
+import tempfile
+from typing import List, Union, Tuple
+
+
+def main():  # pylint: disable=too-many-locals
+    """The main entry point."""
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument("filename")
+    arg_parser.add_argument(
+        "--java_imports",
+        action="store_true",
+        help="If set, resolve conflicts related to Java import statements",
+    )
+    arg_parser.add_argument(
+        "--adjacent_lines",
+        action="store_true",
+        help="If set, resolve conflicts on adjacent lines",
+    )
+    args = arg_parser.parse_args()
+    filename = args.filename
+
+    with open(filename) as file:
+        lines = file.readlines()
+
+    # Exit status 0 means no conflicts remain, 1 means some merge conflict remains.
+    conflicts_remain = False
+    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp:
+        file_len = len(lines)
+        i = 0
+        while i < file_len:
+            conflict = looking_at_conflict(filename, i, lines)
+            if conflict is None:
+                tmp.write(lines[i])
+                i = i + 1
+            else:
+                (base, parent1, parent2, num_lines) = conflict
+                merged = merge(
+                    base, parent1, parent2, args.java_imports, args.adjacent_lines
+                )
+                if merged is None:
+                    tmp.write(lines[i])
+                    i = i + 1
+                    conflicts_remain = True
+                else:
+                    for line in merged:
+                        tmp.write(line)
+                    i = i + num_lines
+
+    tmp.close()
+    shutil.copy(tmp.name, filename)
+    os.unlink(tmp.name)
+
+    if conflicts_remain:
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+
+def looking_at_conflict(  # pylint: disable=too-many-return-statements
+    filename: str, start_index: int, lines: List[str]
+) -> Union[None, Tuple[List[str], List[str], List[str], int]]:
+    """Tests whether the following text starts a conflict.
+    If not, returns None.
+    If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict)
+    where the first 3 elements of the tuple are lists of lines.
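+    The 4th element counts every line of the conflict, including the
+    <<<<<<<, |||||||, =======, and >>>>>>> marker lines themselves.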
+ """ + + if not lines[start_index].startswith("<<<<<<<"): + return None + + base = [] + parent1 = [] + parent2 = [] + + num_lines = len(lines) + index = start_index + 1 + if index == num_lines: + return None + while not ( + lines[index].startswith("|||||||") or lines[index].startswith("=======") + ): + parent1.append(lines[index]) + index = index + 1 + if index == num_lines: + print( + "Starting at line " + + str(start_index) + + ", did not find ||||||| or ======= in " + + filename + ) + return None + if lines[index].startswith("|||||||"): + index = index + 1 + if index == num_lines: + print("File ends with |||||||: " + filename) + return None + while not lines[index].startswith("======="): + base.append(lines[index]) + index = index + 1 + if index == num_lines: + print( + "Starting at line " + + str(start_index) + + ", did not find ======= in " + + filename + ) + return None + assert lines[index].startswith("=======") + index = index + 1 # skip over "=======" line + if index == num_lines: + print("File ends with =======: " + filename) + return None + while not lines[index].startswith(">>>>>>>"): + parent2.append(lines[index]) + index = index + 1 + if index == num_lines: + print( + "Starting at line " + + str(start_index) + + ", did not find >>>>>>> in " + + filename + ) + return None + index = index + 1 + + return (base, parent1, parent2, index - start_index) + + +def merge( + base: List[str], + parent1: List[str], + parent2: List[str], + java_imports: bool, + adjacent_lines: bool, +) -> Union[List[str], None]: + """Given text for the base and two parents, return merged text. + + Args: + base: a list of lines + parent1: a list of lines + parent2: a list of lines + + Returns: + a list of lines, or None if it cannot do merging. + """ + + print(base, parent1, parent2) + + if java_imports: + if ( + all_import_lines(base) + and all_import_lines(parent1) + and all_import_lines(parent2) + ): + # A simplistic merge that retains all import lines in either parent. + return list(set(parent1 + parent2)).sort() + + if adjacent_lines: + adjacent_line_merge = merge_edits_on_different_lines(base, parent1, parent2) + if adjacent_line_merge is not None: + return adjacent_line_merge + + return None + + +def all_import_lines(lines: List[str]) -> bool: + """Return true if every given line is a Java import line or is blank.""" + return all(line.startswith("import ") or line.strip() == "" for line in lines) + + +def merge_edits_on_different_lines( + base, parent1: List[str], parent2: List[str] +) -> Union[List[str], None]: + """Return a merged version, if at most parent1 or parent2 edits each line. + Otherwise, return None. + """ + + print("Entered merge_edits_on_different_lines") + + ### No lines are added or removed, only modified. + base_len = len(base) + result = None + if base_len == len(parent1) and base_len == len(parent2): + result = [] + for base_line, parent1_line, parent2_line in itertools.zip_longest( + base, parent1, parent2 + ): + print("Considering line:", base_line, parent1_line, parent2_line) + if parent1_line == parent2_line: + result.append(parent1_line) + elif base_line == parent1_line: + result.append(parent2_line) + elif base_line == parent2_line: + result.append(parent1_line) + else: + result = None + break + print("merge_edits_on_different_lines =>", result) + if result is not None: + print("merge_edits_on_different_lines =>", result) + return result + + ### Deletions at the beginning or end. 
+    if base_len != 0:
+        result = merge_base_is_prefix_or_suffix(base, parent1, parent2)
+        if result is None:
+            result = merge_base_is_prefix_or_suffix(base, parent2, parent1)
+        if result is not None:
+            return result
+
+    ### Interleaved deletions, with an empty merge outcome.
+    if base_len != 0:
+        if issubsequence(parent1, base) and issubsequence(parent2, base):
+            return []
+
+    print("merge_edits_on_different_lines =>", result)
+    return result
+
+
+def merge_base_is_prefix_or_suffix(
+    base: List[str], parent1: List[str], parent2: List[str]
+) -> Union[List[str], None]:
+    """Handles the special cases in which the base is a prefix or suffix of parent1.
+    That is, parent1 is pure additions at the beginning or end of base.  Parent2
+    deleted all the lines, possibly replacing them by something else.  (We know
+    this because there is no common line in base and parent2.  If there were, it
+    would also be in parent1, and the hunk would have been split into two at the
+    common line that's in all three texts.)
+    We know the relative position of the additions in parent1.
+    """
+    base_len = len(base)
+    parent1_len = len(parent1)
+    parent2_len = len(parent2)
+    if base_len < parent1_len:
+        if parent1[:base_len] == base:
+            print("startswith", parent1, base)
+            return parent2 + parent1[base_len:]
+        if parent1[-base_len:] == base:
+            print("endswith", parent1, base)
+            return parent1[:-base_len] + parent2
+    return None
+
+
+def issubsequence(s1: List[str], s2: List[str]) -> bool:
+    """Returns true if s1 is a subsequence of s2."""
+
+    # Iterative implementation.
+
+    n, m = len(s1), len(s2)
+    i, j = 0, 0
+    while i < n and j < m:
+        if s1[i] == s2[j]:
+            i += 1
+        j += 1
+
+    # If i reaches the end of s1, we found all the elements of s1, in order,
+    # in s2, so s1 is a subsequence of s2.
+    return i == n
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/scripts/merge_tools/resolve-import-conflicts b/src/scripts/merge_tools/resolve-import-conflicts
index aa78cd5231..73db46473d 100755
--- a/src/scripts/merge_tools/resolve-import-conflicts
+++ b/src/scripts/merge_tools/resolve-import-conflicts
@@ -1,12 +1,24 @@
 #!/bin/bash
+# bash, not POSIX sh, because of "readarray".

-# This script edits files to remove conflict markers related to Java imports.
-# It works on all files given on the command line;
-# if none are given, it works on all files in or under the current directory.
+# This script edits files in place to remove conflict markers related to Java
+# imports.  For a given conflict that involves only `import` statements and
+# blank lines, the output includes every `import` statement that is in either
+# of the parents.  This script leaves other conflicts untouched.
+
+# Usage:
+#   resolve-import-conflicts [file ...]
+#
+# The script works on all files given on the command line.
+# If none are given, the script works on all files in or under the current directory.
+#
+# The exit status code is 0 (success) if all conflicts are resolved in all the files.
+# The exit status code is 1 (failure) if any conflict remains.

 # This script is not a git mergetool.  A git mergetool is given the base, parent 1, and
 # parent 2 files, all without conflict markers.
-# However, this script can be run instead of a git mergetool, or after a git mergetool.
+# However, this can be run after a git mergetool that leaves conflict markers
+# in files, as the default git mergetool does.

 if [ "$#" -eq 0 ] ; then
   readarray -t files < <(grep -l -r '^<<<<<<< HEAD' .)
@@ -19,7 +31,7 @@ SCRIPTDIR="$(cd "$(dirname "$0")" && pwd -P)"

 status=0
 for file in "${files[@]}" ; do
-  if !
"${SCRIPTDIR}"/resolve-import-conflicts-in-file.py --file "$file" ; then + if ! "${SCRIPTDIR}"/resolve-conflicts.py --java_imports "$file" ; then status=1 fi done diff --git a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py b/src/scripts/merge_tools/resolve-import-conflicts-in-file.py deleted file mode 100755 index adef39b067..0000000000 --- a/src/scripts/merge_tools/resolve-import-conflicts-in-file.py +++ /dev/null @@ -1,148 +0,0 @@ -#! /usr/bin/env python3 -# TODO: Benedikt, could you review and improve this file? I'm sure it is not perfect Python style. - -"""Edits a file in place to remove conflict markers related to Java imports. -It simplistically leaves all the imports that appear in either parent. -""" - - -# TODO: merge both scripts into one. - -import shutil -import argparse -from pathlib import Path -import tempfile -from typing import List, Union, Tuple - - -def all_import_lines(lines: List[str]) -> bool: - """Return true if every line is a Java import line.""" - return all(line.startswith("import ") for line in lines) - - -def merge(base, parent1: List[str], parent2: List[str]) -> Union[List[str], None]: - """Given text for the base and two parents, return merged text. - - This currently does a simplistic merge that retains all lines in either parent. - - Args: - base: a list of lines - parent1: a list of lines - parent2: a list of lines - - Returns: - Union[List[str], None]: If the merge is successful, returns a list of lines. - """ - - if ( - all_import_lines(base) - and all_import_lines(parent1) - and all_import_lines(parent2) - ): - return parent1 + parent2 - - return None - - -def looking_at_conflict( # pylint: disable=too-many-return-statements - file: Path, start_index: int, lines: List[str] -) -> Union[None, Tuple[List[str], List[str], List[str], int]]: - """Tests whether the text starting at line `start_index` is the beginning of a conflict. - If not, returns None. - If so, returns a 4-tuple of (base, parent1, parent2, num_lines_in_conflict) - where the first 3 elements of the tuple are lists of lines. - Args: - start_index: an index into `lines`. - lines: all the lines of the file with name `file`. 
- """ - - if not lines[start_index].startswith("<<<<<<<"): - return None - - base = [] - parent1 = [] - parent2 = [] - - num_lines = len(lines) - index = start_index + 1 - if index == num_lines: - return None - while not ( - lines[index].startswith("|||||||") or lines[index].startswith("=======") - ): - parent1.append(lines[index]) - index = index + 1 - if index == num_lines: - print( - "Starting at line " - + str(start_index) - + ", did not find ||||||| or ======= in " - + str(file) - ) - return None - if lines[index].startswith("|||||||"): - index = index + 1 - if index == num_lines: - print("File ends with |||||||: " + str(file)) - return None - while not lines[index].startswith("======="): - base.append(lines[index]) - index = index + 1 - if index == num_lines: - print( - "Starting at line " - + str(start_index) - + ", did not find ======= in " - + str(file) - ) - return None - assert lines[index].startswith("=======") - index = index + 1 # skip over "=======" line - if index == num_lines: - print("File ends with =======: " + str(file)) - return None - while not lines[index].startswith(">>>>>>>"): - parent2.append(lines[index]) - index = index + 1 - if index == num_lines: - print( - "Starting at line " - + str(start_index) - + ", did not find >>>>>>> in " - + str(file) - ) - return None - index = index + 1 - - return (base, parent1, parent2, index - start_index) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--file", type=Path) - args = parser.parse_args() - - with open(args.file) as file: - lines = file.readlines() - - with tempfile.NamedTemporaryFile(mode="w", delete=True) as tmp: - file_len = len(lines) - i = 0 - while i < file_len: - conflict = looking_at_conflict(Path(args.file), i, lines) - if conflict is None: - tmp.write(lines[i]) - i = i + 1 - else: - (base, parent1, parent2, num_lines) = conflict - merged = merge(base, parent1, parent2) - if merged is None: - tmp.write(lines[i]) - i = i + 1 - else: - for line in merged: - tmp.write(line) - i = i + num_lines - - # Copying the file back to the original location - shutil.copyfile(tmp.name, args.file)