diff --git a/src/python/README.md b/src/python/README.md new file mode 100644 index 0000000000..8f4cbda30d --- /dev/null +++ b/src/python/README.md @@ -0,0 +1,55 @@ +# Python Scripts for Merge Conflict Analysis + + + + +This directory contains Python scripts designed to facilitate the analysis of merge conflicts using various merge tools. The scripts allow users to recreate merges, analyze conflicts, and compare the results of different merge algorithms against the base and the programmer merge. + + + + +## Scripts Overview + + + + +- `diff3_analysis.py`: This script analyzes a given merge conflict for two merge tools. + +- Performs a 3-way diff between the base, the merge attempt (which contains the conflict), and the programmer merge. +- Also, it automatically writes the differences (as given by diff3) for each of the two merge algorithms to a .txt file. +- In the diff output, 1 represents the base, 2 represents the conflicting file (the merge tool's attempt), and 3 represents the programmer's merge. + + + + +## Prerequisites + + + + +- Necessary Python packages installed inside a conda or mamba environment (`pandas`, `GitPython`): +pip install pandas +pip install GitPython + + + +## Usage + + + + +### Analyzing a Single Merge Conflict + + + + +To analyze a conflict by comparing two merge tools, run the following from inside `src/python`: + + +`python3 diff3_analysis.py <merge_tool1> <merge_tool2> <idx> <repo_output_dir>` + + + +Example: + +python3 diff3_analysis.py "gitmerge_ort" "spork" 11034-72 "./mixed_results_spork" diff --git a/src/python/diff3_analysis.py b/src/python/diff3_analysis.py new file mode 100644 index 0000000000..c53df5f75f --- /dev/null +++ b/src/python/diff3_analysis.py @@ -0,0 +1,270 @@ +""" +Recreates a merge and outputs the diff files for two algorithms for comparison on a given conflict. +Displays base, conflicting branches, and programmer merge. +See src/python/README.md for details on usage. 
+"""
+
+import sys
+import argparse
+import subprocess
+import re
+import os
+import shutil
+import tempfile
+import pandas as pd
+from repo import clone_repo_to_path
+from merge_tester import MERGE_STATE
+
+
+# pylint: disable-msg=too-many-locals
+# pylint: disable-msg=too-many-arguments
+# pylint: disable-msg=too-many-statements
+
+# All relative paths assume the script is run from inside src/python.
+REPOS_DIR = "./repos"
+RESULTS_CSV = "../../results/combined/result.csv"
+MERGE_SCRIPTS_DIR = "../scripts/merge_tools"
+LEFT_BRANCH = "TEMP_LEFT_BRANCH"
+RIGHT_BRANCH = "TEMP_RIGHT_BRANCH"
+
+
+def setup_environment():
+    """Delete the ./repos working directory so that every run starts from fresh clones."""
+    shutil.rmtree(REPOS_DIR, ignore_errors=True)
+
+
+def clone_and_checkout(repo_name, branch_sha, clone_dir):
+    """
+    Clone a repository to a specified path and checkout a given SHA.
+
+    Args:
+        repo_name (str): The repository to clone.
+        branch_sha (str): The SHA commit or branch to checkout.
+        clone_dir (str): Directory path to clone the repository.
+
+    Returns:
+        repo (git.repo.Repo): The cloned repository object.
+    """
+    repo = clone_repo_to_path(repo_name, clone_dir)
+    repo.remote().fetch()
+    repo.git.checkout(branch_sha, force=True)
+    repo.submodule_update()
+    return repo
+
+
+def process_diff(
+    tool_name,
+    base_path,
+    attempt_path,
+    merge_path,
+    output_dir,
+    idx: str,
+    filename,
+):
+    """
+    Diff a merge attempt against the base and the programmer merge and save the output.
+
+    Runs ``diff3 base attempt merge``. If any of the three files does not exist,
+    falls back to the two-way ``diff attempt merge``. The output is written to
+    ``<output_dir>/<idx>/<tool_name>/diff_<filename>.txt``.
+
+    Args:
+        tool_name (str): Identifier for the merge tool.
+        base_path (str): Path to the base file.
+        attempt_path (str): Path to the merge attempt file.
+        merge_path (str): Path to the manually merged file.
+        output_dir (str): Directory where results will be saved.
+        idx (str): Invariant repo-idx - merge-idx of the commit.
+        filename (str): Base name for the output file.
+    """
+    # Check for missing files directly instead of parsing diff3's error message,
+    # whose wording depends on the locale.
+    if all(os.path.isfile(path) for path in (base_path, attempt_path, merge_path)):
+        command = ["diff3", base_path, attempt_path, merge_path]
+    else:
+        command = ["diff", attempt_path, merge_path]
+
+    # diff and diff3 exit with status 1 when the files differ, so check=True
+    # must not be used. Undecodable bytes are replaced rather than raising.
+    diff_results = subprocess.run(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        encoding="utf-8",
+        errors="replace",
+        check=False,
+    )
+    if diff_results.returncode > 1:
+        # Status 2 means trouble (e.g. a missing file); surface the reason.
+        print(diff_results.stderr, file=sys.stderr)
+
+    # Prepare the output filename
+    diff_filename = os.path.join(
+        output_dir, str(idx), tool_name, f"diff_{filename}.txt"
+    )
+    os.makedirs(
+        os.path.dirname(diff_filename), exist_ok=True
+    )  # Ensure the directory exists
+
+    # Write the diff results to the file
+    with open(diff_filename, "w", encoding="utf-8") as diff_file:
+        diff_file.write(diff_results.stdout)
+    print(f"Diff results saved to {diff_filename}")
+
+
+def _prepare_merge_attempt(repo_name, left_sha, right_sha, clone_dir):
+    """Clone repo_name into clone_dir and create the temporary left and right branches."""
+    repo = clone_repo_to_path(repo_name, clone_dir)
+    repo.remote().fetch()
+    repo.git.checkout(left_sha, force=True)
+    repo.submodule_update()
+    repo.git.checkout("-b", LEFT_BRANCH, force=True)
+    repo.git.checkout(right_sha, force=True)
+    repo.submodule_update()
+    repo.git.checkout("-b", RIGHT_BRANCH, force=True)
+    print(f"Checked out left {left_sha} and right {right_sha} in {clone_dir}")
+    return repo
+
+
+def _run_merge_tool(merge_tool, repo):
+    """Run merge_tool's script on the temporary branches of repo and return its stdout."""
+    script = os.path.join(MERGE_SCRIPTS_DIR, merge_tool + ".sh")
+    # No check=True: the outcome is read from the script's output below.
+    result = subprocess.run(
+        [script, repo.git.rev_parse("--show-toplevel"), LEFT_BRANCH, RIGHT_BRANCH],
+        stdout=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+    print(result.stdout)
+    return result.stdout
+
+
+def diff3_analysis(merge_tool1: str, merge_tool2: str, idx: str, repo_output_dir):
+    """
+    Recreates a merge with two merge tools and saves, for every conflicting file,
+    the diff3 output of each tool's result against the base and the programmer merge.
+
+    The conflicting files are taken from the output of merge_tool1; if it reports
+    no conflict, nothing is written.
+
+    Args:
+        merge_tool1 (str): The merge tool that Merge_failed (tool name as written in spreadsheet)
+        merge_tool2 (str): The merge tool that Failed_tests or Passed_tests
+        idx (str): Invariant 'repo-idx - merge-idx' identifier from the results
+            spreadsheet, used to identify commits across different results.
+        repo_output_dir (path): The path of where we want to store the results from the analysis
+
+    Returns:
+        None
+
+    Raises:
+        IndexError: If no row of the results spreadsheet has the given idx.
+    """
+
+    # Deletes base, programmer_merge, and merge_attempt folders in repos dir
+    # We do this to prevent errors if cloning the same repo into the folder twice
+    setup_environment()
+
+    # Look up the merge (repository, left, right and merge commits) by its idx
+    df = pd.read_csv(RESULTS_CSV)
+    matches = df[df["idx"] == idx]
+    if matches.empty:
+        raise IndexError(f"No merge with idx {idx!r} found in {RESULTS_CSV}")
+    print(matches.index[0])
+    merge_row = matches.iloc[0]
+    repo_name = merge_row["repository"]
+    left_sha = merge_row["left"]
+    right_sha = merge_row["right"]
+
+    attempt1_dir = os.path.join(REPOS_DIR, "merge_attempt1")
+    attempt2_dir = os.path.join(REPOS_DIR, "merge_attempt2")
+    base_dir = os.path.join(REPOS_DIR, "base")
+    programmer_merge_dir = os.path.join(REPOS_DIR, "programmer_merge")
+
+    repo = _prepare_merge_attempt(repo_name, left_sha, right_sha, attempt1_dir)
+
+    # Clone the base
+    base_sha = repo.git.merge_base(LEFT_BRANCH, RIGHT_BRANCH).strip()
+    print("Found base sha " + base_sha)
+    clone_and_checkout(repo_name, base_sha, base_dir)
+
+    # Recreate the merge with the first tool
+    merge_output = _run_merge_tool(merge_tool1, repo)
+    conflict_file_matches = re.findall(
+        r"CONFLICT \(.+\): Merge conflict in (.+)", merge_output
+    )
+    if not conflict_file_matches:
+        print("No conflict files to search")
+        return
+    print(conflict_file_matches)
+
+    clone_and_checkout(repo_name, merge_row["merge"], programmer_merge_dir)
+
+    # Recreate the merge with the second tool. Without running its script,
+    # merge_attempt2 would only contain the checkout of the right branch.
+    repo_attempt2 = _prepare_merge_attempt(
+        repo_name, left_sha, right_sha, attempt2_dir
+    )
+    _run_merge_tool(merge_tool2, repo_attempt2)
+
+    attempts = (
+        (merge_tool1, attempt1_dir),
+        (merge_tool2, attempt2_dir),
+    )
+    for conflict_file_match in conflict_file_matches:
+        conflicting_file = str(conflict_file_match).strip()
+        conflict_path = os.path.join(repo_name, conflicting_file)
+        # NOTE: only the base name is used for the output file, so conflicting
+        # files that share a base name are written to the same output file.
+        conflict_file_base, _ = os.path.splitext(os.path.basename(conflicting_file))
+
+        for tool_name, attempt_dir in attempts:
+            process_diff(
+                tool_name,
+                os.path.join(base_dir, conflict_path),
+                os.path.join(attempt_dir, conflict_path),
+                os.path.join(programmer_merge_dir, conflict_path),
+                repo_output_dir,
+                idx,
+                conflict_file_base,
+            )
+
+
+def main():
+    """
+    Parses arguments and calls diff3_analysis from the CLI
+    """
+    parser = argparse.ArgumentParser(
+        description="Process and compare merge conflicts using two tools."
+    )
+    parser.add_argument("merge_tool1", type=str, help="The first merge tool to use")
+    parser.add_argument("merge_tool2", type=str, help="The second merge tool to use")
+    parser.add_argument(
+        "idx",
+        type=str,
+        help="The 'repo-idx-merge-idx' identifier of the merge in the results spreadsheet",
+    )
+    parser.add_argument(
+        "repo_output_dir", type=str, help="The directory to store the results"
+    )
+
+    args = parser.parse_args()
+
+    diff3_analysis(
+        merge_tool1=args.merge_tool1,
+        merge_tool2=args.merge_tool2,
+        idx=args.idx,
+        repo_output_dir=args.repo_output_dir,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/python/repo.py b/src/python/repo.py
index c1351b0548..d621d18167 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -96,6 +96,53 @@ def clone_repo(repo_slug: str, repo_dir: Path) -> git.repo.Repo:
         ) from None
 
 
+@timeout(10 * 60)
+def clone_repo_to_path(repo_name: str, path: str) -> git.repo.Repo:
+    """Clones a repository to a specified path. Alternative to the clone_repo
+    method since it takes a string path and returns a git repo Repo obj.
+
+    If ``path/repo_name`` already exists it is opened as-is, without cloning
+    or fetching again.
+
+    Args:
+        repo_name (str): The name of the repository in the format "owner/reponame".
+        path (str): The path to the folder where the repository will be cloned.
+
+    Returns:
+        git.repo.Repo: The Git repository object.
+
+    Raises:
+        Exception: If there's an error during cloning or submodule update.
+    """
+    repo_dir = Path(path) / repo_name
+
+    if repo_dir.exists():
+        # Reuse the existing checkout instead of cloning a second time.
+        return git.repo.Repo(repo_dir)
+
+    repo_dir.parent.mkdir(parents=True, exist_ok=True)
+    # Make git fail instead of waiting for credentials on the terminal.
+    os.environ["GIT_TERMINAL_PROMPT"] = "0"
+    # ":@" in URL ensures that we are not prompted for login details
+    # for the repos that are now private.
+    github_url = "https://:@github.com/" + repo_name + ".git"
+    print(repo_name, " : Cloning repo")
+    try:
+        repo = git.repo.Repo.clone_from(github_url, repo_dir)
+        # Only report success once clone_from has actually returned.
+        print(repo_name, " : Finished cloning")
+        # Fetch remote branches and pull-request heads
+        repo.remote().fetch()
+        repo.remote().fetch("refs/pull/*/head:refs/remotes/origin/pull/*")
+        # Update submodules if present
+        repo.submodule_update()
+    except Exception as e:
+        print(repo_name, "Exception during cloning:\n", e)
+        raise
+
+    return repo
+
+
 TEST_STATE = Enum(
     "TEST_STATE",
     [