def _checkout_as_branch(repo, sha: str, side: str, branch: str) -> None:
    """Check out ``sha`` in ``repo`` and create ``branch`` at that commit."""
    repo.git.checkout(sha, force=True)
    print("Checking out " + side + sha)
    repo.submodule_update()
    repo.git.checkout("-b", branch, force=True)


def _diff_against_programmer_merge(
    base_file: str, attempt_file: str, programmer_file: str
) -> str:
    """Return the diff3 output for one file, or plain diff output if the base lacks it."""
    if os.path.isfile(base_file):
        command = ["diff3", base_file, attempt_file, programmer_file]
    else:
        # The file was added in both parents, so there is no base version to
        # compare against. Checking the path directly (instead of searching
        # diff3's stderr for an English error message) keeps the fallback
        # working regardless of the user's locale.
        command = ["diff", attempt_file, programmer_file]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return completed.stdout


def diff3_analysis(merge_tool: str, results_index: int, repo_output_dir: str) -> None:
    """
    Re-runs one merge with a merge tool and saves diff3 reports for each conflicting file.

    The row at position ``results_index`` of ``../../results/combined/result.csv``
    supplies the repository and the ``left``, ``right`` and ``merge`` commits.
    The repository is cloned three times under ``./repos`` (merge attempt, merge
    base, programmer merge), the merge tool script is run on the first clone, and
    every file the script reports as a merge conflict is compared with ``diff3``
    (base, merge attempt, programmer merge).  When the file does not exist in the
    base checkout, a two-way ``diff`` of merge attempt vs. programmer merge is
    written instead.

    All paths are relative to the current working directory, and ``./repos`` is
    deleted at the start of every call.

    Args:
        merge_tool (str): Name of the merge tool; ``../scripts/merge_tools/<merge_tool>.sh``
            is executed.
        results_index (int): Positional row index into the results CSV.
        repo_output_dir (str): Directory under which
            ``<results_index>/diff_<file stem>.txt`` files are written.

    Returns:
        None

    Raises:
        subprocess.CalledProcessError: If ``git merge-base`` fails for the two branches.
    """

    # Deletes base, programmer_merge, and merge_attempt folders in repos dir
    # We do this to prevent errors if cloning the same repo into the folder twice
    shutil.rmtree("./repos", ignore_errors=True)

    # Look the row up once; every commit hash below comes from it.
    df = pd.read_csv("../../results/combined/result.csv")
    row = df.iloc[results_index]
    repo_name = row["repository"]

    script = "../scripts/merge_tools/" + merge_tool + ".sh"
    repo = clone_repo(
        repo_name, "./repos/merge_attempt1"
    )  # Return a Git-Python repo object
    repo.remote().fetch()
    _checkout_as_branch(repo, row["left"], "left", "TEMP_LEFT_BRANCH")
    _checkout_as_branch(repo, row["right"], "right", "TEMP_RIGHT_BRANCH")

    # check=True: fail here with a clear error instead of later trying to
    # check out an empty string when no merge base can be determined.
    merge_base = subprocess.run(
        [
            "git",
            "merge-base",
            "TEMP_LEFT_BRANCH",
            "TEMP_RIGHT_BRANCH",
        ],
        cwd="./repos/merge_attempt1/" + repo_name,
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    print("Found base sha" + merge_base.stdout)
    base_sha = merge_base.stdout.strip()

    repo2 = clone_repo(repo_name, "./repos/base")  # Return a Git-Python repo object
    repo2.remote().fetch()
    repo2.git.checkout(base_sha, force=True)
    repo2.submodule_update()

    # Run the merge tool on the first clone; conflicts are parsed from its stdout.
    result = subprocess.run(
        [
            script,
            repo.git.rev_parse("--show-toplevel"),
            "TEMP_LEFT_BRANCH",
            "TEMP_RIGHT_BRANCH",
        ],
        stdout=subprocess.PIPE,
        text=True,
    )

    conflict_file_matches = re.findall(
        r"CONFLICT \(.+\): Merge conflict in (.+)", result.stdout
    )
    print(result.stdout)

    repo3 = clone_repo(
        repo_name, "./repos/programmer_merge"
    )  # Return a Git-Python repo object
    repo3.git.checkout(row["merge"], force=True)
    repo3.submodule_update()

    print(conflict_file_matches)

    for conflict_file_match in conflict_file_matches:
        conflicting_file = str(conflict_file_match)
        conflict_path = os.path.join(repo_name, conflicting_file)

        diff_output = _diff_against_programmer_merge(
            os.path.join("./repos/base", conflict_path),
            os.path.join("./repos/merge_attempt1", conflict_path),
            os.path.join("./repos/programmer_merge", conflict_path),
        )

        # Drop the directory and extension so the name is safe to embed.
        # NOTE(review): conflicting files that share a base name (e.g. in
        # different directories) map to the same output file and overwrite
        # each other.
        conflicting_file_base, _ = os.path.splitext(os.path.basename(conflicting_file))

        # Generate a filename for the diff result, including the new subdirectory
        diff_filename = os.path.join(
            repo_output_dir, str(results_index), f"diff_{conflicting_file_base}.txt"
        )

        # Ensure the output directory exists
        os.makedirs(os.path.dirname(diff_filename), exist_ok=True)

        # Write the diff results to the file
        with open(diff_filename, "w", encoding="utf-8") as diff_file:
            diff_file.write(diff_output)

        print(f"Diff results saved to {diff_filename}")
def main(merge_tool: str, results_index: int, repo_output_dir: str):
    """
    Command-line entry point: hands the already-parsed arguments to ``diff3_analysis``.

    Args:
        merge_tool (str): The merge tool to be used.
        results_index (int): The index of the repository in the results DataFrame.
        repo_output_dir (str): Where the analysis results are stored.
    """
    diff3_analysis(
        merge_tool=merge_tool,
        results_index=results_index,
        repo_output_dir=repo_output_dir,
    )
if __name__ == "__main__":
    # Command-line interface: three required positional arguments, in this order.
    cli = argparse.ArgumentParser(
        description="Analyze merge conflicts using the diff3 tool."
    )
    positional_arguments = (
        ("merge_tool", str, "The merge tool to be used."),
        (
            "results_index",
            int,
            "The index of the repository in the results DataFrame.",
        ),
        (
            "repo_output_dir",
            str,
            "The path of where we want to store the results from the analysis.",
        ),
    )
    for arg_name, arg_type, arg_help in positional_arguments:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)

    options = cli.parse_args()

    # Create the output directory up front so later writes cannot fail on it.
    os.makedirs(options.repo_output_dir, exist_ok=True)

    # Hand over to the entry point with the parsed values.
    main(options.merge_tool, options.results_index, options.repo_output_dir)
def _setup_merge_attempt(repo_name: str, clone_root: str, left_sha: str, right_sha: str):
    """Clone ``repo_name`` under ``clone_root`` and create the two temporary merge branches."""
    repo = clone_repo_to_path(repo_name, clone_root)  # Return a Git-Python repo object
    repo.remote().fetch()
    repo.git.checkout(left_sha, force=True)
    print("Checking out left" + left_sha)
    repo.submodule_update()
    repo.git.checkout("-b", "TEMP_LEFT_BRANCH", force=True)
    repo.git.checkout(right_sha, force=True)
    print("Checking out right" + right_sha)
    repo.submodule_update()
    repo.git.checkout("-b", "TEMP_RIGHT_BRANCH", force=True)
    return repo


def _run_merge_script(merge_tool: str, repo) -> str:
    """Run the merge tool's shell script on ``repo`` and return (and print) its stdout."""
    script = "../scripts/merge_tools/" + merge_tool + ".sh"
    completed = subprocess.run(
        [
            script,
            repo.git.rev_parse("--show-toplevel"),
            "TEMP_LEFT_BRANCH",
            "TEMP_RIGHT_BRANCH",
        ],
        stdout=subprocess.PIPE,
        text=True,
    )
    print(completed.stdout)
    return completed.stdout


def _save_conflict_diff(
    base_file: str, attempt_file: str, programmer_file: str, diff_filename: str
) -> None:
    """Write the diff3 (or, without a base file, diff) output for one file to ``diff_filename``."""
    if os.path.isfile(base_file):
        command = ["diff3", base_file, attempt_file, programmer_file]
    else:
        # The file was added in both parents, so there is no base version to
        # compare against. Checking the path directly (instead of searching
        # diff3's stderr for an English error message) keeps the fallback
        # working regardless of the user's locale.
        command = ["diff", attempt_file, programmer_file]
    diff_results = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    # Ensure the output directory exists
    os.makedirs(os.path.dirname(diff_filename), exist_ok=True)

    # Write the diff results to the file
    with open(diff_filename, "w", encoding="utf-8") as diff_file:
        diff_file.write(diff_results.stdout)

    print(f"Diff results saved to {diff_filename}")


def diff3_pair_analysis(
    merge_tool1: str, merge_tool2: str, results_index: int, repo_output_dir
):
    """
    Re-runs one merge with two merge tools and saves diff3 reports for each of them.

    The row at position ``results_index`` of ``../../results/combined/result.csv``
    supplies the repository and the ``left``, ``right`` and ``merge`` commits.
    Each tool's script is run on its own clone (``./repos/merge_attempt1`` and
    ``./repos/merge_attempt2``).  For every file that ``merge_tool1`` reports as
    a merge conflict, both merge attempts are compared with ``diff3`` against
    the merge base and the programmer merge; when the file is missing from the
    base checkout a two-way ``diff`` against the programmer merge is used.

    Only the conflicts reported by ``merge_tool1`` select which files are
    compared; if it reports none, the function returns without running
    ``merge_tool2``.

    All paths are relative to the current working directory, and ``./repos`` is
    deleted at the start of every call.

    Args:
        merge_tool1 (str): First merge tool; ``../scripts/merge_tools/<merge_tool1>.sh``
            is executed.
        merge_tool2 (str): Second merge tool, executed the same way on a separate clone.
        results_index (int): Positional row index into the results CSV.
        repo_output_dir (path): Directory under which
            ``<results_index>/<merge_tool>/diff_<file stem>.txt`` files are written.

    Returns:
        None

    Raises:
        subprocess.CalledProcessError: If ``git merge-base`` fails for the two branches.
    """

    # Deletes base, programmer_merge, and merge_attempt folders in repos dir
    # We do this to prevent errors if cloning the same repo into the folder twice
    shutil.rmtree("./repos", ignore_errors=True)

    # Look the row up once; every commit hash below comes from it.
    df = pd.read_csv("../../results/combined/result.csv")
    row = df.iloc[results_index]
    repo_name = row["repository"]
    left_sha = row["left"]
    right_sha = row["right"]

    repo = _setup_merge_attempt(
        repo_name, "./repos/merge_attempt1", left_sha, right_sha
    )

    # check=True: fail here with a clear error instead of later trying to
    # check out an empty string when no merge base can be determined.
    merge_base = subprocess.run(
        [
            "git",
            "merge-base",
            "TEMP_LEFT_BRANCH",
            "TEMP_RIGHT_BRANCH",
        ],
        cwd="./repos/merge_attempt1/" + repo_name,
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    )
    print("Found base sha" + merge_base.stdout)
    base_sha = merge_base.stdout.strip()

    repo2 = clone_repo_to_path(
        repo_name, "./repos/base"
    )  # Return a Git-Python repo object
    repo2.remote().fetch()
    repo2.git.checkout(base_sha, force=True)
    repo2.submodule_update()

    conflict_file_matches = re.findall(
        r"CONFLICT \(.+\): Merge conflict in (.+)",
        _run_merge_script(merge_tool1, repo),
    )

    if not conflict_file_matches:
        print("No conflict files to search")
        return

    repo3 = clone_repo_to_path(
        repo_name, "./repos/programmer_merge"
    )  # Return a Git-Python repo object
    repo3.git.checkout(row["merge"], force=True)
    repo3.submodule_update()

    print(conflict_file_matches)

    repo4 = _setup_merge_attempt(
        repo_name, "./repos/merge_attempt2", left_sha, right_sha
    )
    # Actually perform the second tool's merge. Without this step the
    # merge_attempt2 checkout only contains the right branch, and the diffs
    # written for merge_tool2 would not reflect its merge result at all.
    _run_merge_script(merge_tool2, repo4)

    attempts = (
        (merge_tool1, "./repos/merge_attempt1"),
        (merge_tool2, "./repos/merge_attempt2"),
    )

    for conflict_file_match in conflict_file_matches:
        conflicting_file = str(conflict_file_match)
        conflict_path = os.path.join(repo_name, conflicting_file)
        conflict_path_base = os.path.join("./repos/base", conflict_path)
        conflict_path_programmer_merge = os.path.join(
            "./repos/programmer_merge", conflict_path
        )

        # Drop the directory and extension so the name is safe to embed.
        # NOTE(review): conflicting files that share a base name map to the
        # same output file and overwrite each other.
        conflicting_file_base, _ = os.path.splitext(os.path.basename(conflicting_file))

        for merge_tool, attempt_root in attempts:
            _save_conflict_diff(
                conflict_path_base,
                os.path.join(attempt_root, conflict_path),
                conflict_path_programmer_merge,
                os.path.join(
                    repo_output_dir,
                    str(results_index),
                    merge_tool,
                    f"diff_{conflicting_file_base}.txt",
                ),
            )
+ + + + +## Usage + + + + +### Analyzing a Single Merge Conflict + + + + +To analyze merge conflicts using a specific merge tool for a single commit: + + +python3 diff3_analysis.py MERGE_TOOL RESULTS_INDEX REPO_OUTPUT_DIR + + + + +Ex: + + +python3 diff3_analysis.py gitmerge_ort 582 ./merge_conflict_analysis_diffs/582/gitmerge_ort + + + + +MERGE_TOOL: The merge tool to use for the analysis (e.g., gitmerge_ort). +RESULTS_INDEX: The index of the commit in the dataset. +REPO_OUTPUT_DIR: The directory where the analysis results will be saved. + + + + +### Running Bulk Analysis +To run the analysis over multiple commits and all merge tools: + + +python3 run_diff3_analysis.py --results_index RESULTS_INDEX_LIST --repo_output_dir "REPO_OUTPUT_DIR" + + + + +Ex: + + +python3 run_diff3_analysis.py --results_index 582,427,930 --repo_output_dir "./merge_conflict_analysis_diffs" + + +RESULTS_INDEX_LIST: Comma-separated list of commit indices to analyze. Example: 582,427,930. +REPO_OUTPUT_DIR: The directory where the bulk analysis results will be saved. + + + diff --git a/src/python/repo.py b/src/python/repo.py index 8d6f3d2868..d2d47c52ac 100755 --- a/src/python/repo.py +++ b/src/python/repo.py @@ -93,6 +93,34 @@ def clone_repo(repo_slug: str, repo_dir: Path) -> git.repo.Repo: ) from None +def clone_repo_to_path(repo_slug: str, path: str) -> git.repo.Repo: + """Clones a repository, or runs `git fetch` if the repository is already cloned. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + """ + repo_dir = Path(path) / Path(repo_slug) + if repo_dir.exists(): + repo = git.repo.Repo(repo_dir) + else: + repo_dir.parent.mkdir(parents=True, exist_ok=True) + os.environ["GIT_TERMINAL_PROMPT"] = "0" + print(repo_slug, " : Cloning repo") + # ":@" in URL ensures that we are not prompted for login details + # for the repos that are now private. 
def clone_repo_to_path(repo_slug: str, path: str) -> git.repo.Repo:
    """Opens the repository at ``<path>/<repo_slug>``, cloning it from GitHub first if absent.

    If the directory already exists it is opened as-is; no fetch is done in
    that case, so callers that need up-to-date refs must fetch themselves.
    A fresh clone additionally fetches the remote, fetches all pull-request
    heads into ``refs/remotes/origin/pull/*`` and updates submodules.

    Args:
        repo_slug (str): The slug of the repository, which is "owner/reponame".
        path (str): Directory under which the repository is placed.
    Returns:
        git.repo.Repo: The Git-Python repository object.
    Raises:
        Exception: Whatever cloning, fetching or the submodule update raised;
            it is printed and then re-raised unchanged.
    """
    repo_dir = Path(path) / repo_slug
    if repo_dir.exists():
        return git.repo.Repo(repo_dir)

    repo_dir.parent.mkdir(parents=True, exist_ok=True)
    # Set process-wide so git never waits on an interactive credential prompt.
    os.environ["GIT_TERMINAL_PROMPT"] = "0"
    print(repo_slug, " : Cloning repo")
    # ":@" in URL ensures that we are not prompted for login details
    # for the repos that are now private.
    github_url = "https://:@github.com/" + repo_slug + ".git"
    try:
        repo = git.repo.Repo.clone_from(github_url, repo_dir)
        # Report completion only once the clone has really finished.
        print(repo_slug, " : Finished cloning")
        repo.remote().fetch()
        repo.remote().fetch("refs/pull/*/head:refs/remotes/origin/pull/*")
        repo.submodule_update()
    except Exception as e:
        # NOTE(review): a clone that fails part-way may leave repo_dir behind,
        # which the next call would treat as an existing repository - confirm
        # whether cleanup is needed.
        print(repo_slug, "Exception during cloning:\n", e)
        raise
    return repo