From 3b56780bcb5ec8562668681dd67940907b41cc33 Mon Sep 17 00:00:00 2001
From: cactusbranch01
Date: Fri, 29 Mar 2024 23:17:30 -0700
Subject: [PATCH] New Paper Fixes Changes added files from extend_dataset

---
 src/python/bulk_diff3_analysis.py | 207 ++++++++++++++++++++++++++++++
 src/python/diff3_analysis.py      | 207 ++++++++++++++++++++++++++++++
 src/python/repo.py                |  43 ++++++
 3 files changed, 457 insertions(+)
 create mode 100644 src/python/bulk_diff3_analysis.py
 create mode 100644 src/python/diff3_analysis.py

diff --git a/src/python/bulk_diff3_analysis.py b/src/python/bulk_diff3_analysis.py
new file mode 100644
index 0000000000..85ff06d133
--- /dev/null
+++ b/src/python/bulk_diff3_analysis.py
@@ -0,0 +1,207 @@
+"""Runs a merge and uses diff3 to compare it to the base and final branch of a given repo.
+See readme for details on shell scripts to run these methods.
+"""
+
+import sys
+import argparse
+import subprocess
+import re
+import os
+import shutil
+import tempfile
+import pandas as pd
+from repo import clone_repo_to_path
+from merge_tester import MERGE_STATE
+
+# pylint: disable-msg=too-many-locals
+
+# Scratch directory that holds the three clones; it is wiped on every run.
+REPOS_DIR = "./repos"
+# Merge results table; the columns read are repository, left, right and merge.
+RESULTS_CSV = "../../results_greatest_hits/result.csv"
+# Captures the file path from "CONFLICT (...): Merge conflict in <path>" lines.
+CONFLICT_PATTERN = re.compile(r"CONFLICT \(.+\): Merge conflict in (.+)")
+
+
+def _run_diff(command):
+    """Runs a diff/diff3 command and returns whatever it wrote to stdout.
+
+    The exit status is ignored because diff and diff3 exit non-zero when the
+    inputs differ. Undecodable bytes are replaced instead of raising.
+    """
+    completed = subprocess.run(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        encoding="utf-8",
+        errors="replace",
+        check=False,
+    )
+    return completed.stdout
+
+
+def _diff_output_name(conflicting_file, used_names):
+    """Returns an output file name for conflicting_file and records it in used_names.
+
+    The name is built from the basename; when that name was already handed out
+    (same basename in another directory) the flattened full path is used instead
+    so that an earlier result is not overwritten.
+    """
+    name = f"diff_{os.path.basename(conflicting_file)}.txt"
+    if name in used_names:
+        flattened = conflicting_file.replace("/", "__").replace(os.sep, "__")
+        name = f"diff_{flattened}.txt"
+    used_names.add(name)
+    return name
+
+
+def diff3_analysis(merge_tool: str, results_index: int, repo_output_dir):
+    """
+    Re-runs a merge with the given tool and saves a diff for every conflicting file.
+
+    Each file the merge script reports as conflicting is compared with diff3 against
+    the merge base and the programmer's merge; the output is written to a text file
+    in repo_output_dir.
+
+    Args:
+        merge_tool (str): The merge tool to be used.
+        results_index (int): The index of the repository in the results DataFrame.
+        repo_output_dir (path): The directory where the diff results are written.
+
+    Returns:
+        None
+
+    Raises:
+        subprocess.CalledProcessError: If `git merge-base` fails.
+    """
+    # Created here too, so that callers other than the command line also work.
+    os.makedirs(repo_output_dir, exist_ok=True)
+
+    # Deletes base, programmer_merge, and merge_attempt folders in repos dir
+    # We do this to prevent errors if cloning the same repo into the folder twice
+    shutil.rmtree(REPOS_DIR, ignore_errors=True)
+
+    # Retrieve left and right branch from hash in repo
+    row = pd.read_csv(RESULTS_CSV).iloc[results_index]
+    repo_name = row["repository"]
+    left_sha = row["left"]
+    right_sha = row["right"]
+
+    merge_attempt_dir = os.path.join(REPOS_DIR, "merge_attempt")
+    base_dir = os.path.join(REPOS_DIR, "base")
+    programmer_merge_dir = os.path.join(REPOS_DIR, "programmer_merge")
+
+    script = "../scripts/merge_tools/" + merge_tool + ".sh"
+    # clone_repo_to_path returns a Git-Python repo object
+    repo = clone_repo_to_path(repo_name, merge_attempt_dir)
+    repo.remote().fetch()
+    repo.git.checkout(left_sha, force=True)
+    print(f"Checking out left: {left_sha}")
+    repo.submodule_update()
+    repo.git.checkout("-b", "TEMP_LEFT_BRANCH", force=True)
+    repo.git.checkout(right_sha, force=True)
+    print(f"Checking out right: {right_sha}")
+    repo.submodule_update()
+    repo.git.checkout("-b", "TEMP_RIGHT_BRANCH", force=True)
+
+    # check=True: stop here instead of failing later on a checkout of an empty sha.
+    base_sha = subprocess.run(
+        ["git", "merge-base", "TEMP_LEFT_BRANCH", "TEMP_RIGHT_BRANCH"],
+        cwd=os.path.join(merge_attempt_dir, repo_name),
+        stdout=subprocess.PIPE,
+        text=True,
+        check=True,
+    ).stdout.strip()
+    print(f"Found base sha: {base_sha}")
+
+    repo2 = clone_repo_to_path(repo_name, base_dir)
+    repo2.remote().fetch()
+    repo2.git.checkout(base_sha, force=True)
+    repo2.submodule_update()
+
+    result = subprocess.run(
+        [
+            script,
+            repo.git.rev_parse("--show-toplevel"),
+            "TEMP_LEFT_BRANCH",
+            "TEMP_RIGHT_BRANCH",
+        ],
+        stdout=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+    print(result.stdout)
+
+    # dict.fromkeys drops repeated paths while keeping the reported order.
+    conflict_files = list(dict.fromkeys(CONFLICT_PATTERN.findall(result.stdout)))
+
+    repo3 = clone_repo_to_path(repo_name, programmer_merge_dir)
+    repo3.git.checkout(row["merge"], force=True)
+    repo3.submodule_update()
+
+    used_names = set()
+    for conflicting_file in conflict_files:
+        conflict_path = os.path.join(repo_name, conflicting_file)
+        conflict_path_merge_attempt = os.path.join(merge_attempt_dir, conflict_path)
+        conflict_path_base = os.path.join(base_dir, conflict_path)
+        conflict_path_programmer_merge = os.path.join(
+            programmer_merge_dir, conflict_path
+        )
+
+        if os.path.isfile(conflict_path_base):
+            diff_output = _run_diff(
+                [
+                    "diff3",
+                    conflict_path_base,
+                    conflict_path_merge_attempt,
+                    conflict_path_programmer_merge,
+                ]
+            )
+        else:
+            # The file does not exist in the base (e.g. it was added in both
+            # parents), so only a two-way diff is possible.
+            diff_output = _run_diff(
+                ["diff", conflict_path_merge_attempt, conflict_path_programmer_merge]
+            )
+
+        diff_filename = os.path.join(
+            repo_output_dir, _diff_output_name(conflicting_file, used_names)
+        )
+        with open(diff_filename, "w", encoding="utf-8") as diff_file:
+            diff_file.write(diff_output)
+        print(f"Diff results saved to {diff_filename}")
+
+
+def main(merge_tool: str, results_index: int, repo_output_dir: str):
+    """
+    Entry point for the script when run from the command line.
+
+    Args:
+        merge_tool (str): The merge tool to be used.
+        results_index (int): The index of the repository in the results DataFrame.
+        repo_output_dir (str): The directory where the diff results are written.
+    """
+    diff3_analysis(merge_tool, results_index, repo_output_dir)
+
+
+if __name__ == "__main__":
+    # Use argparse to parse command line arguments
+    parser = argparse.ArgumentParser(
+        description="Analyze merge conflicts using the diff3 tool."
+    )
+    parser.add_argument("merge_tool", type=str, help="The merge tool to be used.")
+    parser.add_argument(
+        "results_index",
+        type=int,
+        help="The index of the repository in the results DataFrame.",
+    )
+    parser.add_argument(
+        "repo_output_dir",
+        type=str,
+        help="The path of where we want to store the results from the analysis.",
+    )
+
+    args = parser.parse_args()
+
+    # Call main function with parsed arguments
+    main(args.merge_tool, args.results_index, args.repo_output_dir)
diff --git a/src/python/diff3_analysis.py b/src/python/diff3_analysis.py
new file mode 100644
index 0000000000..85ff06d133
--- /dev/null
+++ b/src/python/diff3_analysis.py
@@ -0,0 +1,207 @@
+"""Runs a merge and uses diff3 to compare it to the base and final branch of a given repo.
+See readme for details on shell scripts to run these methods.
+"""
+
+import sys
+import argparse
+import subprocess
+import re
+import os
+import shutil
+import tempfile
+import pandas as pd
+from repo import clone_repo_to_path
+from merge_tester import MERGE_STATE
+
+# pylint: disable-msg=too-many-locals
+
+# Scratch directory that holds the three clones; it is wiped on every run.
+REPOS_DIR = "./repos"
+# Merge results table; the columns read are repository, left, right and merge.
+RESULTS_CSV = "../../results_greatest_hits/result.csv"
+# Captures the file path from "CONFLICT (...): Merge conflict in <path>" lines.
+CONFLICT_PATTERN = re.compile(r"CONFLICT \(.+\): Merge conflict in (.+)")
+
+
+def _run_diff(command):
+    """Runs a diff/diff3 command and returns whatever it wrote to stdout.
+
+    The exit status is ignored because diff and diff3 exit non-zero when the
+    inputs differ. Undecodable bytes are replaced instead of raising.
+    """
+    completed = subprocess.run(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        encoding="utf-8",
+        errors="replace",
+        check=False,
+    )
+    return completed.stdout
+
+
+def _diff_output_name(conflicting_file, used_names):
+    """Returns an output file name for conflicting_file and records it in used_names.
+
+    The name is built from the basename; when that name was already handed out
+    (same basename in another directory) the flattened full path is used instead
+    so that an earlier result is not overwritten.
+    """
+    name = f"diff_{os.path.basename(conflicting_file)}.txt"
+    if name in used_names:
+        flattened = conflicting_file.replace("/", "__").replace(os.sep, "__")
+        name = f"diff_{flattened}.txt"
+    used_names.add(name)
+    return name
+
+
+def diff3_analysis(merge_tool: str, results_index: int, repo_output_dir):
+    """
+    Re-runs a merge with the given tool and saves a diff for every conflicting file.
+
+    Each file the merge script reports as conflicting is compared with diff3 against
+    the merge base and the programmer's merge; the output is written to a text file
+    in repo_output_dir.
+
+    Args:
+        merge_tool (str): The merge tool to be used.
+        results_index (int): The index of the repository in the results DataFrame.
+        repo_output_dir (path): The directory where the diff results are written.
+
+    Returns:
+        None
+
+    Raises:
+        subprocess.CalledProcessError: If `git merge-base` fails.
+    """
+    # Created here too, so that callers other than the command line also work.
+    os.makedirs(repo_output_dir, exist_ok=True)
+
+    # Deletes base, programmer_merge, and merge_attempt folders in repos dir
+    # We do this to prevent errors if cloning the same repo into the folder twice
+    shutil.rmtree(REPOS_DIR, ignore_errors=True)
+
+    # Retrieve left and right branch from hash in repo
+    row = pd.read_csv(RESULTS_CSV).iloc[results_index]
+    repo_name = row["repository"]
+    left_sha = row["left"]
+    right_sha = row["right"]
+
+    merge_attempt_dir = os.path.join(REPOS_DIR, "merge_attempt")
+    base_dir = os.path.join(REPOS_DIR, "base")
+    programmer_merge_dir = os.path.join(REPOS_DIR, "programmer_merge")
+
+    script = "../scripts/merge_tools/" + merge_tool + ".sh"
+    # clone_repo_to_path returns a Git-Python repo object
+    repo = clone_repo_to_path(repo_name, merge_attempt_dir)
+    repo.remote().fetch()
+    repo.git.checkout(left_sha, force=True)
+    print(f"Checking out left: {left_sha}")
+    repo.submodule_update()
+    repo.git.checkout("-b", "TEMP_LEFT_BRANCH", force=True)
+    repo.git.checkout(right_sha, force=True)
+    print(f"Checking out right: {right_sha}")
+    repo.submodule_update()
+    repo.git.checkout("-b", "TEMP_RIGHT_BRANCH", force=True)
+
+    # check=True: stop here instead of failing later on a checkout of an empty sha.
+    base_sha = subprocess.run(
+        ["git", "merge-base", "TEMP_LEFT_BRANCH", "TEMP_RIGHT_BRANCH"],
+        cwd=os.path.join(merge_attempt_dir, repo_name),
+        stdout=subprocess.PIPE,
+        text=True,
+        check=True,
+    ).stdout.strip()
+    print(f"Found base sha: {base_sha}")
+
+    repo2 = clone_repo_to_path(repo_name, base_dir)
+    repo2.remote().fetch()
+    repo2.git.checkout(base_sha, force=True)
+    repo2.submodule_update()
+
+    result = subprocess.run(
+        [
+            script,
+            repo.git.rev_parse("--show-toplevel"),
+            "TEMP_LEFT_BRANCH",
+            "TEMP_RIGHT_BRANCH",
+        ],
+        stdout=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+    print(result.stdout)
+
+    # dict.fromkeys drops repeated paths while keeping the reported order.
+    conflict_files = list(dict.fromkeys(CONFLICT_PATTERN.findall(result.stdout)))
+
+    repo3 = clone_repo_to_path(repo_name, programmer_merge_dir)
+    repo3.git.checkout(row["merge"], force=True)
+    repo3.submodule_update()
+
+    used_names = set()
+    for conflicting_file in conflict_files:
+        conflict_path = os.path.join(repo_name, conflicting_file)
+        conflict_path_merge_attempt = os.path.join(merge_attempt_dir, conflict_path)
+        conflict_path_base = os.path.join(base_dir, conflict_path)
+        conflict_path_programmer_merge = os.path.join(
+            programmer_merge_dir, conflict_path
+        )
+
+        if os.path.isfile(conflict_path_base):
+            diff_output = _run_diff(
+                [
+                    "diff3",
+                    conflict_path_base,
+                    conflict_path_merge_attempt,
+                    conflict_path_programmer_merge,
+                ]
+            )
+        else:
+            # The file does not exist in the base (e.g. it was added in both
+            # parents), so only a two-way diff is possible.
+            diff_output = _run_diff(
+                ["diff", conflict_path_merge_attempt, conflict_path_programmer_merge]
+            )
+
+        diff_filename = os.path.join(
+            repo_output_dir, _diff_output_name(conflicting_file, used_names)
+        )
+        with open(diff_filename, "w", encoding="utf-8") as diff_file:
+            diff_file.write(diff_output)
+        print(f"Diff results saved to {diff_filename}")
+
+
+def main(merge_tool: str, results_index: int, repo_output_dir: str):
+    """
+    Entry point for the script when run from the command line.
+
+    Args:
+        merge_tool (str): The merge tool to be used.
+        results_index (int): The index of the repository in the results DataFrame.
+        repo_output_dir (str): The directory where the diff results are written.
+    """
+    diff3_analysis(merge_tool, results_index, repo_output_dir)
+
+
+if __name__ == "__main__":
+    # Use argparse to parse command line arguments
+    parser = argparse.ArgumentParser(
+        description="Analyze merge conflicts using the diff3 tool."
+    )
+    parser.add_argument("merge_tool", type=str, help="The merge tool to be used.")
+    parser.add_argument(
+        "results_index",
+        type=int,
+        help="The index of the repository in the results DataFrame.",
+    )
+    parser.add_argument(
+        "repo_output_dir",
+        type=str,
+        help="The path of where we want to store the results from the analysis.",
+    )
+
+    args = parser.parse_args()
+
+    # Call main function with parsed arguments
+    main(args.merge_tool, args.results_index, args.repo_output_dir)
diff --git a/src/python/repo.py b/src/python/repo.py
index 4ead383f54..06cadcb124 100755
--- a/src/python/repo.py
+++ b/src/python/repo.py
@@ -84,6 +84,49 @@ def clone_repo(repo_slug: str) -> git.repo.Repo:
     return repo
 
 
+@timeout(10 * 60)
+def clone_repo_to_path(repo_slug: str, path: str) -> git.repo.Repo:
+    """Clones a repository into path, or reuses the clone that is already there.
+
+    An existing clone is returned as-is; callers that need fresh refs must fetch
+    themselves.
+
+    Args:
+        repo_slug (str): The slug of the repository, which is "owner/reponame".
+        path (str): The directory to clone into; the clone ends up in
+            path/owner/reponame.
+
+    Returns:
+        git.repo.Repo: The repository located at path/owner/reponame.
+
+    Raises:
+        Exception: If a git command fails while cloning.
+    """
+    # os.path.join (rather than the `/` operator) so that plain str paths work,
+    # which is what the callers pass in; path-like objects are accepted as well.
+    repo_dir = os.path.join(path, repo_slug)
+    if os.path.exists(repo_dir):
+        return git.repo.Repo(repo_dir)
+
+    os.makedirs(os.path.dirname(repo_dir), exist_ok=True)
+    os.environ["GIT_TERMINAL_PROMPT"] = "0"
+    os.environ["GIT_SSH_COMMAND"] = "ssh -o BatchMode=yes"
+    print(repo_slug, " : Cloning repo")
+    # ":@" in URL ensures that we are not prompted for login details
+    # for the repos that are now private.
+    github_url = "https://:@github.com/" + repo_slug + ".git"
+    try:
+        repo = git.repo.Repo.clone_from(github_url, repo_dir)
+        print(repo_slug, " : Finished cloning")
+        repo.remote().fetch()
+        repo.remote().fetch("refs/pull/*/head:refs/remotes/origin/pull/*")
+        repo.submodule_update()
+    except GitCommandError as e:
+        print(repo_slug, "GitCommandError during cloning:\n", e)
+        raise Exception("GitCommandError during cloning") from e
+    return repo
+
+
 TEST_STATE = Enum(
     "TEST_STATE",
     [