From bf922fe4d85837237519bf499507e59a4b4b129c Mon Sep 17 00:00:00 2001 From: cactusbranch01 Date: Mon, 11 Dec 2023 11:18:50 -0800 Subject: [PATCH] extend-ben-dataset branch changes --- src/python/diff3_analysis.ipynb | 70 ++++++++++++++++ src/python/diff3_analysis.py | 139 ++++++++++++++++++++++++++++++++ src/python/repo.py | 28 +++++++ 3 files changed, 237 insertions(+) create mode 100644 src/python/diff3_analysis.ipynb create mode 100644 src/python/diff3_analysis.py diff --git a/src/python/diff3_analysis.ipynb b/src/python/diff3_analysis.ipynb new file mode 100644 index 0000000000..ee9f4d11a3 --- /dev/null +++ b/src/python/diff3_analysis.ipynb @@ -0,0 +1,70 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dropwizard/metrics : Cloning repo\n", + "dropwizard/metrics : Finished cloning\n", + "dropwizard/metrics : Finished cloning\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Switched to branch 'TEMP_LEFT_BRANCH'\n", + "diff: ./repos/merge_attempt/dropwizard/metrics/pom.xml: No such file or directory\n", + "diff: ./repos/programmer_merge/dropwizard/metrics/pom.xml: No such file or directory\n" + ] + } + ], + "source": [ + "repo_num = 548\n", + "merge_tool = \"gitmerge_ort\"\n", + "# merge_tool = \"gitmerge_ort_adjacent\"\n", + "# merge_tool = \"gitmerge_ort_ignorespace\"\n", + "# merge_tool = \"gitmerge_ort_imports\"\n", + "# merge_tool = \"gitmerge_ort_imports_ignorespace\"\n", + "# merge_tool = \"gitmerge_resolve\"\n", + "# merge_tool = \"gitmerge_recursive_histogram\"\n", + "# merge_tool = \"gitmerge_recursive_ignorespace\"\n", + "# merge_tool = \"gitmerge_recursive_minimal\"\t\n", + "# merge_tool = \"gitmerge_recursive_myers\"\n", + "# merge_tool = \"gitmerge_recursive_patience\"\n", + "# merge_tool = \"git_hires_merge\"\n", + "# merge_tool = \"spork\"\n", + "# merge_tool = \"intellimerge\"\n", + "\n", + "from diff3_analysis import diff3_analysis\n", + "diff3_analysis(merge_tool, repo_num)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "research", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/python/diff3_analysis.py b/src/python/diff3_analysis.py new file mode 100644 index 0000000000..4cc961e6be --- /dev/null +++ b/src/python/diff3_analysis.py @@ -0,0 +1,139 @@ +"""Runs a merge and uses diff3 to compare it to the base and final branch of a given repo. +""" +import subprocess +import re +import os +import tempfile +import pandas as pd +from repo import clone_repo_to_path +from merge_tester import MERGE_STATE + +# pylint: disable-msg=too-many-locals + + +def diff3_analysis(merge_tool: str, repo_num: int): + """ + Analyzes merge conflicts using the diff3 tool and opens the results in the default text viewer. + + Args: + merge_tool (str): The merge tool to be used. + repo_num (int): The index of the repository in the results DataFrame. + + Returns: + None + """ + df = pd.read_csv("../../results_greatest_hits/result.csv") + repo_name = df.iloc[repo_num]["repository"] + + script = "../scripts/merge_tools/" + merge_tool + ".sh" + repo = clone_repo_to_path( + repo_name, "./repos/merge_attempt" + ) # Return a Git-Python repo object + repo.remote().fetch() + left_sha = df.iloc[repo_num]["left"] + repo.git.checkout(left_sha, force=True) + repo.submodule_update() + repo.git.checkout("-b", "TEMP_LEFT_BRANCH", force=True) + repo.git.checkout(df.iloc[repo_num]["right"], force=True) + repo.submodule_update() + repo.git.checkout("-b", "TEMP_RIGHT_BRANCH", force=True) + + result = subprocess.run( + [ + script, + repo.git.rev_parse("--show-toplevel"), + "TEMP_LEFT_BRANCH", + "TEMP_RIGHT_BRANCH", + ], + stdout=subprocess.PIPE, + text=True, + ) + + conflict_file_matches = re.findall( + r"CONFLICT \(.+\): Merge conflict in (.+)", result.stdout + ) + + repo = clone_repo_to_path( + repo_name, "./repos/programmer_merge" + ) # Return a Git-Python repo object + repo.git.checkout(df.iloc[repo_num]["merge"], force=True) + repo.submodule_update() + + ''' + repo = clone_repo_to_path( + repo_name, "./repos/base" + ) # Return a Git-Python repo object + repo.git.checkout(df.iloc[repo_num]["base"], force=True) + repo.submodule_update() + ''' + + for conflict_file_match in conflict_file_matches: + conflicting_file = str(conflict_file_match) + conflict_path = os.path.join(repo_name, conflicting_file) + conflict_path_merge_attempt = os.path.join( + "./repos/merge_attempt", conflict_path + ) + + ''' + conflict_path_base = os.path.join("./repos/base", conflict_path) + ''' + conflict_path_programmer_merge = os.path.join( + "./repos/programmer_merge", conflict_path + ) + ''' + diff_results = subprocess.run( + [ + "diff3", + conflict_path_base, + conflict_path_merge_attempt, + conflict_path_programmer_merge, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + # Check that diff3 didn't run into missing files in the base + error_message = "No such file or directory" + if error_message in diff_results.stderr: + ''' + # Since the conflict file was added in both parents we can't diff the base. + diff_results = subprocess.run( + [ + "diff", + conflict_path_merge_attempt, + conflict_path_programmer_merge, + ], + stdout=subprocess.PIPE, + text=True, + ) + + # Use a temporary file to store the diff results + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file: + temp_file.write(diff_results.stdout) + + # Open the saved text file with the default application + subprocess.run(["xdg-open", temp_file.name], check=True) + + # Delete the temporary file + os.remove(temp_file.name) + + # Deletes base, programmer_merge, and merge_attempt folders in repos dir + # We do this to prevent errors if cloning the same repo into the folder twice + ''' + subprocess.run( + ["rm", "-rf", "./repos/base"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + ''' + subprocess.run( + ["rm", "-rf", "./repos/merge_attempt"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + subprocess.run( + ["rm", "-rf", "./repos/programmer_merge"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) diff --git a/src/python/repo.py b/src/python/repo.py index 31118dcfcd..9e2ce64170 100755 --- a/src/python/repo.py +++ b/src/python/repo.py @@ -57,6 +57,34 @@ def clone_repo(repo_slug: str) -> git.repo.Repo: return repo +def clone_repo_to_path(repo_slug: str, path: str) -> git.repo.Repo: + """Clones a repository, or runs `git fetch` if the repository is already cloned. + Args: + repo_slug (str): The slug of the repository, which is "owner/reponame". + """ + repo_dir = REPOS_PATH / Path(repo_slug) + if repo_dir.exists(): + repo = git.repo.Repo(repo_dir) + else: + repo_dir.parent.mkdir(parents=True, exist_ok=True) + os.environ["GIT_TERMINAL_PROMPT"] = "0" + print(repo_slug, " : Cloning repo") + # ":@" in URL ensures that we are not prompted for login details + # for the repos that are now private. + github_url = "https://:@github.com/" + repo_slug + ".git" + print(repo_slug, " : Finished cloning") + try: + repo = git.repo.Repo.clone_from(github_url, repo_dir) + print(repo_slug, " : Finished cloning") + repo.remote().fetch() + repo.remote().fetch("refs/pull/*/head:refs/remotes/origin/pull/*") + repo.submodule_update() + except Exception as e: + print(repo_slug, "Exception during cloning:\n", e) + raise + return repo + + TEST_STATE = Enum( "TEST_STATE", [