From d5ed25cf534b55edfc656f1a88e4083f2bd7e96a Mon Sep 17 00:00:00 2001 From: Benedikt Schesch <37979523+benedikt-schesch@users.noreply.github.com> Date: Thu, 9 May 2024 13:51:36 -0700 Subject: [PATCH] Check reproducible results in CI/CD (#285) --- .gitconfig | 3 + .github/workflows/check-reproducibility.yml | 50 +++++++++ .github/workflows/small-test.yml | 10 +- Makefile | 7 +- run.sh | 19 ++-- src/python/replay_merge.py | 113 +++++++++++++++----- src/python/repo.py | 27 +++-- src/scripts/merge_tools/git_hires_merge.sh | 3 + src/scripts/merge_tools/spork.sh | 15 ++- 9 files changed, 183 insertions(+), 64 deletions(-) create mode 100644 .gitconfig create mode 100644 .github/workflows/check-reproducibility.yml diff --git a/.gitconfig b/.gitconfig new file mode 100644 index 0000000000..eb2a801689 --- /dev/null +++ b/.gitconfig @@ -0,0 +1,3 @@ +[user] + email = example@example.come + name = Example Example diff --git a/.github/workflows/check-reproducibility.yml b/.github/workflows/check-reproducibility.yml new file mode 100644 index 0000000000..36f7780179 --- /dev/null +++ b/.github/workflows/check-reproducibility.yml @@ -0,0 +1,50 @@ +name: Reproducibility Check +on: [push, pull_request] +jobs: + test: + strategy: + matrix: + maven: [ '3.9.2' ] + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: 17.0.7 + - run: echo "JAVA17_HOME=$JAVA_HOME" >> $GITHUB_ENV + - run: java -version + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: 3.12 + auto-update-conda: true + mamba-version: "*" + channels: conda-forge,defaults + activate-environment: AST + environment-file: environment.yml + - name: Install maven + uses: s4u/setup-maven-action@v1.12.0 + with: + java-version: 17 + 
maven-version: ${{ matrix.maven }} + - name: Clean caches & workspace + run: make clean + - run: echo "LOGURU_COLORIZE=NO" >> $GITHUB_ENV + - name: make check-merges-reproducibility + run: | + git config --global user.email "example@example.com" + git config --global user.name "Example Example" + head -n 151 results/combined/result.csv > temp.csv && mv temp.csv results/combined/result.csv + make check-merges-reproducibility + env: + GITHUB_TOKEN: ${{ secrets.TOKEN_GITHUB }} diff --git a/.github/workflows/small-test.yml b/.github/workflows/small-test.yml index febdff45eb..fde40180e3 100644 --- a/.github/workflows/small-test.yml +++ b/.github/workflows/small-test.yml @@ -1,4 +1,4 @@ -name: Run Small test +name: Small test on: [push, pull_request] jobs: test: @@ -47,17 +47,19 @@ jobs: - name: Install PdfLaTeX run: sudo apt update && sudo apt install texlive-latex-extra -y - name: Install maven - uses: s4u/setup-maven-action@v1.8.0 + uses: s4u/setup-maven-action@v1.12.0 with: java-version: 17 maven-version: ${{ matrix.maven }} - name: Clean caches & workspace run: make clean + - name: Install killall + run: sudo apt update && sudo apt install psmisc -y - run: echo "LOGURU_COLORIZE=NO" >> $GITHUB_ENV - name: Run small test run: | - git config --global user.email "example@example.com" - git config --global user.name "Example Example" + git config --global merge.customMerge.name "Always incorrect custom merge driver" + git config --global merge.customMerge.driver 'fake-merge-driver %O %A %B %L %P' make small-test env: GITHUB_TOKEN: ${{ secrets.TOKEN_GITHUB }} diff --git a/Makefile b/Makefile index 2f9808426d..4ee0c98848 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,8 @@ CSV_RESULTS_GREATEST_HITS = results/greatest_hits/result.csv CSV_RESULTS_REAPER = results/reaper/result.csv CSV_RESULTS = $(CSV_RESULTS_COMBINED) +NUM_PROCESSES = 0 + shell-script-style: shellcheck -e SC2153 -x -P SCRIPTDIR --format=gcc ${SH_SCRIPTS} ${BASH_SCRIPTS} checkbashisms ${SH_SCRIPTS} @@ 
-130,8 +132,9 @@ clean-local: rm -rf repos check-merges-reproducibility: - @echo "Running replay_merge for each idx in parallel..." - @tail -n +2 $(CSV_RESULTS) | awk -F, '{print $$1}' | parallel -u --halt now,fail=1 -j 0 'python3 src/python/replay_merge.py -delete_workdir --idx {}' + @echo "Running replay_merge for each idx in parallel using GNU Parallel..." + @set -e; \ + tail -n +2 $(CSV_RESULTS) | awk -F, '{print $$1}' | parallel -j 50% python3 src/python/replay_merge.py --merges_csv $(CSV_RESULTS) -delete_workdir --idx {} protect-repos: find repos -mindepth 1 -type d -exec chmod a-w {} + diff --git a/run.sh b/run.sh index bf66245c7a..bd124bcdac 100755 --- a/run.sh +++ b/run.sh @@ -55,18 +55,8 @@ done PATH=$(pwd)/src/scripts/merge_tools/:$PATH export PATH -echo "Checking for custom merge drivers in global configuration..." -merge_drivers=$(git config --global --get-regexp '^merge\..*\.driver$' || echo "No merge drivers set") - -if [ "$merge_drivers" == "No merge drivers set" ]; then - echo "No custom merge drivers found in global configuration. Proceeding with the evaluation." - # Include other commands to continue the script here -else - echo "Error: Custom merge drivers are set in global configuration." - echo "Please unset them before running the evaluation." - echo "Merge driver found: $merge_drivers" - exit 1 -fi +GIT_CONFIG_GLOBAL=$(pwd)/.gitconfig +export GIT_CONFIG_GLOBAL # Check if cache.tar exists and cache is missing if [ -f cache.tar ] && [ ! 
-d cache ]; then @@ -107,10 +97,13 @@ fi mkdir -p "$OUT_DIR" -# Delete all locks in cache +# Delete all locks if [ -d "$CACHE_DIR" ]; then find "$CACHE_DIR" -name "*.lock" -delete fi +if [ -d "repos" ]; then + find "repos/locks" -name "*.lock" -delete +fi # Delete .workdir rm -rf .workdir diff --git a/src/python/replay_merge.py b/src/python/replay_merge.py index 3aa3561cbc..e55f7484d4 100755 --- a/src/python/replay_merge.py +++ b/src/python/replay_merge.py @@ -3,6 +3,8 @@ """Replay merges and their test results""" import argparse import os +import sys +import tarfile from pathlib import Path import shutil import pandas as pd @@ -21,6 +23,33 @@ logger.add("replay_merge.log", mode="a") +def store_artifacts(result_df: pd.DataFrame) -> None: + """Store the replayed repositories and their merge logs in a gzipped tarball.""" + tarball_path = "replay_merge_artifacts.tar.gz" + + # Create the tarball and add files, ensuring no path modification + with tarfile.open(tarball_path, "w:gz") as tar: + for idx in result_df.index: + repo_path = result_df.loc[idx, "repo path"] + log_path = result_df.loc[idx, "merge log path"] + + # Add repository directories or files to the tarball with absolute paths + tar.add(repo_path, arcname=repo_path) + + # Add log files to the tarball with absolute paths + tar.add(log_path, arcname=log_path) + + logger.info("Artifacts created") + + +def delete_workdirs(results_df: pd.DataFrame) -> None: + """Delete the workdirs after replaying the merges.""" + for idx in results_df.index: + os.system("chmod -R 777 " + str(results_df.loc[idx, "repo path"])) + shutil.rmtree(results_df.loc[idx, "repo path"]) + logger.info("Workdirs deleted") + + # pylint: disable=too-many-arguments, too-many-locals def merge_replay( merge_idx: str, @@ -28,6 +57,7 @@ def merge_replay( merge_data: pd.Series, test_merge: bool = False, delete_workdir: bool = True, + create_artifacts: bool = False, dont_check_fingerprints: bool = False, ) -> pd.DataFrame: """Replay a merge and its test results. 
@@ -75,13 +105,21 @@ def merge_replay( f"workdir {WORKDIR_DIRECTORY/workdir} already exists. Skipping" ) continue - - repo = Repository( - repo_slug, - cache_directory=Path("no_cache/"), - workdir_id=workdir, - delete_workdir=delete_workdir, - ) + try: + repo = Repository( + repo_slug, + cache_directory=Path("no_cache/"), + workdir_id=workdir, + delete_workdir=False, + lazy_clone=False, + ) + except Exception as e: + logger.error( + f"Git clone failed for {repo_slug} {merge_data['left']}" + + f"{merge_data['right']} {e}" + ) + # Exit with 0 for CI/CD to not cause problems in case a repo is no longer available + sys.exit(0) ( merge_result, merge_fingerprint, @@ -96,6 +134,7 @@ def merge_replay( timeout=TIMEOUT_TESTING_MERGE, use_cache=False, ) + assert repo.local_repo_path.exists() root_dir = Path("replay_logs") log_path = root_dir / Path( "merges/" @@ -111,6 +150,7 @@ def merge_replay( log_path.parent.mkdir(parents=True, exist_ok=True) with open(log_path, "w", encoding="utf-8") as f: f.write(explanation) + assert repo.local_repo_path.exists() result_df.loc[ merge_tool.name, ["merge result", "merge log path", "repo path", "merge fingerprint"], @@ -120,14 +160,41 @@ def merge_replay( repo.local_repo_path, merge_fingerprint, ] - if ( - merge_data[f"{merge_tool.name}_merge_fingerprint"] != merge_fingerprint - and not dont_check_fingerprints + assert repo.local_repo_path.exists() + + if ( # pylint: disable=too-many-boolean-expressions + merge_result + not in ( + MERGE_STATE.Git_checkout_failed, + TEST_STATE.Git_checkout_failed, + ) + and ( + merge_data[f"{merge_tool.name}_merge_fingerprint"] + != merge_fingerprint + and not dont_check_fingerprints + ) + and (merge_tool != MERGE_TOOL.spork) + and ( + merge_tool != MERGE_TOOL.gitmerge_resolve + or merge_result != MERGE_STATE.Merge_failed + ) ): + assert repo.local_repo_path.exists() + if create_artifacts: + store_artifacts(result_df) + if delete_workdir: + delete_workdirs(result_df) + 
print("=====================================\n") + with open(log_path, "r", encoding="utf-8") as f: + print(f.read()) + print("=====================================\n") raise Exception( f"fingerprints differ: after merge of {workdir} with {merge_tool}, found" + f" {merge_fingerprint} but expected " - + f"{merge_data[f'{merge_tool.name}_merge_fingerprint']}" + + f"{merge_data[f'{merge_tool.name}_merge_fingerprint']} at log path {log_path}" + + f" and repo path {repo.local_repo_path}", + merge_result, + f"idx {merge_idx}", ) if merge_result not in ( @@ -216,6 +283,11 @@ def merge_replay( ) args = parser.parse_args() + os.environ["PATH"] += os.pathsep + os.path.join( + os.getcwd(), "src/scripts/merge_tools" + ) + os.environ["GIT_CONFIG_GLOBAL"] = os.getcwd() + "/.gitconfig" + logger.info(f"Replaying merge with index {args.idx}") if args.delete_workdir: logger.info("Deleting workdir after replaying the merge") @@ -235,7 +307,8 @@ def merge_replay( str(repo_slug), merge_data, args.test, - args.delete_workdir and not args.create_artifacts, + args.delete_workdir, + args.create_artifacts, args.dont_check_fingerprints, ) for idx, row in results_df.iterrows(): @@ -254,16 +327,6 @@ def merge_replay( # Create artifacts which means creating a tarball of all the relevant workdirs if args.create_artifacts: - logger.info("Creating artifacts") - os.system( - "tar -czf replay_merge_artifacts.tar.gz " - + " ".join( - [str(results_df.loc[idx, "repo path"]) for idx in results_df.index] - ) - ) - logger.info("Artifacts created") - if args.delete_workdir: - for idx in results_df.index: - os.system("chmod -R 777 " + str(results_df.loc[idx, "repo path"])) - shutil.rmtree(results_df.loc[idx, "repo path"]) - logger.info("Workdirs deleted") + store_artifacts(results_df) + if args.delete_workdir: + delete_workdirs(results_df) diff --git a/src/python/repo.py b/src/python/repo.py index 28456c50a2..3424647f23 100755 --- a/src/python/repo.py +++ b/src/python/repo.py @@ -23,6 +23,7 @@ set_in_cache, 
lookup_in_cache, ) +import fasteners import git.repo from variables import ( REPOS_PATH, @@ -209,21 +210,16 @@ def __init__( # pylint: disable=too-many-arguments def clone_repo(self) -> None: """Clones the repository.""" - if self.repo_path.exists(): - return - print( - "Cloning", - self.repo_slug, - "to", - self.repo_path, - "because:", - self.repo_path.exists(), - ) - try: - clone_repo(self.repo_slug, self.repo_path) - except Exception as e: - logger.error("Exception during cloning:\n", e) - raise + lock_path = REPOS_PATH / "locks" / self.repo_slug + lock = fasteners.InterProcessLock(lock_path) + with lock: + if self.repo_path.exists(): + return + try: + clone_repo(self.repo_slug, self.repo_path) + except Exception as e: + logger.error("Exception during cloning:\n", e) + raise if not self.repo_path.exists(): logger.error( f"Repo {self.repo_slug} does not exist after cloning {self.repo_path}" @@ -246,6 +242,7 @@ def copy_repo(self) -> None: ignore_dangling_symlinks=True, ) os.system("chmod -R 777 " + str(self.local_repo_path)) + self.repo = Repo(self.local_repo_path) def checkout(self, commit: str, use_cache: bool = True) -> Tuple[bool, str]: diff --git a/src/scripts/merge_tools/git_hires_merge.sh b/src/scripts/merge_tools/git_hires_merge.sh index bc2d1ba275..90ae509345 100755 --- a/src/scripts/merge_tools/git_hires_merge.sh +++ b/src/scripts/merge_tools/git_hires_merge.sh @@ -6,6 +6,9 @@ clone_dir=$1 branch1=$2 branch2=$3 +# Print the current PATH +echo "PATH: $PATH" + cd "$clone_dir" || exit 1 git checkout "$branch1" --force diff --git a/src/scripts/merge_tools/spork.sh b/src/scripts/merge_tools/spork.sh index 2753140916..9124fb89e3 100755 --- a/src/scripts/merge_tools/spork.sh +++ b/src/scripts/merge_tools/spork.sh @@ -26,14 +26,19 @@ clone_dir=$1 branch1=$2 branch2=$3 +cd "$clone_dir" || exit + # set up spork driver -(echo "[merge \"spork\"]"; - echo " name = spork"; - echo " driver = java -jar $spork_absolutepath --git-mode %A %O %B -o %A") >> 
"$clone_dir/.git/config" -echo "*.java merge=spork" >> "$clone_dir/.gitattributes" +git config --local merge.spork.name "spork" +git config --local merge.spork.driver "java -jar $spork_absolutepath --git-mode %A %O %B -o %A" + +# print git config +echo "*.java merge=spork" >> .gitattributes # perform merge -cd "$clone_dir" || exit +echo "Current git config:" +git config --list + git checkout "$branch1" --force git merge --no-edit "$branch2" retVal=$?