Skip to content

Commit

Permalink
Check reproducible results in CI/CD (#285)
Browse files Browse the repository at this point in the history
  • Loading branch information
benedikt-schesch authored May 9, 2024
1 parent 0fc1a7f commit d5ed25c
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 64 deletions.
3 changes: 3 additions & 0 deletions .gitconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[user]
email = [email protected]
name = Example Example
50 changes: 50 additions & 0 deletions .github/workflows/check-reproducibility.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
name: Reproducibility Check
on: [push, pull_request]
jobs:
test:
strategy:
matrix:
maven: [ '3.9.2' ]
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- uses: actions/setup-java@v4
with:
distribution: 'zulu'
java-version: 17.0.7
- run: echo "JAVA17_HOME=$JAVA_HOME" >> $GITHUB_ENV
- run: java -version
- uses: actions/checkout@v4
with:
submodules: 'true'
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Setup miniconda
uses: conda-incubator/setup-miniconda@v3
with:
python-version: 3.12
auto-update-conda: true
mamba-version: "*"
channels: conda-forge,defaults
activate-environment: AST
environment-file: environment.yml
- name: Install maven
uses: s4u/[email protected]
with:
java-version: 17
maven-version: ${{ matrix.maven }}
- name: Clean caches & workspace
run: make clean
- run: echo "LOGURU_COLORIZE=NO" >> $GITHUB_ENV
- name: make check-merges-reproducibility
run: |
git config --global user.email "[email protected]"
git config --global user.name "Example Example"
head -n 151 results/combined/result.csv > temp.csv && mv temp.csv results/combined/result.csv
make check-merges-reproducibility
env:
GITHUB_TOKEN: ${{ secrets.TOKEN_GITHUB }}
10 changes: 6 additions & 4 deletions .github/workflows/small-test.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Run Small test
name: Small test
on: [push, pull_request]
jobs:
test:
Expand Down Expand Up @@ -47,17 +47,19 @@ jobs:
- name: Install PdfLaTeX
run: sudo apt update && sudo apt install texlive-latex-extra -y
- name: Install maven
uses: s4u/setup-maven-action@v1.8.0
uses: s4u/setup-maven-action@v1.12.0
with:
java-version: 17
maven-version: ${{ matrix.maven }}
- name: Clean caches & workspace
run: make clean
- name: Install killall
run: sudo apt update && sudo apt install psmisc -y
- run: echo "LOGURU_COLORIZE=NO" >> $GITHUB_ENV
- name: Run small test
run: |
git config --global user.email "[email protected]"
git config --global user.name "Example Example"
git config --global merge.customMerge.name "Always incorrect custom merge driver"
git config --global merge.customMerge.driver 'fake-merge-driver %O %A %B %L %P'
make small-test
env:
GITHUB_TOKEN: ${{ secrets.TOKEN_GITHUB }}
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ CSV_RESULTS_GREATEST_HITS = results/greatest_hits/result.csv
CSV_RESULTS_REAPER = results/reaper/result.csv
CSV_RESULTS = $(CSV_RESULTS_COMBINED)

NUM_PROCESSES = 0

shell-script-style:
shellcheck -e SC2153 -x -P SCRIPTDIR --format=gcc ${SH_SCRIPTS} ${BASH_SCRIPTS}
checkbashisms ${SH_SCRIPTS}
Expand Down Expand Up @@ -130,8 +132,9 @@ clean-local:
rm -rf repos

check-merges-reproducibility:
@echo "Running replay_merge for each idx in parallel..."
@tail -n +2 $(CSV_RESULTS) | awk -F, '{print $$1}' | parallel -u --halt now,fail=1 -j 0 'python3 src/python/replay_merge.py -delete_workdir --idx {}'
@echo "Running replay_merge for each idx in parallel using GNU Parallel..."
@set -e; \
tail -n +2 $(CSV_RESULTS) | awk -F, '{print $$1}' | parallel -j 50% python3 src/python/replay_merge.py --merges_csv $(CSV_RESULTS) -delete_workdir --idx {}

protect-repos:
find repos -mindepth 1 -type d -exec chmod a-w {} +
Expand Down
19 changes: 6 additions & 13 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,8 @@ done
PATH=$(pwd)/src/scripts/merge_tools/:$PATH
export PATH

echo "Checking for custom merge drivers in global configuration..."
merge_drivers=$(git config --global --get-regexp '^merge\..*\.driver$' || echo "No merge drivers set")

if [ "$merge_drivers" == "No merge drivers set" ]; then
echo "No custom merge drivers found in global configuration. Proceeding with the evaluation."
# Include other commands to continue the script here
else
echo "Error: Custom merge drivers are set in global configuration."
echo "Please unset them before running the evaluation."
echo "Merge driver found: $merge_drivers"
exit 1
fi
GIT_CONFIG_GLOBAL=$(pwd)/.gitconfig
export GIT_CONFIG_GLOBAL

# Check if cache.tar exists and cache is missing
if [ -f cache.tar ] && [ ! -d cache ]; then
Expand Down Expand Up @@ -107,10 +97,13 @@ fi

mkdir -p "$OUT_DIR"

# Delete all locks in cache
# Delete all locks
if [ -d "$CACHE_DIR" ]; then
find "$CACHE_DIR" -name "*.lock" -delete
fi
if [ -d "repos" ]; then
find "repos/locks" -name "*.lock" -delete
fi

# Delete .workdir
rm -rf .workdir
Expand Down
113 changes: 88 additions & 25 deletions src/python/replay_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""Replay merges and their test results"""
import argparse
import os
import sys
import tarfile
from pathlib import Path
import shutil
import pandas as pd
Expand All @@ -21,13 +23,41 @@
logger.add("replay_merge.log", mode="a")


def store_artifacts(result_df: pd.DataFrame) -> None:
    """Bundle each replayed merge's repo and log into one gzipped tarball.

    For every row of *result_df*, the paths found in the ``repo path`` and
    ``merge log path`` columns are added to ``replay_merge_artifacts.tar.gz``
    under those exact same path names (no re-rooting inside the archive).
    """
    tarball_path = "replay_merge_artifacts.tar.gz"

    with tarfile.open(tarball_path, "w:gz") as tar:
        for _, row in result_df.iterrows():
            # Store members under their original paths so the archive
            # layout mirrors the on-disk layout exactly.
            tar.add(row["repo path"], arcname=row["repo path"])
            tar.add(row["merge log path"], arcname=row["merge log path"])

    logger.info("Artifacts created")


def delete_workdirs(results_df: pd.DataFrame) -> None:
    """Delete the workdirs after replaying the merges.

    Each ``repo path`` listed in *results_df* is first made writable
    (the repos are presumably write-protected elsewhere — see the
    ``protect-repos`` Makefile target) so that ``shutil.rmtree`` can
    remove the read-only tree, then deleted.
    """
    import subprocess  # local import: only this cleanup helper needs it

    for idx in results_df.index:
        repo_path = str(results_df.loc[idx, "repo path"])
        # Pass an argument list instead of a shell-interpolated string so
        # paths containing spaces or shell metacharacters cannot break the
        # command or inject arbitrary shell code (the old os.system call
        # concatenated the path directly into a shell line).
        subprocess.run(["chmod", "-R", "777", repo_path], check=False)
        shutil.rmtree(repo_path)
    logger.info("Workdirs deleted")


# pylint: disable=too-many-arguments, too-many-locals
def merge_replay(
merge_idx: str,
repo_slug: str,
merge_data: pd.Series,
test_merge: bool = False,
delete_workdir: bool = True,
create_artifacts: bool = False,
dont_check_fingerprints: bool = False,
) -> pd.DataFrame:
"""Replay a merge and its test results.
Expand Down Expand Up @@ -75,13 +105,21 @@ def merge_replay(
f"workdir {WORKDIR_DIRECTORY/workdir} already exists. Skipping"
)
continue

repo = Repository(
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
delete_workdir=delete_workdir,
)
try:
repo = Repository(
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
delete_workdir=False,
lazy_clone=False,
)
except Exception as e:
logger.error(
f"Git clone failed for {repo_slug} {merge_data['left']}"
+ f"{merge_data['right']} {e}"
)
# Exit with 0 for CI/CD to not cause problems in case a repo is no longer available
sys.exit(0)
(
merge_result,
merge_fingerprint,
Expand All @@ -96,6 +134,7 @@ def merge_replay(
timeout=TIMEOUT_TESTING_MERGE,
use_cache=False,
)
assert repo.local_repo_path.exists()
root_dir = Path("replay_logs")
log_path = root_dir / Path(
"merges/"
Expand All @@ -111,6 +150,7 @@ def merge_replay(
log_path.parent.mkdir(parents=True, exist_ok=True)
with open(log_path, "w", encoding="utf-8") as f:
f.write(explanation)
assert repo.local_repo_path.exists()
result_df.loc[
merge_tool.name,
["merge result", "merge log path", "repo path", "merge fingerprint"],
Expand All @@ -120,14 +160,41 @@ def merge_replay(
repo.local_repo_path,
merge_fingerprint,
]
if (
merge_data[f"{merge_tool.name}_merge_fingerprint"] != merge_fingerprint
and not dont_check_fingerprints
assert repo.local_repo_path.exists()

if ( # pylint: disable=too-many-boolean-expressions
merge_result
not in (
MERGE_STATE.Git_checkout_failed,
TEST_STATE.Git_checkout_failed,
)
and (
merge_data[f"{merge_tool.name}_merge_fingerprint"]
!= merge_fingerprint
and not dont_check_fingerprints
)
and (merge_tool != MERGE_TOOL.spork)
and (
merge_tool != MERGE_TOOL.gitmerge_resolve
or merge_result != MERGE_STATE.Merge_failed
)
):
assert repo.local_repo_path.exists()
if create_artifacts:
store_artifacts(result_df)
if delete_workdir:
delete_workdirs(result_df)
print("=====================================\n")
with open(log_path, "r", encoding="utf-8") as f:
print(f.read())
print("=====================================\n")
raise Exception(
f"fingerprints differ: after merge of {workdir} with {merge_tool}, found"
+ f" {merge_fingerprint} but expected "
+ f"{merge_data[f'{merge_tool.name}_merge_fingerprint']}"
+ f"{merge_data[f'{merge_tool.name}_merge_fingerprint']} at log path {log_path}"
+ f" and repo path {repo.local_repo_path}",
merge_result,
f"idx {merge_idx}",
)

if merge_result not in (
Expand Down Expand Up @@ -216,6 +283,11 @@ def merge_replay(
)
args = parser.parse_args()

os.environ["PATH"] += os.pathsep + os.path.join(
os.getcwd(), "src/scripts/merge_tools"
)
os.environ["GIT_CONFIG_GLOBAL"] = os.getcwd() + "/.gitconfig"

logger.info(f"Replaying merge with index {args.idx}")
if args.delete_workdir:
logger.info("Deleting workdir after replaying the merge")
Expand All @@ -235,7 +307,8 @@ def merge_replay(
str(repo_slug),
merge_data,
args.test,
args.delete_workdir and not args.create_artifacts,
args.delete_workdir,
args.create_artifacts,
args.dont_check_fingerprints,
)
for idx, row in results_df.iterrows():
Expand All @@ -254,16 +327,6 @@ def merge_replay(

# Create artifacts which means creating a tarball of all the relevant workdirs
if args.create_artifacts:
logger.info("Creating artifacts")
os.system(
"tar -czf replay_merge_artifacts.tar.gz "
+ " ".join(
[str(results_df.loc[idx, "repo path"]) for idx in results_df.index]
)
)
logger.info("Artifacts created")
if args.delete_workdir:
for idx in results_df.index:
os.system("chmod -R 777 " + str(results_df.loc[idx, "repo path"]))
shutil.rmtree(results_df.loc[idx, "repo path"])
logger.info("Workdirs deleted")
store_artifacts(results_df)
if args.delete_workdir:
delete_workdirs(results_df)
27 changes: 12 additions & 15 deletions src/python/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
set_in_cache,
lookup_in_cache,
)
import fasteners
import git.repo
from variables import (
REPOS_PATH,
Expand Down Expand Up @@ -209,21 +210,16 @@ def __init__( # pylint: disable=too-many-arguments

def clone_repo(self) -> None:
"""Clones the repository."""
if self.repo_path.exists():
return
print(
"Cloning",
self.repo_slug,
"to",
self.repo_path,
"because:",
self.repo_path.exists(),
)
try:
clone_repo(self.repo_slug, self.repo_path)
except Exception as e:
logger.error("Exception during cloning:\n", e)
raise
lock_path = REPOS_PATH / "locks" / self.repo_slug
lock = fasteners.InterProcessLock(lock_path)
with lock:
if self.repo_path.exists():
return
try:
clone_repo(self.repo_slug, self.repo_path)
except Exception as e:
logger.error("Exception during cloning:\n", e)
raise
if not self.repo_path.exists():
logger.error(
f"Repo {self.repo_slug} does not exist after cloning {self.repo_path}"
Expand All @@ -246,6 +242,7 @@ def copy_repo(self) -> None:
ignore_dangling_symlinks=True,
)
os.system("chmod -R 777 " + str(self.local_repo_path))

self.repo = Repo(self.local_repo_path)

def checkout(self, commit: str, use_cache: bool = True) -> Tuple[bool, str]:
Expand Down
3 changes: 3 additions & 0 deletions src/scripts/merge_tools/git_hires_merge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ clone_dir=$1
branch1=$2
branch2=$3

# Print the current PATH
echo "PATH: $PATH"

cd "$clone_dir" || exit 1

git checkout "$branch1" --force
Expand Down
Loading

0 comments on commit d5ed25c

Please sign in to comment.