Bug fixes (#228)
benedikt-schesch authored Oct 18, 2023
1 parent 8771e99 commit 1d71aa6
Showing 15 changed files with 2,105 additions and 36 deletions.
1 change: 0 additions & 1 deletion .github/workflows/check-style.yml
@@ -19,7 +19,6 @@ jobs:
python-version: 3.8
auto-update-conda: true
channels: conda-forge,defaults
mamba-version: "*"
activate-environment: AST
environment-file: environment.yml
- name: Install shellcheck and checkbashisms
1 change: 0 additions & 1 deletion .github/workflows/small-test.yml
@@ -40,7 +40,6 @@ jobs:
python-version: 3.8
auto-update-conda: true
channels: conda-forge,defaults
mamba-version: "*"
activate-environment: AST
environment-file: environment.yml
- name: Install PdfLaTeX
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ cache/
artifacts/
artifacts.tar.gz
*.hprof
cache.tar

output/
merge_repo/
5 changes: 3 additions & 2 deletions Makefile
@@ -4,7 +4,7 @@ style: shell-script-style python-style java-style

SH_SCRIPTS = $(shell grep --exclude-dir=build --exclude-dir=repos --exclude-dir=cache -r -l '^\#! \?\(/bin/\|/usr/bin/env \)sh' * | grep -v 'git-hires-merge' | grep -v /.git/ | grep -v '~$$' | grep -v '\.tar$$' | grep -v gradlew)
BASH_SCRIPTS = $(shell grep --exclude-dir=build --exclude-dir=repos --exclude-dir=cache -r -l '^\#! \?\(/bin/\|/usr/bin/env \)bash' * | grep -v /.git/ | grep -v '~$$' | grep -v '\.tar$$' | grep -v gradlew)
PYTHON_FILES = $(shell find . -name '*.py' ! -path './repos/*' | grep -v '/__pycache__/' | grep -v '/.git/' | grep -v gradlew)
PYTHON_FILES = $(shell find . -name '*.py' ! -path './repos/*' -not -path "./.workdir/*" | grep -v '/__pycache__/' | grep -v '/.git/' | grep -v gradlew)

shell-script-style:
shellcheck -e SC2153 -x -P SCRIPTDIR --format=gcc ${SH_SCRIPTS} ${BASH_SCRIPTS}
@@ -49,7 +49,8 @@ clean-everything: clean clean-cache clean-test-cache clean-stored-hashes

# Compresses the cache.
compress-cache:
rm -r cache.tar
if [ ! -d cache ]; then echo "cache does not exist"; exit 1; fi
if [ -f cache.tar ]; then rm -f cache.tar; fi
tar --exclude="lock" -czf cache.tar cache

# Decompresses the cache.
1,001 changes: 1,001 additions & 0 deletions input_data/repos_1000.csv

Large diffs are not rendered by default.

960 changes: 960 additions & 0 deletions input_data/repos_1000_with_hashes.csv

Large diffs are not rendered by default.

15 changes: 11 additions & 4 deletions run.sh
@@ -17,7 +17,7 @@ set -o nounset

REPOS_CSV="$1"
OUT_DIR="$2"
N_REPETITIONS=$3
N_MERGES=$3
CACHE_DIR="${4}"

comparator_flags=""
@@ -91,13 +91,20 @@ java -cp build/libs/astmergeevaluation-all.jar \
"$OUT_DIR/repos_head_passes.csv" \
"$OUT_DIR/merges"

# Sample 20*<n_merges> merges
read -ra merge_comparator_flags <<<"${comparator_flags}"
python3 src/python/merge_tools_comparator.py \
python3 src/python/sample_merges.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--merges_path "$OUT_DIR/merges/" \
--output_dir "$OUT_DIR/merges_sampled/" \
--n_merges "$((20 * "$N_MERGES"))" \
"${merge_comparator_flags[@]}"

python3 src/python/merge_tools_comparator.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--merges_path "$OUT_DIR/merges_sampled/" \
--output_dir "$OUT_DIR/merges_compared/" \
--cache_dir "$CACHE_DIR" \
"${merge_comparator_flags[@]}"

python3 src/python/merge_tester.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
@@ -115,5 +122,5 @@ python3 src/python/latex_output.py \
--tested_merges_path "$OUT_DIR/merges_tested/" \
--full_repos_csv "$REPOS_CSV" \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--n_merges "$N_REPETITIONS" \
--n_merges "$N_MERGES" \
--output_dir "$OUT_DIR"
21 changes: 21 additions & 0 deletions run_1000.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# usage: ./run_1000.sh [-i <machine_id> -n <num_machines>] [-d]
# Runs the stack on all the repositories in input_data/repos_1000.csv.
# The output appears in results-trivial-merges/ .
# <machine_id> optional argument to specify the id of the current machine.
# <num_machines> optional argument to specify the total number of machines used.
# -d optional flag to specify whether to diff the merges.
# Warning: This takes days to run.


set -e
set -o nounset

# Check if cache.tar exists and cache is missing
if [ -f cache.tar ] && [ ! -d cache ]; then
echo "Decompressing cache.tar"
make decompress-cache
fi

./run.sh input_data/repos_1000.csv results-trivial-merges 20 cache "$@"
7 changes: 6 additions & 1 deletion src/python/latex_output.py
@@ -156,6 +156,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches,too-many-statem

try:
merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
if len(merges) == 0:
raise pd.errors.EmptyDataError
except pd.errors.EmptyDataError:
print(
"latex_output: Skipping",
@@ -487,7 +489,10 @@ def main(): # pylint: disable=too-many-locals,too-many-branches,too-many-statem
)
if not os.path.isfile(merge_list_file):
continue
merges = pd.read_csv(merge_list_file, index_col=0)
try:
merges = pd.read_csv(merge_list_file, index_col=0)
except pd.errors.EmptyDataError:
continue
if len(merges) > 0:
repos += 1
count += len(merges)
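Note on the fix above: a header-only CSV parses cleanly in pandas but yields zero rows, so the new length check re-raises EmptyDataError to route the no-rows case through the same skip path as a truly empty file. A minimal sketch of the pattern (the helper name is hypothetical):

import pandas as pd

def read_merges_or_none(merge_list_file):
    # A header-only CSV raises no error but carries no merges;
    # treat it exactly like an empty file.
    try:
        merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
        if len(merges) == 0:
            raise pd.errors.EmptyDataError
    except pd.errors.EmptyDataError:
        return None
    return merges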
14 changes: 9 additions & 5 deletions src/python/merge_tester.py
@@ -57,8 +57,9 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
for branch in ["left", "right"]:
commit_sha = merge_data[branch]
repo = Repository(repo_slug, cache_directory=cache_directory)
repo.checkout(commit_sha)
tree_fingerprint = repo.compute_tree_fingerprint()
test_result, tree_fingerprint = repo.checkout_and_test(
commit_sha, TIMEOUT_TESTING_PARENT, N_TESTS
)
if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]:
raise Exception(
"merge_tester: Tree fingerprint mismatch",
@@ -70,11 +71,13 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
merge_data[f"{branch}_tree_fingerprint"],
repo.path,
)
test_result = repo.test(TIMEOUT_TESTING_PARENT, N_TESTS)
merge_data[f"{branch} test result"] = test_result.name
if test_result != TEST_STATE.Tests_passed:
return merge_data

print(
"merge_tester: Parents pass", repo_slug, merge_data["left"], merge_data["right"]
)
merge_data["parents pass"] = True

for merge_tool in MERGE_TOOL:
@@ -109,6 +112,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
# Update the status from merge success to test result.
merge_data[merge_tool.name] = result.name
assert merge_tool.name in merge_data
print("merge_tester: Finished", repo_slug, merge_data["left"], merge_data["right"])
return merge_data


@@ -204,7 +208,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
for i in tqdm(range(len(merge_tester_arguments))):
repo_slug = merge_tester_arguments[i][0]
merge_results = merge_tester_results[i]
if merge_results["parents pass"]:
if len(merge_results) > 0 and merge_results["parents pass"]:
n_merges_parents_pass += 1
repo_result[repo_slug].append(merge_results)

@@ -229,7 +233,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
df.sort_index(inplace=True)
df.to_csv(output_file, index_label="idx")
n_total_merges += len(df)
n_total_merges_parents_pass += len(df[df["parents pass"]])
n_total_merges_parents_pass += len(df[df["parents pass"]]) if len(df) > 0 else 0

print("merge_tester: Number of newly tested merges:", len(merge_tester_arguments))
print(
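Note on the length guard above: a merge result row can now come back empty, and indexing an empty pandas Series with "parents pass" raises a KeyError, so the guard must run first. A small illustration, assuming a pandas Series result as in the script:

import pandas as pd

merge_results = pd.Series(dtype=object)  # an empty result row
# Short-circuit: the "parents pass" lookup only happens on non-empty rows.
if len(merge_results) > 0 and merge_results["parents pass"]:
    print("parents pass")
else:
    print("skipped: empty or failing result")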
15 changes: 2 additions & 13 deletions src/python/merge_tools_comparator.py
@@ -4,16 +4,10 @@
--merges_path <path_to_merges>
--output_dir <output_dir>
--cache_dir <cache_dir>
--include_trivial_merges (optional flag)
--only_trivial_merges (optional flag)
This script flags merges that have different results for different merge tools.
The output is written in output_dir and consists of the same files as the input
files, but with an additional column that indicates whether the merge tools
differ.
If the flag --include_trivial_merges is set, then the script will also output
merges that are trivial.
If the flag --only_trivial_merges is set, then the script will only output
merges that are trivial.
"""

import os
@@ -159,12 +153,13 @@ def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
raise Exception(
print(
"merge_tools_comparator:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)
return []

if output_file.exists():
print(
@@ -190,10 +185,6 @@ def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
)
merges["notes"].replace(np.nan, "", inplace=True)

if args.only_trivial_merges:
merges = merges[merges["notes"].str.contains("a parent is the base")]
elif not args.include_trivial_merges:
merges = merges[~merges["notes"].str.contains("a parent is the base")]
arguments = [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
@@ -208,8 +199,6 @@ def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
parser.add_argument("--merges_path", type=Path)
parser.add_argument("--output_dir", type=Path)
parser.add_argument("--cache_dir", type=Path, default="cache/merges/")
parser.add_argument("--include_trivial_merges", action="store_true")
parser.add_argument("--only_trivial_merges", action="store_true")
args = parser.parse_args()
Path(args.cache_dir).mkdir(parents=True, exist_ok=True)
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
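Note on the raise-to-print change above: returning an empty argument list instead of raising lets a repository with no merge list degrade gracefully, since an empty list contributes nothing when the per-repository argument lists are concatenated. A sketch under that assumption (the aggregation loop is hypothetical; build_merge_arguments is the script's own function):

merger_arguments = []
for repo_slug in repo_slugs:
    # A missing merge list now yields [] rather than aborting the whole run.
    merger_arguments += build_merge_arguments(args, repo_slug)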
17 changes: 10 additions & 7 deletions src/python/repo.py
@@ -124,7 +124,7 @@ def __init__(
self.workdir = WORKDIR_DIRECTORY / workdir_id
self.workdir.mkdir(parents=True, exist_ok=True)
self.repo_path = self.workdir / self.path.name
shutil.copytree(self.path, self.repo_path)
shutil.copytree(self.path, self.repo_path, symlinks=True)
self.repo = Repo(self.repo_path)
self.test_cache_directory = cache_directory / "test_cache"
self.sha_cache_directory = cache_directory / "sha_cache_entry"
@@ -460,7 +460,7 @@ def _checkout_and_test(
commit: str,
timeout: int,
n_tests: int,
) -> TEST_STATE:
) -> Tuple[TEST_STATE, Union[str, None]]:
"""Helper function for `checkout_and_test`,
which checks out the given commit and tests the repository.
This function does not check the cache.
@@ -470,19 +470,21 @@ def _checkout_and_test(
n_tests (int): The number of times to run the test suite.
Returns:
TEST_STATE: The result of the test.
Union[str,None]: The tree fingerprint of the result.
"""
result, explanation = self.checkout(commit)
if not result:
print("Checkout failed for", self.repo_slug, commit, explanation)
return TEST_STATE.Git_checkout_failed
return self.test(timeout, n_tests)
return TEST_STATE.Git_checkout_failed, None
sha = self.compute_tree_fingerprint()
return self.test(timeout, n_tests), sha

def checkout_and_test(
self,
commit: str,
timeout: int,
n_tests: int,
) -> TEST_STATE:
) -> Tuple[TEST_STATE, Union[str, None]]:
"""Checks out the given commit and tests the repository.
Args:
commit (str): The commit to checkout.
@@ -491,16 +493,17 @@ def checkout_and_test(
check_cache (bool, optional) = True: Whether to check the cache.
Returns:
TEST_STATE: The result of the test.
Union[str,None]: The tree fingerprint of the result.
"""
sha_cache_entry = self.get_sha_cache_entry(commit, start_merge=True)
if sha_cache_entry is None:
return self._checkout_and_test(commit, timeout, n_tests)
if sha_cache_entry["sha"] is None:
return TEST_STATE.Git_checkout_failed
return TEST_STATE.Git_checkout_failed, None
result = self.get_test_cache_entry(sha_cache_entry["sha"])
if result is None:
return self._checkout_and_test(commit, timeout, n_tests)
return result
return result, sha_cache_entry["sha"]

def test(self, timeout: int, n_tests: int) -> TEST_STATE:
"""Tests the repository. The test results of multiple runs is combined into one result.
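Note on the signature change above: checkout_and_test and its helper now return the tree fingerprint alongside the test state, so callers such as merge_tester.py can validate the checked-out tree in one call instead of a separate checkout, fingerprint, and test sequence. A sketch of the new calling convention (expected_fingerprint and the failure handler are hypothetical; the other names are the repository's own):

test_result, fingerprint = repo.checkout_and_test(commit_sha, timeout=TIMEOUT_TESTING, n_tests=3)
if fingerprint is None:
    # Checkout failed; test_result is TEST_STATE.Git_checkout_failed.
    handle_checkout_failure()
elif fingerprint != expected_fingerprint:
    raise Exception("Tree fingerprint mismatch", fingerprint, expected_fingerprint)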
79 changes: 79 additions & 0 deletions src/python/sample_merges.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
""" Samples n_merges for each repository.
usage: python3 sample_merges.py --repos_head_passes_csv <path_to_repos_head_passes.csv>
--merges_path <path_to_merges>
--output_dir <output_dir>
--include_trivial_merges (optional)
--only_trivial_merges (optional)
This script samples n_merges for each repository.
If the flag --include_trivial_merges is set, then the script will also output
merges that are trivial.
If the flag --only_trivial_merges is set, then the script will only output
merges that are trivial.
"""

import os
import argparse
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import numpy as np
from cache_utils import slug_repo_name

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--repos_head_passes_csv", type=Path)
parser.add_argument("--merges_path", type=Path)
parser.add_argument("--output_dir", type=Path)
parser.add_argument("--n_merges", type=int, default=100)
parser.add_argument("--include_trivial_merges", action="store_true")
parser.add_argument("--only_trivial_merges", action="store_true")
args = parser.parse_args()

repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
repo_slug = repository_data["repository"]
merge_list_file = Path(
os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
)
output_file = Path(
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
print(
"sample_merges:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)
continue

if output_file.exists():
print(
"sample_merges: Skipping",
repo_slug,
"because it is already computed.",
)
continue
try:
merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
except pd.errors.EmptyDataError:
print(
"sample_merges: Skipping",
repo_slug,
"because it does not contain any merges.",
)
continue

merges["notes"].replace(np.nan, "", inplace=True)
if args.only_trivial_merges:
merges = merges[merges["notes"].str.contains("a parent is the base")]
elif not args.include_trivial_merges:
merges = merges[~merges["notes"].str.contains("a parent is the base")]

n_merges = min(merges.shape[0], args.n_merges)
sample = merges.sample(n_merges, random_state=42)
sample.sort_index(inplace=True)
sample.to_csv(output_file)
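Note on the sampling above: the fixed random_state makes each repository's sample reproducible, so reruns cannot silently change which merges are studied, and the min() caps the sample when a repository has fewer merges than requested. A toy illustration of the default non-trivial filter plus seeded sampling:

import pandas as pd

merges = pd.DataFrame({"notes": ["", "a parent is the base", "", ""]})
# Default behavior: drop merges flagged as trivial.
non_trivial = merges[~merges["notes"].str.contains("a parent is the base")]
# A fixed seed returns the same rows on every run.
sample = non_trivial.sample(min(len(non_trivial), 2), random_state=42)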
2 changes: 1 addition & 1 deletion src/python/test_repo_heads.py
@@ -43,7 +43,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
print("test_repo_heads:", repo_slug, ": head_passes_tests : started")

repo = Repository(repo_slug, cache_directory=cache)
test_state = repo.checkout_and_test(
test_state, _ = repo.checkout_and_test(
repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3
)
print("test_repo_heads:", repo_slug, ": head_passes_tests : returning", test_state)
2 changes: 1 addition & 1 deletion src/python/write_head_hashes.py
@@ -122,7 +122,7 @@ def get_latest_hash(args):
print("write_head_hashes: Finished cloning repos and collecting head hashes")

result_df = pd.DataFrame([i for i in get_latest_hash_result if i is not None])
result_df = result_df.set_index(result_df.columns[0]).reset_index(drop=True)
result_df = result_df.reset_index(drop=True)
print("write_head_hashes: Started storing repo HEAD hashes")
result_df.to_csv(args.output_path, index_label="idx")
print("write_head_hashes: Finished storing repo HEAD hashes")
