Bug fixes (#228)
benedikt-schesch authored Oct 18, 2023
1 parent 8771e99 commit 1d71aa6
Showing 15 changed files with 2,105 additions and 36 deletions.
1 change: 0 additions & 1 deletion .github/workflows/check-style.yml
@@ -19,7 +19,6 @@ jobs:
python-version: 3.8
auto-update-conda: true
channels: conda-forge,defaults
mamba-version: "*"
activate-environment: AST
environment-file: environment.yml
- name: Install shellcheck and checkbashisms
1 change: 0 additions & 1 deletion .github/workflows/small-test.yml
@@ -40,7 +40,6 @@ jobs:
python-version: 3.8
auto-update-conda: true
channels: conda-forge,defaults
mamba-version: "*"
activate-environment: AST
environment-file: environment.yml
- name: Install PdfLaTeX
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ cache/
artifacts/
artifacts.tar.gz
*.hprof
cache.tar

output/
merge_repo/
5 changes: 3 additions & 2 deletions Makefile
@@ -4,7 +4,7 @@ style: shell-script-style python-style java-style

SH_SCRIPTS = $(shell grep --exclude-dir=build --exclude-dir=repos --exclude-dir=cache -r -l '^\#! \?\(/bin/\|/usr/bin/env \)sh' * | grep -v 'git-hires-merge' | grep -v /.git/ | grep -v '~$$' | grep -v '\.tar$$' | grep -v gradlew)
BASH_SCRIPTS = $(shell grep --exclude-dir=build --exclude-dir=repos --exclude-dir=cache -r -l '^\#! \?\(/bin/\|/usr/bin/env \)bash' * | grep -v /.git/ | grep -v '~$$' | grep -v '\.tar$$' | grep -v gradlew)
PYTHON_FILES = $(shell find . -name '*.py' ! -path './repos/*' | grep -v '/__pycache__/' | grep -v '/.git/' | grep -v gradlew)
PYTHON_FILES = $(shell find . -name '*.py' ! -path './repos/*' -not -path "./.workdir/*" | grep -v '/__pycache__/' | grep -v '/.git/' | grep -v gradlew)

shell-script-style:
shellcheck -e SC2153 -x -P SCRIPTDIR --format=gcc ${SH_SCRIPTS} ${BASH_SCRIPTS}
@@ -49,7 +49,8 @@ clean-everything: clean clean-cache clean-test-cache clean-stored-hashes

# Compresses the cache.
compress-cache:
rm -r cache.tar
if [ ! -d cache ]; then echo "cache does not exist"; exit 1; fi
if [ -f cache.tar ]; then rm -f cache.tar; fi
tar --exclude="lock" -czf cache.tar cache

# Decompresses the cache.
1,001 changes: 1,001 additions & 0 deletions input_data/repos_1000.csv

Large diffs are not rendered by default.

960 changes: 960 additions & 0 deletions input_data/repos_1000_with_hashes.csv

Large diffs are not rendered by default.

15 changes: 11 additions & 4 deletions run.sh
@@ -17,7 +17,7 @@ set -o nounset

REPOS_CSV="$1"
OUT_DIR="$2"
N_REPETITIONS=$3
N_MERGES=$3
CACHE_DIR="${4}"

comparator_flags=""
@@ -91,13 +91,20 @@ java -cp build/libs/astmergeevaluation-all.jar \
"$OUT_DIR/repos_head_passes.csv" \
"$OUT_DIR/merges"

# Sample 20*<n_merges> merges
read -ra merge_comparator_flags <<<"${comparator_flags}"
python3 src/python/merge_tools_comparator.py \
python3 src/python/sample_merges.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--merges_path "$OUT_DIR/merges/" \
--output_dir "$OUT_DIR/merges_sampled/" \
--n_merges "$((20 * "$N_MERGES"))" \
"${merge_comparator_flags[@]}"

python3 src/python/merge_tools_comparator.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--merges_path "$OUT_DIR/merges_sampled/" \
--output_dir "$OUT_DIR/merges_compared/" \
--cache_dir "$CACHE_DIR" \
"${merge_comparator_flags[@]}"

python3 src/python/merge_tester.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
@@ -115,5 +122,5 @@ python3 src/python/latex_output.py \
--tested_merges_path "$OUT_DIR/merges_tested/" \
--full_repos_csv "$REPOS_CSV" \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--n_merges "$N_REPETITIONS" \
--n_merges "$N_MERGES" \
--output_dir "$OUT_DIR"
21 changes: 21 additions & 0 deletions run_1000.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# usage: ./run_1000.sh [-i <machine_id> -n <num_machines>] [-d]
# Runs the stack on all the repositories in input_data/repos_1000.csv.
# The output appears in results-trivial-merges/ .
# <machine_id> optional argument to specify the id of the current machine.
# <num_machines> optional argument to specify the total number of machines used.
# -d optional flag to specify whether to diff the merges.
# Warning: This takes days to run.


set -e
set -o nounset

# Check if cache.tar exists and cache is missing
if [ -f cache.tar ] && [ ! -d cache ]; then
echo "Decompressing cache.tar"
make decompress-cache
fi

./run.sh input_data/repos_1000.csv results-trivial-merges 20 cache "$@"
7 changes: 6 additions & 1 deletion src/python/latex_output.py
@@ -156,6 +156,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches,too-many-statem

try:
merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
if len(merges) == 0:
raise pd.errors.EmptyDataError
except pd.errors.EmptyDataError:
print(
"latex_output: Skipping",
@@ -487,7 +489,10 @@ def main(): # pylint: disable=too-many-locals,too-many-branches,too-many-statem
)
if not os.path.isfile(merge_list_file):
continue
merges = pd.read_csv(merge_list_file, index_col=0)
try:
merges = pd.read_csv(merge_list_file, index_col=0)
except pd.errors.EmptyDataError:
continue
if len(merges) > 0:
repos += 1
count += len(merges)
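Note on the fix above: a header-only CSV parses cleanly in pandas but yields zero rows, so the new length check re-raises EmptyDataError to route the no-rows case through the same skip path as a truly empty file. A minimal sketch of the pattern (the helper name is hypothetical):

import pandas as pd

def read_merges_or_none(merge_list_file):
    # A header-only CSV raises no error but carries no merges;
    # treat it exactly like an empty file.
    try:
        merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
        if len(merges) == 0:
            raise pd.errors.EmptyDataError
    except pd.errors.EmptyDataError:
        return None
    return merges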
14 changes: 9 additions & 5 deletions src/python/merge_tester.py
@@ -57,8 +57,9 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
for branch in ["left", "right"]:
commit_sha = merge_data[branch]
repo = Repository(repo_slug, cache_directory=cache_directory)
repo.checkout(commit_sha)
tree_fingerprint = repo.compute_tree_fingerprint()
test_result, tree_fingerprint = repo.checkout_and_test(
commit_sha, TIMEOUT_TESTING_PARENT, N_TESTS
)
if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]:
raise Exception(
"merge_tester: Tree fingerprint mismatch",
@@ -70,11 +71,13 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
merge_data[f"{branch}_tree_fingerprint"],
repo.path,
)
test_result = repo.test(TIMEOUT_TESTING_PARENT, N_TESTS)
merge_data[f"{branch} test result"] = test_result.name
if test_result != TEST_STATE.Tests_passed:
return merge_data

print(
"merge_tester: Parents pass", repo_slug, merge_data["left"], merge_data["right"]
)
merge_data["parents pass"] = True

for merge_tool in MERGE_TOOL:
@@ -109,6 +112,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
# Update the status from merge success to test result.
merge_data[merge_tool.name] = result.name
assert merge_tool.name in merge_data
print("merge_tester: Finished", repo_slug, merge_data["left"], merge_data["right"])
return merge_data


@@ -204,7 +208,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
for i in tqdm(range(len(merge_tester_arguments))):
repo_slug = merge_tester_arguments[i][0]
merge_results = merge_tester_results[i]
if merge_results["parents pass"]:
if len(merge_results) > 0 and merge_results["parents pass"]:
n_merges_parents_pass += 1
repo_result[repo_slug].append(merge_results)

@@ -229,7 +233,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
df.sort_index(inplace=True)
df.to_csv(output_file, index_label="idx")
n_total_merges += len(df)
n_total_merges_parents_pass += len(df[df["parents pass"]])
n_total_merges_parents_pass += len(df[df["parents pass"]]) if len(df) > 0 else 0

print("merge_tester: Number of newly tested merges:", len(merge_tester_arguments))
print(
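Note on the length guard above: a merge result row can now come back empty, and indexing an empty pandas Series with "parents pass" raises a KeyError, so the guard must run first. A small illustration, assuming a pandas Series result as in the script:

import pandas as pd

merge_results = pd.Series(dtype=object)  # an empty result row
# Short-circuit: the "parents pass" lookup only happens on non-empty rows.
if len(merge_results) > 0 and merge_results["parents pass"]:
    print("parents pass")
else:
    print("skipped: empty or failing result")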
15 changes: 2 additions & 13 deletions src/python/merge_tools_comparator.py
@@ -4,16 +4,10 @@
--merges_path <path_to_merges>
--output_dir <output_dir>
--cache_dir <cache_dir>
--include_trivial_merges (optional flag)
--only_trivial_merges (optional flag)
This script flags merges that have different results for different merge tools.
The output is written in output_dir and consists of the same files as the input
files, but with an additional column that indicates whether the merge tools
differ.
If the flag --include_trivial_merges is set, then the script will also output
merges that are trivial.
If the flag --only_trivial_merges is set, then the script will only output
merges that are trivial.
"""

import os
@@ -159,12 +153,13 @@ def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
raise Exception(
print(
"merge_tools_comparator:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)
return []

if output_file.exists():
print(
@@ -190,10 +185,6 @@ def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
)
merges["notes"].replace(np.nan, "", inplace=True)

if args.only_trivial_merges:
merges = merges[merges["notes"].str.contains("a parent is the base")]
elif not args.include_trivial_merges:
merges = merges[~merges["notes"].str.contains("a parent is the base")]
arguments = [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
@@ -208,8 +199,6 @@ def build_merge_arguments(args: argparse.Namespace, repo_slug: str):
parser.add_argument("--merges_path", type=Path)
parser.add_argument("--output_dir", type=Path)
parser.add_argument("--cache_dir", type=Path, default="cache/merges/")
parser.add_argument("--include_trivial_merges", action="store_true")
parser.add_argument("--only_trivial_merges", action="store_true")
args = parser.parse_args()
Path(args.cache_dir).mkdir(parents=True, exist_ok=True)
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
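Note on the raise-to-print change above: returning an empty argument list instead of raising lets a repository with no merge list degrade gracefully, since an empty list contributes nothing when the per-repository argument lists are concatenated. A sketch under that assumption (the aggregation loop is hypothetical; build_merge_arguments is the script's own function):

merger_arguments = []
for repo_slug in repo_slugs:
    # A missing merge list now yields [] rather than aborting the whole run.
    merger_arguments += build_merge_arguments(args, repo_slug)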
17 changes: 10 additions & 7 deletions src/python/repo.py
@@ -124,7 +124,7 @@ def __init__(
self.workdir = WORKDIR_DIRECTORY / workdir_id
self.workdir.mkdir(parents=True, exist_ok=True)
self.repo_path = self.workdir / self.path.name
shutil.copytree(self.path, self.repo_path)
shutil.copytree(self.path, self.repo_path, symlinks=True)
self.repo = Repo(self.repo_path)
self.test_cache_directory = cache_directory / "test_cache"
self.sha_cache_directory = cache_directory / "sha_cache_entry"
@@ -460,7 +460,7 @@ def _checkout_and_test(
commit: str,
timeout: int,
n_tests: int,
) -> TEST_STATE:
) -> Tuple[TEST_STATE, Union[str, None]]:
"""Helper function for `checkout_and_test`,
which checks out the given commit and tests the repository.
This function does not check the cache.
@@ -470,19 +470,21 @@ def _checkout_and_test(
n_tests (int): The number of times to run the test suite.
Returns:
TEST_STATE: The result of the test.
Union[str,None]: The tree fingerprint of the result.
"""
result, explanation = self.checkout(commit)
if not result:
print("Checkout failed for", self.repo_slug, commit, explanation)
return TEST_STATE.Git_checkout_failed
return self.test(timeout, n_tests)
return TEST_STATE.Git_checkout_failed, None
sha = self.compute_tree_fingerprint()
return self.test(timeout, n_tests), sha

def checkout_and_test(
self,
commit: str,
timeout: int,
n_tests: int,
) -> TEST_STATE:
) -> Tuple[TEST_STATE, Union[str, None]]:
"""Checks out the given commit and tests the repository.
Args:
commit (str): The commit to checkout.
@@ -491,16 +493,17 @@ def checkout_and_test(
check_cache (bool, optional) = True: Whether to check the cache.
Returns:
TEST_STATE: The result of the test.
Union[str,None]: The tree fingerprint of the result.
"""
sha_cache_entry = self.get_sha_cache_entry(commit, start_merge=True)
if sha_cache_entry is None:
return self._checkout_and_test(commit, timeout, n_tests)
if sha_cache_entry["sha"] is None:
return TEST_STATE.Git_checkout_failed
return TEST_STATE.Git_checkout_failed, None
result = self.get_test_cache_entry(sha_cache_entry["sha"])
if result is None:
return self._checkout_and_test(commit, timeout, n_tests)
return result
return result, sha_cache_entry["sha"]

def test(self, timeout: int, n_tests: int) -> TEST_STATE:
"""Tests the repository. The test results of multiple runs is combined into one result.
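Note on the signature change above: checkout_and_test and its helper now return the tree fingerprint alongside the test state, so callers such as merge_tester.py can validate the checked-out tree in one call instead of a separate checkout, fingerprint, and test sequence. A sketch of the new calling convention (expected_fingerprint and the failure handler are hypothetical; the other names are the repository's own):

test_result, fingerprint = repo.checkout_and_test(commit_sha, timeout=TIMEOUT_TESTING, n_tests=3)
if fingerprint is None:
    # Checkout failed; test_result is TEST_STATE.Git_checkout_failed.
    handle_checkout_failure()
elif fingerprint != expected_fingerprint:
    raise Exception("Tree fingerprint mismatch", fingerprint, expected_fingerprint)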
79 changes: 79 additions & 0 deletions src/python/sample_merges.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
""" Samples n_merges for each repository.
usage: python3 sample_merges.py --repos_head_passes_csv <path_to_repos_head_passes.csv>
--merges_path <path_to_merges>
--output_dir <output_dir>
--include_trivial_merges (optional)
--only_trivial_merges (optional)
This script samples n_merges for each repository.
If the flag --include_trivial_merges is set, then the script will also output
merges that are trivial.
If the flag --only_trivial_merges is set, then the script will only output
merges that are trivial.
"""

import os
import argparse
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import numpy as np
from cache_utils import slug_repo_name

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--repos_head_passes_csv", type=Path)
parser.add_argument("--merges_path", type=Path)
parser.add_argument("--output_dir", type=Path)
parser.add_argument("--n_merges", type=int, default=100)
parser.add_argument("--include_trivial_merges", action="store_true")
parser.add_argument("--only_trivial_merges", action="store_true")
args = parser.parse_args()

repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
repo_slug = repository_data["repository"]
merge_list_file = Path(
os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
)
output_file = Path(
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
print(
"sample_merges:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)
continue

if output_file.exists():
print(
"sample_merges: Skipping",
repo_slug,
"because it is already computed.",
)
continue
try:
merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
except pd.errors.EmptyDataError:
print(
"sample_merges: Skipping",
repo_slug,
"because it does not contain any merges.",
)
continue

merges["notes"].replace(np.nan, "", inplace=True)
if args.only_trivial_merges:
merges = merges[merges["notes"].str.contains("a parent is the base")]
elif not args.include_trivial_merges:
merges = merges[~merges["notes"].str.contains("a parent is the base")]

n_merges = min(merges.shape[0], args.n_merges)
sample = merges.sample(n_merges, random_state=42)
sample.sort_index(inplace=True)
sample.to_csv(output_file)
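Note on the sampling above: the fixed random_state makes each repository's sample reproducible, so reruns cannot silently change which merges are studied, and the min() caps the sample when a repository has fewer merges than requested. A toy illustration of the default non-trivial filter plus seeded sampling:

import pandas as pd

merges = pd.DataFrame({"notes": ["", "a parent is the base", "", ""]})
# Default behavior: drop merges flagged as trivial.
non_trivial = merges[~merges["notes"].str.contains("a parent is the base")]
# A fixed seed returns the same rows on every run.
sample = non_trivial.sample(min(len(non_trivial), 2), random_state=42)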
2 changes: 1 addition & 1 deletion src/python/test_repo_heads.py
@@ -43,7 +43,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> TEST_STATE:
print("test_repo_heads:", repo_slug, ": head_passes_tests : started")

repo = Repository(repo_slug, cache_directory=cache)
test_state = repo.checkout_and_test(
test_state, _ = repo.checkout_and_test(
repo_info["head hash"], timeout=TIMEOUT_TESTING, n_tests=3
)
print("test_repo_heads:", repo_slug, ": head_passes_tests : returning", test_state)
2 changes: 1 addition & 1 deletion src/python/write_head_hashes.py
@@ -122,7 +122,7 @@ def get_latest_hash(args):
print("write_head_hashes: Finished cloning repos and collecting head hashes")

result_df = pd.DataFrame([i for i in get_latest_hash_result if i is not None])
result_df = result_df.set_index(result_df.columns[0]).reset_index(drop=True)
result_df = result_df.reset_index(drop=True)
print("write_head_hashes: Started storing repo HEAD hashes")
result_df.to_csv(args.output_path, index_label="idx")
print("write_head_hashes: Finished storing repo HEAD hashes")
