diff --git a/.gitignore b/.gitignore index bafc21a4fa..d4d2855054 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ output.txt replay_logs/ my.secrets +repos-small-test output/ merge_repo/ merges_small_valid_subsamples/ diff --git a/Makefile b/Makefile index 406c9ad7bd..f0556ebc2a 100644 --- a/Makefile +++ b/Makefile @@ -33,11 +33,13 @@ check-python-style: # This target deletes files that are not committed to version control. clean: ${MAKE} clean-workdir - rm -rf repos + rm -rf repos-small-test rm -rf scratch rm -rf results/small rm -rf .valid_merges_counters +clean-repos: + rm -rf repos # This target deletes files in the cache, which is commited to version control. clean-cache: rm -rf cache @@ -95,24 +97,19 @@ copy-paper: rsync -av --exclude='*.csv' results ../AST-Merging-Evaluation-Paper/ find ../AST-Merging-Evaluation-Paper/ -type d -empty -delete -# Update cache -update-cache-results: - python3 src/python/cache_merger.py - make compress-cache - # As of 2023-07-31, this takes 5-20 minutes to run, depending on your machine. small-test: ${MAKE} clean-test-cache clean - ./run_small.sh --include_trivial_merges --no_timing + AST_REPOS_PATH=repos-small-test ./run_small.sh --include_trivial_merges --no_timing ${MAKE} compress-small-cache ${MAKE} small-test-diff rm -rf results/small - ./run_small.sh --include_trivial_merges --no_timing + AST_REPOS_PATH=repos-small-test ./run_small.sh --include_trivial_merges --no_timing ${MAKE} small-test-diff small-test-without-cleaning: ${MAKE} clean-test-cache - ./run_small.sh --include_trivial_merges --no_timing + AST_REPOS_PATH=repos-small-test ./run_small.sh --include_trivial_merges --no_timing ${MAKE} small-test-diff update-figures: @@ -147,7 +144,7 @@ clean-workdir: clean-local: ${MAKE} clean-workdir - rm -rf repos + rm -rf repos-small-test check-merges-reproducibility: @echo "Running replay_merge sequentially for each idx..." diff --git a/README.md b/README.md index c31b122303..2ed847a3fe 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,6 @@ To run style checking run `make style`. To investigate differences between two mergers: -* edit file `src/python/select_from_results.py` to reflect the differences you are interested in. -* run `src/python/select_from_results.py` to create a .csv database containing only the differences. +* edit file `src/python/utils/select_from_results.py` to reflect the differences you are interested in. +* run `src/python/utils/select_from_results.py` to create a .csv database containing only the differences. * run `src/python/replay_merge.py --idx INDEX` (maybe add `-test`) for the index of the merge you are interested in. diff --git a/run.sh b/run.sh index 1da71605a8..4bd5325994 100755 --- a/run.sh +++ b/run.sh @@ -146,17 +146,12 @@ mkdir -p "$OUT_DIR" if [ -d "$CACHE_DIR" ]; then find "$CACHE_DIR" -name "*.lock" -delete fi -if [ -d "repos" ]; then - find "repos/locks" -name "*.lock" -delete +REPOS_PATH=${AST_REPOS_PATH:-repos} +if [ -d "$REPOS_PATH" ]; then + find "$REPOS_PATH" -name "*.lock" -delete fi -# Check if .workdir exists and delete it -if [ -d .workdir ]; then - chmod -R +w .workdir - rm -rf .workdir -fi - -python3 src/python/delete_cache_placeholders.py \ +python3 src/python/utils/delete_cache_placeholders.py \ --cache_dir "$CACHE_DIR" python3 src/python/write_head_hashes.py \ diff --git a/src/python/cache_merger.py b/src/python/cache_merger.py deleted file mode 100755 index 9e55878f32..0000000000 --- a/src/python/cache_merger.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Merge multiple caches into one. Usage: -python3 cache_merger.py ... --output_cache -""" - -import shutil -import json -from argparse import ArgumentParser -from pathlib import Path -from typing import List -from rich.progress import ( - Progress, - SpinnerColumn, - BarColumn, - TimeElapsedColumn, - TimeRemainingColumn, - TextColumn, -) - - -def merge_json_data(paths: List[Path], output_path: Path): - """Merge multiple json files into one""" - data = {} - for path in paths: - if path.exists(): - with path.open("r") as f: - data.update(json.load(f)) - if not output_path.parent.exists(): - output_path.parent.mkdir(parents=True, exist_ok=True) - with output_path.open("w") as f: - json.dump(data, f, indent=4, sort_keys=True) - - -def copy_file(source: Path, destination: Path): - """Copy a file from source to destination""" - if not destination.exists(): - destination.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(source, destination) - - -def process_directory(directory: Path, other_caches: List[Path], output_cache: Path): - """Process a directory recursively""" - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TimeElapsedColumn(), - TimeRemainingColumn(), - ) as progress: - task = progress.add_task("Traversing dir...", total=directory.stat().st_size) - for path in directory.rglob("*"): - if path.is_file(): - # Skip the first part of the path (the cache name) - relative_path = Path(*path.parts[1:]) - corresponding_paths = [ - cache / relative_path - for cache in other_caches - if (cache / relative_path).exists() - ] - if path.suffix == ".json": - merge_json_data( - [path] + corresponding_paths, output_cache / relative_path - ) - elif path.suffix != ".lock": - copy_file(path, output_cache / relative_path) - progress.update(task, advance=path.stat().st_size) - - -def merge_caches(caches: List[Path], output_cache: Path): - """Merge multiple caches into one""" - if not output_cache.exists(): - output_cache.mkdir(parents=True, exist_ok=True) - - for cache in caches: - process_directory(cache, [c for c in caches if c != cache], output_cache) - - -if __name__ == "__main__": - parser = ArgumentParser(description="Merge multiple caches into one") - parser.add_argument("caches", type=Path, nargs="+", help="List of caches to merge") - parser.add_argument( - "--output_cache", type=Path, help="Output cache", default="cache" - ) - args = parser.parse_args() - merge_caches(args.caches, args.output_cache) diff --git a/src/python/delete_cache_entry_of_merge_tests.py b/src/python/delete_cache_entry_of_merge_tests.py index cf7dea429d..2cc1556322 100644 --- a/src/python/delete_cache_entry_of_merge_tests.py +++ b/src/python/delete_cache_entry_of_merge_tests.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Delete the keys containing 'imports' in the JSON files in the given directory.""" +"""Delete the keys containing the SHA of the merge tests from the cache.""" import pandas as pd from pathlib import Path diff --git a/src/python/get-merge-output.py b/src/python/get-merge-output.py deleted file mode 100755 index 9b7cfa1118..0000000000 --- a/src/python/get-merge-output.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -"""Given an index and tool, creates the merge for that index and tool.""" - -import argparse -import os -import shutil -import subprocess - -from pathlib import Path - -import pandas as pd -from git import Repo - - -CLONE_ROOT = "/scratch/mernst/ast-merging-clones/" -if not Path(CLONE_ROOT).is_dir(): - os.makedirs(CLONE_ROOT) - -parser = argparse.ArgumentParser("get-merge-output") -parser.add_argument( - "index", help="The index of the row whose merge to recreate.", type=int -) -parser.add_argument("tool", help="The name of the merge tool to use.", type=str) -args = parser.parse_args() - -df = pd.read_csv("../../results/combined/result.csv", index_col="idx") - -row = df.iloc[args.index] - -slug = row["repository"] -print(slug) -left_sha = row["left"] -right_sha = row["right"] - -slug_split = slug.split("/") -repo_org = slug_split[0] -repo_name = slug_split[1] - -clone_parent_dir_name = CLONE_ROOT + slug + "/" + row["merge"] + "/" + args.tool -clone_parent_dir = Path(clone_parent_dir_name) -if not clone_parent_dir.is_dir(): - os.makedirs(clone_parent_dir) -print("clone_parent_dir", clone_parent_dir) -clone_dir_name = clone_parent_dir_name + "/" + repo_name -clone_dir = Path(clone_dir_name) - - -if clone_dir.is_dir(): - shutil.rmtree(clone_dir) - -clone_repo = Repo.clone_from("https://github.com/" + slug + ".git", clone_dir) -clone_repo.git.checkout(row["branch_name"]) - -clone_repo.git.checkout(left_sha) -clone_repo.git.checkout("-b", "left-branch-for-merge") -clone_repo.git.checkout(right_sha) -clone_repo.git.checkout("-b", "right-branch-for-merge") - -subprocess.run( - [ - "../scripts/merge_tools/" + args.tool + ".sh", - clone_dir, - "left-branch-for-merge", - "right-branch-for-merge", - ] -) diff --git a/src/python/repo.py b/src/python/repo.py index 75c06fce1f..2e9134ab00 100755 --- a/src/python/repo.py +++ b/src/python/repo.py @@ -239,7 +239,7 @@ def copy_repo(self) -> None: if not self.repo_path.exists(): self.clone_repo() if self.local_repo_path.exists(): - return + shutil.rmtree(self.local_repo_path, ignore_errors=True) self.workdir.mkdir(parents=True, exist_ok=True) shutil.copytree( self.repo_path, diff --git a/src/python/add_jacoco_gradle.py b/src/python/utils/add_jacoco_gradle.py similarity index 100% rename from src/python/add_jacoco_gradle.py rename to src/python/utils/add_jacoco_gradle.py diff --git a/src/python/add_jacoco_maven.py b/src/python/utils/add_jacoco_maven.py similarity index 100% rename from src/python/add_jacoco_maven.py rename to src/python/utils/add_jacoco_maven.py diff --git a/src/python/utils/delete_adjacent_keys_from_cache.py b/src/python/utils/delete_adjacent_keys_from_cache.py deleted file mode 100644 index 968c34d600..0000000000 --- a/src/python/utils/delete_adjacent_keys_from_cache.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- coding: utf-8 -*- -"""Delete the keys containing 'imports' in the JSON files in the given directory.""" - -import os -import sys -import json -from pathlib import Path -from argparse import ArgumentParser - - -def count_import_keys(directory): - """Count the number of keys containing 'imports' in the JSON files in the given directory.""" - count = 0 - for root, _, files in os.walk(directory): - json_files = [f for f in files if f.endswith(".json")] - for json_file in json_files: - file_path = os.path.join(root, json_file) - with open(file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - # Count keys containing 'adjacent' - keys_to_delete = [key for key in data if "adjacent" in key] - count += len(keys_to_delete) - return count - - -def delete_import_keys(directory): - """Delete the keys containing 'imports' in the JSON files in the given directory.""" - total_deleted = 0 - for root, _, files in os.walk(directory): - json_files = [f for f in files if f.endswith(".json")] - for json_file in json_files: - file_path = os.path.join(root, json_file) - with open(file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - # Record keys to delete - keys_to_delete = [key for key in data.keys() if "adjacent" in key] - if keys_to_delete: - for key in keys_to_delete: - del data[key] - total_deleted += 1 - - # Save the modified data back to file - with open(file_path, "w", encoding="utf-8") as file: - json.dump(data, file, indent=4) - - return total_deleted - - -def main(): - """Main function.""" - parser = ArgumentParser() - parser.add_argument( - "--cache", - type=str, - default="cache", - help="The cache directory to delete keys from.", - ) - parser.add_argument( - "-y", "--yes", action="store_true", help="Skip the confirmation prompt." - ) - args = parser.parse_args() - cache_dir = Path(args.cache) - potential_deletions = count_import_keys(cache_dir) - if not cache_dir.exists(): - print(f"Directory '{cache_dir}' does not exist.") - return - - if not args.yes: - potential_deletions = count_import_keys(args.cache) - print(f"Potential deletions: {potential_deletions}") - confirm = input("Do you want to proceed with deleting these keys? (yes/no): ") - if confirm.lower() != "yes": - print("Operation cancelled.") - sys.exit(0) - - total_deleted = delete_import_keys(args.cache) - print(f"Total keys deleted: {total_deleted}") - - -if __name__ == "__main__": - main() diff --git a/src/python/delete_cache_placeholders.py b/src/python/utils/delete_cache_placeholders.py similarity index 100% rename from src/python/delete_cache_placeholders.py rename to src/python/utils/delete_cache_placeholders.py diff --git a/src/python/utils/delete_import_keys_from_cache.py b/src/python/utils/delete_import_keys_from_cache.py deleted file mode 100644 index 29421d28bc..0000000000 --- a/src/python/utils/delete_import_keys_from_cache.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -"""Delete the keys containing 'imports' in the JSON files in the given directory.""" - -import os -import json -from pathlib import Path - - -def count_import_keys(directory: Path) -> int: - """Count the number of keys containing 'imports' in the JSON files in the given directory.""" - count = 0 - for root, _, files in os.walk(directory): - json_files = [f for f in files if f.endswith(".json")] - for json_file in json_files: - file_path = os.path.join(root, json_file) - with open(file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - # Count keys containing 'imports' - keys_to_delete = [key for key in data if "imports" in key] - count += len(keys_to_delete) - return count - - -def delete_import_keys(directory: Path) -> int: - """Delete the keys containing 'imports' in the JSON files in the given directory.""" - total_deleted = 0 - for root, _, files in os.walk(directory): - json_files = [f for f in files if f.endswith(".json")] - for json_file in json_files: - file_path = os.path.join(root, json_file) - with open(file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - # Record keys to delete - keys_to_delete = [key for key in data.keys() if "imports" in key] - if keys_to_delete: - for key in keys_to_delete: - del data[key] - total_deleted += 1 - - # Save the modified data back to file - with open(file_path, "w", encoding="utf-8") as file: - json.dump(data, file, indent=4) - - return total_deleted - - -def main(): - """Main function.""" - directory = "cache/merge_timing_results" - potential_deletions = count_import_keys(directory) - print(f"Potential deletions: {potential_deletions}") - confirm = input("Do you want to proceed with deleting these keys? (yes/no): ") - if confirm.lower() == "yes": - total_deleted = delete_import_keys(directory) - print(f"Total keys deleted: {total_deleted}") - else: - print("Operation cancelled.") - - -if __name__ == "__main__": - main() diff --git a/src/python/utils/delete_intellimerge_keys_from_cache.py b/src/python/utils/delete_intellimerge_keys_from_cache.py deleted file mode 100644 index 63d7e18866..0000000000 --- a/src/python/utils/delete_intellimerge_keys_from_cache.py +++ /dev/null @@ -1,82 +0,0 @@ -# -*- coding: utf-8 -*- -"""Delete the keys containing 'imports' in the JSON files in the given directory.""" - -import os -import sys -import json -from pathlib import Path -from argparse import ArgumentParser - - -def count_import_keys(directory): - """Count the number of keys containing 'imports' in the JSON files in the given directory.""" - count = 0 - for root, _, files in os.walk(directory): - json_files = [f for f in files if f.endswith(".json")] - for json_file in json_files: - file_path = os.path.join(root, json_file) - with open(file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - # Count keys containing 'adjacent' - keys_to_delete = [key for key in data if "intellimerge" in key] - count += len(keys_to_delete) - return count - - -def delete_import_keys(directory): - """Delete the keys containing 'imports' in the JSON files in the given directory.""" - total_deleted = 0 - for root, _, files in os.walk(directory): - json_files = [f for f in files if f.endswith(".json")] - for json_file in json_files: - file_path = os.path.join(root, json_file) - with open(file_path, "r", encoding="utf-8") as file: - data = json.load(file) - - # Record keys to delete - keys_to_delete = [key for key in data.keys() if "intellimerge" in key] - if keys_to_delete: - for key in keys_to_delete: - del data[key] - total_deleted += 1 - - # Save the modified data back to file - with open(file_path, "w", encoding="utf-8") as file: - json.dump(data, file, indent=4) - - return total_deleted - - -def main(): - """Main function.""" - parser = ArgumentParser() - parser.add_argument( - "--cache", - type=str, - default="cache", - help="The cache directory to delete keys from.", - ) - parser.add_argument( - "-y", "--yes", action="store_true", help="Skip the confirmation prompt." - ) - args = parser.parse_args() - cache_dir = Path(args.cache) - if not cache_dir.exists(): - print(f"Directory '{cache_dir}' does not exist.") - return - - if not args.yes: - potential_deletions = count_import_keys(args.cache) - print(f"Potential deletions: {potential_deletions}") - confirm = input("Do you want to proceed with deleting these keys? (yes/no): ") - if confirm.lower() != "yes": - print("Operation cancelled.") - sys.exit(0) - - total_deleted = delete_import_keys(args.cache) - print(f"Total keys deleted: {total_deleted}") - - -if __name__ == "__main__": - main() diff --git a/src/python/utils/delete_plumelib_keys_from_cache.py b/src/python/utils/delete_keys_from_cache.py similarity index 61% rename from src/python/utils/delete_plumelib_keys_from_cache.py rename to src/python/utils/delete_keys_from_cache.py index e70fb9b82d..10194f87ca 100644 --- a/src/python/utils/delete_plumelib_keys_from_cache.py +++ b/src/python/utils/delete_keys_from_cache.py @@ -1,36 +1,37 @@ # -*- coding: utf-8 -*- -"""Delete the keys containing 'imports' in the JSON files in the given directory.""" +"""Delete the keys matching a given regex in the JSON files in the given directory.""" import os import sys import json +import re from pathlib import Path from argparse import ArgumentParser - -def traverse_cache_keys(directory, timing=False, delete=False, key="plumelib"): - """Count the number of keys containing 'imports' in the JSON files in the given directory.""" - count = 0 +def delete_keys_matching_regex(directory:Path, regex:str, dry_run:bool=False): + """Delete the keys matching the given regex in the JSON files in the given directory.""" + total_deleted = 0 + pattern = re.compile(regex) for root, _, files in os.walk(directory): json_files = [f for f in files if f.endswith(".json")] for json_file in json_files: - if not timing and "merge_timing_results" in root: - continue file_path = os.path.join(root, json_file) with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) - keys_to_delete = [k for k in data if key in k] - count += len(keys_to_delete) - if delete: - if keys_to_delete: - for k in keys_to_delete: - del data[k] + # Record keys to delete + keys_to_delete = [key for key in data.keys() if pattern.search(key)] + if keys_to_delete: + for key in keys_to_delete: + del data[key] + total_deleted += 1 + if not dry_run: # Save the modified data back to file with open(file_path, "w", encoding="utf-8") as file: json.dump(data, file, indent=4) - return count + + return total_deleted def main(): @@ -43,28 +44,31 @@ def main(): help="The cache directory to delete keys from.", ) parser.add_argument( - "-timing", - help="Delete the timing results as well.", - action="store_true", + "--regex", + type=str, + required=True, + help="The regex to match keys for deletion.", ) parser.add_argument( "-y", "--yes", action="store_true", help="Skip the confirmation prompt." ) args = parser.parse_args() cache_dir = Path(args.cache) + regex = args.regex + if not cache_dir.exists(): print(f"Directory '{cache_dir}' does not exist.") return if not args.yes: - potential_deletions = traverse_cache_keys(args.cache, timing=args.timing) + potential_deletions = delete_keys_matching_regex(args.cache, regex, dry_run=True) print(f"Potential deletions: {potential_deletions}") confirm = input("Do you want to proceed with deleting these keys? (yes/no): ") if confirm.lower() != "yes": print("Operation cancelled.") sys.exit(0) - total_deleted = traverse_cache_keys(args.cache, timing=args.timing, delete=True) + total_deleted = delete_keys_matching_regex(args.cache, regex) print(f"Total keys deleted: {total_deleted}") diff --git a/src/python/select_from_results.py b/src/python/utils/select_from_results.py similarity index 100% rename from src/python/select_from_results.py rename to src/python/utils/select_from_results.py diff --git a/src/python/variables.py b/src/python/variables.py index 3fc00f30e8..f74bc4254e 100644 --- a/src/python/variables.py +++ b/src/python/variables.py @@ -2,6 +2,7 @@ """Contains all used variables.""" from pathlib import Path +import os BRANCH_BASE_NAME = "___MERGE_TESTER" LEFT_BRANCH_NAME = BRANCH_BASE_NAME + "_LEFT" @@ -9,7 +10,7 @@ CACHE_BACKOFF_TIME = 2 * 60 # 2 minutes, in seconds DELETE_WORKDIRS = True -REPOS_PATH = Path("repos") +REPOS_PATH = Path(os.getenv('AST_REPOS_PATH')) if os.getenv('AST_REPOS_PATH') else Path("repos") WORKDIR_DIRECTORY = Path( ".workdir" ) # Merges and testing will be performed in this directory. diff --git a/src/scripts/run_repo_tests.sh b/src/scripts/run_repo_tests.sh index 8542115c51..749c51ccda 100755 --- a/src/scripts/run_repo_tests.sh +++ b/src/scripts/run_repo_tests.sh @@ -22,13 +22,13 @@ cd "$REPO_DIR" || exit 1 if [ -f "gradlew" ] ; then # Append JaCoCo plugin and task to build.gradle - python3 "$CURR_PATH"/src/python/add_jacoco_gradle.py pom.xml + python3 "$CURR_PATH"/src/python/utils/add_jacoco_gradle.py pom.xml command="./gradlew clean test jacocoTestReport" elif [ -f pom.xml ] ; then # Add Jacoco plugin to pom.xml if ! grep -q "jacoco-maven-plugin" pom.xml ; then echo "Adding Jacoco plugin to pom.xml" - python3 "$CURR_PATH"/src/python/add_jacoco_maven.py pom.xml + python3 "$CURR_PATH"/src/python/utils/add_jacoco_maven.py pom.xml fi mvn -version command="mvn clean jacoco:prepare-agent test jacoco:report" diff --git a/test.sh b/test.sh deleted file mode 100755 index 2c26042d34..0000000000 --- a/test.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -rm -rf replay_output .workdir run_small_output -python3 src/python/utils/delete_intellimerge_keys_from_cache.py --cache cache-small/sha_cache_entry/mangstadt --yes -./run_small.sh --include_trivial_merges --no_timing -cp -r .workdir/mangstadt/ez-vcard/merge-tester-intellimerge-ea6026ee62cc184db68d841d50d58474fcdf4862-ab2032ca9769d452d4906f51cf56ca7d983a27c4 run_small_output -python3 src/python/replay_merge.py --merges_csv results/small/result.csv --idx 1-7 -cp -r .workdir/mangstadt/ez-vcard-merge-replay-intellimerge-ea6026ee62cc184db68d841d50d58474fcdf4862-ab2032ca9769d452d4906f51cf56ca7d983a27c4 replay_output -rm -rf .workdir