Commit 0fe7226 (1 parent: 98ca72c)
Showing 25 changed files with 529 additions and 534 deletions.
@@ -0,0 +1,2 @@
idx,repository,language,architecture,community,continuous_integration,documentation,history,issues,license,size,unit_test,stars,scorebased_org,randomforest_org,scorebased_utl,randomforest_utl
100897,enonic/xp,Java,0.992782,5,0,0.067102,451.0,133.0,1,263754,0.254581,95,1,1,1,1
@@ -0,0 +1,2 @@
idx,repository,language,architecture,community,continuous_integration,documentation,history,issues,license,size,unit_test,stars,scorebased_org,randomforest_org,scorebased_utl,randomforest_utl,head hash
0,enonic/xp,Java,0.992782,5,0,0.067102,451.0,133.0,1,263754,0.254581,95,1,1,1,1,0cb08cf3dbf8fb71d82c3665229c3be6280c115f
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# usage: ./run_xp.sh [-i <machine_id> -n <num_machines>] [-d]
# Runs the stack on all the repositories.
# The output appears in result/.
# <machine_id> optional argument to specify the id of the current machine.
# <num_machines> optional argument to specify the total number of machines used.
# -d optional flag to specify whether to diff the merges.
# Warning: This takes days to run.


set -e
set -o nounset

# Check whether cache.tar exists but the cache directory is missing.
if [ -f cache.tar ] && [ ! -d cache ]; then
  echo "Decompressing cache.tar"
  make decompress-cache
fi

./run.sh input_data/repos_xp.csv results-xp 20 cache "$@"
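For illustration only, a hedged sketch of how this wrapper might be invoked. The flags follow the usage comment above; the idea that -i/-n split the repositories across machines is an assumption based on that comment, not something this commit demonstrates.

# Hypothetical: machine 0 of 2, with merge diffing enabled (-d).
./run_xp.sh -i 0 -n 2 -d

# Default: run every repository on a single machine.
./run_xp.sh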
@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""Analyze the merges, i.e., check whether the parents pass tests and compute statistics about the merges.
usage: python3 merge_analyzer.py --repos_head_passes_csv <path_to_repos_head_passes.csv>
                                 --merges_path <path_to_merges>
                                 --output_dir <output_dir>
                                 --cache_dir <cache_dir>
This script analyzes the merges: it checks whether the parents of each merge pass
tests and it computes statistics about each merge.
The output is written to output_dir and consists of the same merges as the input,
augmented with the test results and statistics.
"""

import os
import multiprocessing
import subprocess
import argparse
from pathlib import Path
from functools import partialmethod
from typing import Tuple
import random
import numpy as np
import pandas as pd
from repo import Repository, MERGE_TOOL, TEST_STATE, MERGE_STATE
from tqdm import tqdm
from cache_utils import set_in_cache, lookup_in_cache, slug_repo_name
from write_head_hashes import num_processes
from variables import TIMEOUT_MERGING, TIMEOUT_TESTING_PARENT, N_TESTS

if os.getenv("TERM", "dumb") == "dumb":
    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore


def is_test_passed(test_state: str) -> bool:
    """Returns true if the test state indicates passed tests."""
    return test_state == TEST_STATE.Tests_passed.name


def merge_analyzer(  # pylint: disable=too-many-locals
    args: Tuple[str, pd.Series, Path]
) -> pd.Series:
    """
    Analyzes one merge: checks out both parents, measures their diff, and tests each parent.
    Args:
        args (Tuple[str, pd.Series, Path]): A tuple containing the repo slug,
            the merge data, and the cache path.
    Returns:
        pd.Series: The input merge data augmented with the analysis results.
    """
    repo_slug, merge_data, cache_directory = args

    cache_key = merge_data["left"] + "_" + merge_data["right"]
    merge_cache_directory = cache_directory / "merge_analysis"

    cache_data = lookup_in_cache(cache_key, repo_slug, merge_cache_directory, True)
    if cache_data is not None and isinstance(cache_data, dict):
        for key, value in cache_data.items():
            merge_data[key] = value
        return merge_data

    cache_data = {}
    repo_left = Repository(repo_slug, cache_directory=cache_directory)
    repo_right = Repository(repo_slug, cache_directory=cache_directory)
    left_success, _ = repo_left.checkout(merge_data["left"])
    right_success, _ = repo_right.checkout(merge_data["right"])

    # Compute diff size in lines between left and right
    assert repo_left.repo_path.exists()
    assert repo_right.repo_path.exists()
    process = subprocess.run(
        ["diff", "-r", str(repo_left.repo_path), str(repo_right.repo_path)],
        stdout=subprocess.PIPE,
        text=True,
    )

    diff_size = len(process.stdout.split("\n")) if process.stdout else 0
    cache_data["diff_size"] = diff_size

    # List all files that are different between left and right
    process = subprocess.run(
        ["diff", "-r", "--brief", str(repo_left.repo_path), str(repo_right.repo_path)],
        stdout=subprocess.PIPE,
        text=True,
    )

    diff_files = process.stdout.split("\n") if process.stdout else []
    diff_files = [line.split()[-1] for line in diff_files if line]

    # Check if diff contains a java file
    contains_java_file = any(file.endswith(".java") for file in diff_files)
    cache_data["diff_contains_java_file"] = contains_java_file

    # Test left parent
    if not left_success:
        cache_data["left test result"] = TEST_STATE.Git_checkout_failed.name
        cache_data["left_tree_fingerprint"] = None
        cache_data["parents pass"] = False
    else:
        cache_data["left_tree_fingerprint"] = repo_left.compute_tree_fingerprint()
        cache_data["left test result"] = repo_left.test(
            TIMEOUT_TESTING_PARENT, N_TESTS
        ).name
        cache_data["parents pass"] = is_test_passed(cache_data["left test result"])

    # Test right parent
    if not right_success:
        cache_data["right test result"] = TEST_STATE.Git_checkout_failed.name
        cache_data["right_tree_fingerprint"] = None
        cache_data["parents pass"] = False
    else:
        cache_data["right_tree_fingerprint"] = repo_right.compute_tree_fingerprint()
        cache_data["right test result"] = repo_right.test(
            TIMEOUT_TESTING_PARENT, N_TESTS
        ).name
        cache_data["parents pass"] = cache_data["parents pass"] and is_test_passed(
            cache_data["right test result"]
        )

    cache_data["test merge"] = (
        cache_data["parents pass"] and cache_data["diff_contains_java_file"]
    )

    set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory)

    for key, value in cache_data.items():
        merge_data[key] = value

    return merge_data


def build_merge_analyzer_arguments(args: argparse.Namespace, repo_slug: str):
    """
    Creates the arguments for the merger function.
    Args:
        args (argparse.Namespace): The arguments to the script.
        repo_slug (str): The repository slug.
    Returns:
        list: A list of arguments for the merger function.
    """
    merge_list_file = Path(
        os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
    )
    output_file = Path(
        os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
    )
    if not merge_list_file.exists():
        print(
            "merge_analyzer:",
            repo_slug,
            "does not have a list of merges. Missing file: ",
            merge_list_file,
        )
        return []

    if output_file.exists():
        print(
            "merge_analyzer: Skipping",
            repo_slug,
            "because it is already computed.",
        )
        return []

    merges = pd.read_csv(
        merge_list_file,
        names=["idx", "branch_name", "merge", "left", "right", "notes"],
        dtype={
            "idx": int,
            "branch_name": str,
            "merge": str,
            "left": str,
            "right": str,
            "notes": str,
        },
        header=0,
        index_col="idx",
    )
    merges["notes"].replace(np.nan, "", inplace=True)

    arguments = [
        (repo_slug, merge_data, Path(args.cache_dir))
        for _, merge_data in merges.iterrows()
    ]
    return arguments


if __name__ == "__main__":
    print("merge_analyzer: Start")
    parser = argparse.ArgumentParser()
    parser.add_argument("--repos_head_passes_csv", type=Path)
    parser.add_argument("--merges_path", type=Path)
    parser.add_argument("--output_dir", type=Path)
    parser.add_argument("--cache_dir", type=Path, default="cache/merges/")
    args = parser.parse_args()
    Path(args.cache_dir).mkdir(parents=True, exist_ok=True)
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

    print("merge_analyzer: Constructing Inputs")
    merger_arguments = []
    for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
        repo_slug = repository_data["repository"]
        merger_arguments += build_merge_analyzer_arguments(args, repo_slug)

    # Shuffle input to reduce cache contention
    random.seed(42)
    random.shuffle(merger_arguments)

    print("merge_analyzer: Finished Constructing Inputs")
    # New merges are merges whose analysis does not appear in the output folder.
    print("merge_analyzer: Number of new merges:", len(merger_arguments))

    print("merge_analyzer: Started Merging")
    with multiprocessing.Pool(processes=num_processes()) as pool:
        merger_results = list(
            tqdm(
                pool.imap(merge_analyzer, merger_arguments), total=len(merger_arguments)
            )
        )
    print("merge_analyzer: Finished Merging")

    repo_result = {repo_slug: [] for repo_slug in repos["repository"]}
    print("merge_analyzer: Constructing Output")
    n_new_analyzed = 0
    n_new_to_test = 0
    for i in tqdm(range(len(merger_arguments))):
        repo_slug = merger_arguments[i][0]
        results_data = merger_results[i]

        repo_result[repo_slug].append(merger_results[i])
        n_new_analyzed += 1
        if results_data["test merge"]:
            n_new_to_test += 1

    n_total_analyzed = 0
    n_total_to_test = 0
    for repo_slug in repo_result:
        output_file = Path(
            os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
        )
        if output_file.exists():
            try:
                df = pd.read_csv(output_file, header=0)
                n_total_analyzed += len(df)
                n_total_to_test += len(df[df["test merge"]])
            except pd.errors.EmptyDataError:
                print(
                    "merge_analyzer: Skipping",
                    repo_slug,
                    "because it does not contain any merges.",
                )
            continue
        df = pd.DataFrame(repo_result[repo_slug])
        df.sort_index(inplace=True)
        df.to_csv(output_file, index_label="idx")
        n_total_analyzed += len(df)
        n_total_to_test += len(df[df["test merge"]])

    print(
        "merge_analyzer: Number of merge tool outputs that have been newly compared:",
        n_new_analyzed,
    )
    print(
        "merge_analyzer: Number of merge tool outputs that have been newly "
        "compared and are to test:",
        n_new_to_test,
    )
    print(
        "merge_analyzer: Total number of merge tool outputs that have been compared:",
        n_total_analyzed,
    )
    print(
        "merge_analyzer: Total number of merge tool outputs that have been "
        "compared and are to test:",
        n_total_to_test,
    )
    print("merge_analyzer: Finished Constructing Output")
    print("merge_analyzer: Done")