
Commit

Added merge analyzer
benedikt-schesch committed Oct 18, 2023
1 parent 98ca72c commit 0fe7226
Showing 25 changed files with 529 additions and 534 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -149,7 +149,7 @@ To run style checking run `make style`.

* latex_output.py -> Output latex code for the resulting plots and table.

- * merge_tools_comparator.py -> Compares merges that produce different output.
+ * merge_analyzer.py -> Analyzes merges: checks whether the parents pass tests and computes statistics about each merge.

* get_repos.py -> Downloads the repos list.

2 changes: 2 additions & 0 deletions input_data/repos_xp.csv
@@ -0,0 +1,2 @@
idx,repository,language,architecture,community,continuous_integration,documentation,history,issues,license,size,unit_test,stars,scorebased_org,randomforest_org,scorebased_utl,randomforest_utl
100897,enonic/xp,Java,0.992782,5,0,0.067102,451.0,133.0,1,263754,0.254581,95,1,1,1,1
2 changes: 2 additions & 0 deletions input_data/repos_xp_with_hashes.csv
@@ -0,0 +1,2 @@
idx,repository,language,architecture,community,continuous_integration,documentation,history,issues,license,size,unit_test,stars,scorebased_org,randomforest_org,scorebased_utl,randomforest_utl,head hash
0,enonic/xp,Java,0.992782,5,0,0.067102,451.0,133.0,1,263754,0.254581,95,1,1,1,1,0cb08cf3dbf8fb71d82c3665229c3be6280c115f
7 changes: 4 additions & 3 deletions run.sh
@@ -100,16 +100,17 @@ python3 src/python/sample_merges.py \
--n_merges "$((20 * "$N_MERGES"))" \
"${merge_comparator_flags[@]}"

- python3 src/python/merge_tools_comparator.py \
+ python3 src/python/merge_analyzer.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
--merges_path "$OUT_DIR/merges_sampled/" \
- --output_dir "$OUT_DIR/merges_compared/" \
+ --output_dir "$OUT_DIR/merges_analyzed/" \
--cache_dir "$CACHE_DIR" \

python3 src/python/merge_tester.py \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
- --merges_path "$OUT_DIR/merges_compared/" \
+ --merges_path "$OUT_DIR/merges_analyzed/" \
--output_dir "$OUT_DIR/merges_tested/" \
+ --n_sampled_merges "$N_MERGES" \
--cache_dir "$CACHE_DIR" \

python3 src/python/merge_differ.py \
21 changes: 21 additions & 0 deletions run_xp.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# usage: ./run_xp.sh [-i <machine_id> -n <num_machines>] [-d]
# Runs the stack on all the repositories.
# The output appears in results-xp/ .
# <machine_id>: optional argument specifying the id of the current machine.
# <num_machines>: optional argument specifying the total number of machines used.
# -d: optional flag specifying whether to also diff the merges.
# Warning: This takes days to run.
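#
# Example (assuming run.sh forwards these flags as the usage line suggests):
#   ./run_xp.sh -i 0 -n 4 -d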


set -e
set -o nounset

# Check if cache.tar exists and cache is missing
if [ -f cache.tar ] && [ ! -d cache ]; then
    echo "Decompressing cache.tar"
    make decompress-cache
fi

./run.sh input_data/repos_xp.csv results-xp 20 cache "$@"
44 changes: 22 additions & 22 deletions src/python/latex_output.py
@@ -435,28 +435,28 @@ def main():  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
) as file:
file.write(table2)

- # Table run time
- table3 = """% Do not edit. This file is automatically generated.
- \\begin{tabular}{c|c|c|c}
- & \\multicolumn{3}{c}{Run time (seconds)} \\\\
- Tool & Mean & Median & Max \\\\
- \\hline\n"""
- 
- for merge_tool in merge_tools:
-     table3 += f" {merge_tool_latex_name(merge_tool):32}"
-     for f in [np.mean, np.median, np.max]:
-         run_time = f(result_df[merge_tool + "_run_time"])
-         if run_time < 10:
-             table3 += f" & {run_time:0.2f}"
-         elif run_time < 100:
-             table3 += f" & {run_time:0.1f}"
-         else:
-             table3 += f" & {round(run_time)}"
-     table3 += " \\\\\n"
- table3 += "\\end{tabular}\n"
- 
- with open(os.path.join(tables_output_path, "table_run_time.tex"), "w") as file:
-     file.write(table3)
+ # # Table run time
+ # table3 = """% Do not edit. This file is automatically generated.
+ # \\begin{tabular}{c|c|c|c}
+ # & \\multicolumn{3}{c}{Run time (seconds)} \\\\
+ # Tool & Mean & Median & Max \\\\
+ # \\hline\n"""
+ 
+ # for merge_tool in merge_tools:
+ #     table3 += f" {merge_tool_latex_name(merge_tool):32}"
+ #     for f in [np.mean, np.median, np.max]:
+ #         run_time = f(result_df[merge_tool + "_run_time"])
+ #         if run_time < 10:
+ #             table3 += f" & {run_time:0.2f}"
+ #         elif run_time < 100:
+ #             table3 += f" & {run_time:0.1f}"
+ #         else:
+ #             table3 += f" & {round(run_time)}"
+ #     table3 += " \\\\\n"
+ # table3 += "\\end{tabular}\n"
+ 
+ # with open(os.path.join(tables_output_path, "table_run_time.tex"), "w") as file:
+ #     file.write(table3)

# Create defs.tex
full_repos_df = pd.read_csv(args.full_repos_csv)
277 changes: 277 additions & 0 deletions src/python/merge_analyzer.py
@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""Analyze merges: check whether the parents pass tests and compute statistics
about each merge.
usage: python3 merge_analyzer.py --repos_head_passes_csv <path_to_repos_head_passes.csv>
                                 --merges_path <path_to_merges>
                                 --output_dir <output_dir>
                                 --cache_dir <cache_dir>
The output is written in output_dir and consists of the same merges as the input,
augmented with the test results and statistics.
"""

import os
import multiprocessing
import subprocess
import argparse
from pathlib import Path
from functools import partialmethod
from typing import Tuple
import random
import numpy as np
import pandas as pd
from repo import Repository, MERGE_TOOL, TEST_STATE, MERGE_STATE
from tqdm import tqdm
from cache_utils import set_in_cache, lookup_in_cache, slug_repo_name
from write_head_hashes import num_processes
from variables import TIMEOUT_MERGING, TIMEOUT_TESTING_PARENT, N_TESTS

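# Disable progress bars when not attached to an interactive terminal.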
if os.getenv("TERM", "dumb") == "dumb":
    tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # type: ignore


def is_test_passed(test_state: str) -> bool:
    """Returns true if the test state indicates passed tests."""
    return test_state == TEST_STATE.Tests_passed.name


def merge_analyzer(  # pylint: disable=too-many-locals
    args: Tuple[str, pd.Series, Path]
) -> pd.Series:
    """
    Analyzes a merge: checks whether both parents pass tests and computes
    statistics about the differences between the parents.
    Args:
        args (Tuple[str, pd.Series, Path]): A tuple containing the repo slug,
            the merge data, and the cache path.
    Returns:
        pd.Series: The merge data augmented with the analysis results.
    """
    repo_slug, merge_data, cache_directory = args

    cache_key = merge_data["left"] + "_" + merge_data["right"]
    merge_cache_directory = cache_directory / "merge_analysis"

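    # Reuse the cached analysis for this merge if one exists.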
    cache_data = lookup_in_cache(cache_key, repo_slug, merge_cache_directory, True)
    if cache_data is not None and isinstance(cache_data, dict):
        for key, value in cache_data.items():
            merge_data[key] = value
        return merge_data

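    # Not cached: check out each parent commit in its own working copy.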
    cache_data = {}
    repo_left = Repository(repo_slug, cache_directory=cache_directory)
    repo_right = Repository(repo_slug, cache_directory=cache_directory)
    left_success, _ = repo_left.checkout(merge_data["left"])
    right_success, _ = repo_right.checkout(merge_data["right"])

    # Compute the diff size in lines between left and right
    assert repo_left.repo_path.exists()
    assert repo_right.repo_path.exists()
    process = subprocess.run(
        ["diff", "-r", str(repo_left.repo_path), str(repo_right.repo_path)],
        stdout=subprocess.PIPE,
        text=True,
    )

    diff_size = len(process.stdout.splitlines()) if process.stdout else 0
    cache_data["diff_size"] = diff_size

    # List all files that differ between left and right
    process = subprocess.run(
        ["diff", "-r", "--brief", str(repo_left.repo_path), str(repo_right.repo_path)],
        stdout=subprocess.PIPE,
        text=True,
    )

    # Parse `diff --brief` output, which consists of lines of the form
    # "Files A and B differ" or "Only in DIR: NAME".
    diff_files = []
    for line in process.stdout.splitlines() if process.stdout else []:
        if line.startswith("Files ") and line.endswith(" differ"):
            diff_files.append(line.split()[1])
        elif line.startswith("Only in "):
            diff_files.append(line.split()[-1])

    # Check if the diff contains a Java file
    contains_java_file = any(file.endswith(".java") for file in diff_files)
    cache_data["diff_contains_java_file"] = contains_java_file

    # Test left parent
    if not left_success:
        cache_data["left test result"] = TEST_STATE.Git_checkout_failed.name
        cache_data["left_tree_fingerprint"] = None
        cache_data["parents pass"] = False
    else:
        cache_data["left_tree_fingerprint"] = repo_left.compute_tree_fingerprint()
        cache_data["left test result"] = repo_left.test(
            TIMEOUT_TESTING_PARENT, N_TESTS
        ).name
        cache_data["parents pass"] = is_test_passed(cache_data["left test result"])

    # Test right parent
    if not right_success:
        cache_data["right test result"] = TEST_STATE.Git_checkout_failed.name
        cache_data["right_tree_fingerprint"] = None
        cache_data["parents pass"] = False
    else:
        cache_data["right_tree_fingerprint"] = repo_right.compute_tree_fingerprint()
        cache_data["right test result"] = repo_right.test(
            TIMEOUT_TESTING_PARENT, N_TESTS
        ).name
        cache_data["parents pass"] = cache_data["parents pass"] and is_test_passed(
            cache_data["right test result"]
        )

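    # A merge is worth testing only if both parents pass and a Java file differs.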
cache_data["test merge"] = (
cache_data["parents pass"] and cache_data["diff_contains_java_file"]
)

set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory)

for key, value in cache_data.items():
merge_data[key] = value

return merge_data


def build_merge_analyzer_arguments(args: argparse.Namespace, repo_slug: str):
    """
    Creates the arguments for the merge analyzer.
    Args:
        args (argparse.Namespace): The arguments to the script.
        repo_slug (str): The repository slug.
    Returns:
        list: A list of argument tuples for the merge analyzer.
    """
    merge_list_file = Path(
        os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
    )
    output_file = Path(
        os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
    )
    if not merge_list_file.exists():
        print(
            "merge_analyzer:",
            repo_slug,
            "does not have a list of merges. Missing file:",
            merge_list_file,
        )
        return []

    if output_file.exists():
        print(
            "merge_analyzer: Skipping",
            repo_slug,
            "because it is already computed.",
        )
        return []

    merges = pd.read_csv(
        merge_list_file,
        names=["idx", "branch_name", "merge", "left", "right", "notes"],
        dtype={
            "idx": int,
            "branch_name": str,
            "merge": str,
            "left": str,
            "right": str,
            "notes": str,
        },
        header=0,
        index_col="idx",
    )
    merges["notes"].replace(np.nan, "", inplace=True)

    arguments = [
        (repo_slug, merge_data, Path(args.cache_dir))
        for _, merge_data in merges.iterrows()
    ]
    return arguments


if __name__ == "__main__":
    print("merge_analyzer: Start")
    parser = argparse.ArgumentParser()
    parser.add_argument("--repos_head_passes_csv", type=Path)
    parser.add_argument("--merges_path", type=Path)
    parser.add_argument("--output_dir", type=Path)
    parser.add_argument("--cache_dir", type=Path, default="cache/merges/")
    args = parser.parse_args()
    Path(args.cache_dir).mkdir(parents=True, exist_ok=True)
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

    print("merge_analyzer: Constructing Inputs")
    merger_arguments = []
    for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
        repo_slug = repository_data["repository"]
        merger_arguments += build_merge_analyzer_arguments(args, repo_slug)

    # Shuffle input to reduce cache contention
    random.seed(42)
    random.shuffle(merger_arguments)

print("merge_analyzer: Finished Constructing Inputs")
# New merges are merges whose analysis does not appear in the output folder.
print("merge_analyzer: Number of new merges:", len(merger_arguments))

print("merge_analyzer: Started Merging")
with multiprocessing.Pool(processes=num_processes()) as pool:
merger_results = list(
tqdm(
pool.imap(merge_analyzer, merger_arguments), total=len(merger_arguments)
)
)
print("merge_analyzer: Finished Merging")

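    # Group the analysis results by repository so each repository gets its own CSV.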
    repo_result = {repo_slug: [] for repo_slug in repos["repository"]}
    print("merge_analyzer: Constructing Output")
    n_new_analyzed = 0
    n_new_to_test = 0
    for i in tqdm(range(len(merger_arguments))):
        repo_slug = merger_arguments[i][0]
        results_data = merger_results[i]

        repo_result[repo_slug].append(merger_results[i])
        n_new_analyzed += 1
        if results_data["test merge"]:
            n_new_to_test += 1

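    # Aggregate totals over previously computed and newly analyzed merges.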
    n_total_analyzed = 0
    n_total_to_test = 0
    for repo_slug in repo_result:
        output_file = Path(
            os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
        )
        if output_file.exists():
            try:
                df = pd.read_csv(output_file, header=0)
                n_total_analyzed += len(df)
                n_total_to_test += len(df[df["test merge"]])
            except pd.errors.EmptyDataError:
                print(
                    "merge_analyzer: Skipping",
                    repo_slug,
                    "because it does not contain any merges.",
                )
            continue
        df = pd.DataFrame(repo_result[repo_slug])
        df.sort_index(inplace=True)
        df.to_csv(output_file, index_label="idx")
        n_total_analyzed += len(df)
        n_total_to_test += len(df[df["test merge"]])

    print(
        "merge_analyzer: Number of merges that have been newly analyzed:",
        n_new_analyzed,
    )
    print(
        "merge_analyzer: Number of newly analyzed merges that are to be tested:",
        n_new_to_test,
    )
    print(
        "merge_analyzer: Total number of merges that have been analyzed:",
        n_total_analyzed,
    )
    print(
        "merge_analyzer: Total number of analyzed merges that are to be tested:",
        n_total_to_test,
    )
    print("merge_analyzer: Finished Constructing Output")
    print("merge_analyzer: Done")
