Merged v2-update
benedikt-schesch committed Sep 24, 2023
2 parents 0ecfc37 + a76a1a6 commit 18ed5f6
Showing 19 changed files with 117 additions and 103 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -145,7 +145,7 @@ To run style checking run `make style`.

* merge_tester.py -> Main file which performs merges and evaluates all the results across all projects.

-* test_repo_head.py -> Checks out all repos and removes all repos that fail their tests on main branch.
+* test_repo_heads.py -> Checks out all repos and removes all repos that fail their tests on main branch.

* latex_output.py -> Output latex code for the resulting plots and table.

run.sh (6 changes: 3 additions & 3 deletions)
@@ -17,7 +17,7 @@ set -o nounset

REPOS_CSV="$1"
OUT_DIR="$2"
-N_MERGES=$3
+N_REPETITIONS=$3
CACHE_DIR="${4}"

comparator_flags=""
@@ -86,7 +86,7 @@ python3 src/python/split_repos.py \
--num_machines "$num_machines" \
--output_file "$OUT_DIR/local_repos.csv"

-python3 src/python/test_repo_head.py \
+python3 src/python/test_repo_heads.py \
--repos_csv_with_hashes "$OUT_DIR/local_repos.csv" \
--output_path "$OUT_DIR/repos_head_passes.csv" \
--cache_dir "$CACHE_DIR"
@@ -120,5 +120,5 @@ python3 src/python/latex_output.py \
--tested_merges_path "$OUT_DIR/merges_tested/" \
--full_repos_csv "$REPOS_CSV" \
--repos_head_passes_csv "$OUT_DIR/repos_head_passes.csv" \
- --n_merges "$N_MERGES" \
+ --n_merges "$N_REPETITIONS" \
--output_dir "$OUT_DIR"
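For orientation, the script takes four positional arguments, visible above as `$1` through `$4`. A hypothetical invocation after this rename (the file and directory names are illustrative, not from the repository):

```bash
# REPOS_CSV  OUT_DIR   N_REPETITIONS  CACHE_DIR
./run.sh repos.csv results/ 3 cache/
```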
src/python/cache_utils.py (12 changes: 6 additions & 6 deletions)
@@ -5,8 +5,8 @@
There will be 4 caches in total which are stored on disk after running the run.sh script:
1) cache/sha_cache_entry: A cache that maps the commit hash to a sha256 hash of the repository.
2) cache/test_cache: A cache that maps a sha256 to test results.
-3) cache/merge_results:A cache that maps a merge to the result
-of the merge (sha256, runtime and MERGE_STATE)
+3) cache/merge_results: A cache that maps a merge to the result
+of the merge (sha256, run time, and MERGE_STATE).
4) cache/merge_diffs: A cache that stores the diff between merge tools.
"""

@@ -24,9 +24,9 @@


def slug_repo_name(repo_slug: str) -> str:
"""Given a GitHub repository slug (owner/reponame), returns the reponame.
"""Given a GitHub repository slug ("owner/reponame"), returns the reponame.
Args:
-repo_slug (str): The slug of the repository, which is 'owner/reponame'.
+repo_slug (str): The slug of the repository, which is "owner/reponame".
Returns:
str: The reponame.
"""
@@ -60,8 +60,8 @@ def get_cache_path(repo_slug: str, cache_prefix: Path) -> Path:
Returns:
Path: The path to the cache file.
"""
-cache_entry_name = slug_repo_name(repo_slug) + ".json"
-cache_path = cache_prefix / cache_entry_name
+cache_file_name = slug_repo_name(repo_slug) + ".json"
+cache_path = cache_prefix / cache_file_name
cache_path.parent.mkdir(parents=True, exist_ok=True)
return cache_path
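Based on the lines above, a usage sketch (the slug is illustrative):

```python
from pathlib import Path

path = get_cache_path("owner/reponame", Path("cache/test_cache"))
# path == Path("cache/test_cache/reponame.json"); its parent directory now exists.
```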

src/python/latex_output.py (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@
This script generates all the tables and plots for the paper. It requires the
following input files:
- full_repos_csv: csv file containing the full list of repositories
-- repos_head_passes_csv: csv file containing the list of valid repositories
+- repos_head_passes_csv: csv file containing the list of repositories whose head passes tests
- tested_merges_path: path to the folder containing the merge results
- merges_path: path to the folder containing all found merges.
TODO: Throughout, be consistent about "directory" vs "folder".
src/python/merge_differ.py (19 changes: 10 additions & 9 deletions)
@@ -21,20 +21,21 @@
import pandas as pd
from repo import Repository, MERGE_TOOL, TEST_STATE, MERGE_STATE
from tqdm import tqdm
-from write_head_hashes import compute_num_process_used
+from write_head_hashes import num_processes
from cache_utils import slug_repo_name

if os.getenv("TERM", "dumb") == "dumb":
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) # type: ignore

-TIMEOUT_TESTING_PARENT = 60 * 30 # 30 minutes
-TIMEOUT_TESTING_MERGE = 60 * 45 # 45 minutes
+TIMEOUT_TESTING_PARENT = 60 * 30 # 30 minutes, in seconds
+TIMEOUT_TESTING_MERGE = 60 * 45 # 45 minutes, in seconds


def get_merge_fingerprint(
merge_data: pd.Series, merge_tool: MERGE_TOOL, cache_prefix: Path
) -> Union[Tuple[None, None], Tuple[Repository, str]]:
"""Returns the repo and the fingerprint of a merge, or None.
"""Returns the repo and the fingerprint of a merge,
or (None, None) if the merge is not successful.
Does some sanity-checking too.
Args:
merge_data: The merge data.
@@ -112,7 +113,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None:
if repo2 is None or merge_fingerprint2 is None:
continue

-# Use lexicographic order to prevent duplicates
diff_file = diff_file_prefix / diff_file_name(
merge_fingerprint1, merge_fingerprint2
)
@@ -135,6 +135,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
Returns:
Path: The name of the diff file.
"""
+# Use lexicographic order to prevent duplicates
if sha1 < sha2:
# TODO: Why does this use ".txt" rather than ".diff" as the file extension?
return Path(sha1 + "_" + sha2 + ".txt")
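Only the `sha1 < sha2` branch is visible; assuming the collapsed else-branch swaps the pair, both argument orders yield the same file name:

```python
assert diff_file_name("aaa", "bbb") == Path("aaa_bbb.txt")
assert diff_file_name("bbb", "aaa") == Path("aaa_bbb.txt")  # assumed swap in the hidden branch
```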
@@ -155,7 +156,7 @@ def diff_file_name(sha1: str, sha2: str) -> Path:

repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

print("merge_differ: Started listing diffs to compute")
print("merge_differ: Started collecting diffs to compute")
merge_differ_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
merges_repo = []
@@ -213,11 +214,11 @@ def diff_file_name(sha1: str, sha2: str) -> Path:
random.seed(42)
random.shuffle(merge_differ_arguments)

print("merge_differ: Finished listing diffs to compute")
print("merge_differ: Number of tested merges:", len(merge_differ_arguments))
print("merge_differ: Finished collecting diffs to compute")
print("merge_differ: Number of merges to test:", len(merge_differ_arguments))

print("merge_differ: Started Diffing")
-with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+with multiprocessing.Pool(processes=num_processes()) as pool:
tqdm(
pool.imap(merge_differ, merge_differ_arguments),
total=len(merge_differ_arguments),
src/python/merge_tester.py (23 changes: 11 additions & 12 deletions)
@@ -21,8 +21,8 @@
import psutil
import pandas as pd
from repo import Repository, MERGE_TOOL, TEST_STATE
-from write_head_hashes import compute_num_process_used
-from merge_tools_comparator import is_merge_sucess
+from write_head_hashes import num_processes
+from merge_tools_comparator import is_merge_success
from cache_utils import slug_repo_name
from tqdm import tqdm

@@ -80,7 +80,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
merge_data["parents pass"] = True

for merge_tool in MERGE_TOOL:
-if is_merge_sucess(merge_data[merge_tool.name]):
+if is_merge_success(merge_data[merge_tool.name]):
# TODO: I suggest abstracting the body of this loop into a separate function, since it stands on its own logically.
repo = Repository(repo_slug, cache_prefix=cache_prefix)
(
@@ -109,6 +109,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
merge_fingerprint,
merge_data[merge_tool.name + "_merge_fingerprint"],
)
+# Update the status from merge success to test result.
merge_data[merge_tool.name] = result.name
del repo
assert merge_tool.name in merge_data
@@ -130,7 +131,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

# TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test").
print("merge_tester: Started listing merges to test")
print("merge_tester: Started collecting merges to test")
merge_tester_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
repo_slug = repository_data["repository"]
@@ -174,11 +175,11 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
random.seed(42)
random.shuffle(merge_tester_arguments)

print("merge_tester: Finished listing merges to test")
print("merge_tester: Finished collecting merges to test")
print("merge_tester: Number of merges to test:", len(merge_tester_arguments))

print("merge_tester: Started Testing")
-with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+with multiprocessing.Pool(processes=num_processes()) as pool:
merge_tester_results = list(
tqdm(
pool.imap(merge_tester, merge_tester_arguments),
@@ -214,12 +215,10 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
"because it does not contain any merges.",
)
continue
-n_total_merges += len(df)
-n_total_merges_parents_pass += len(df[df["parents pass"]])
-continue
-df = pd.DataFrame(repo_result[repo_slug])
-df.sort_index(inplace=True)
-df.to_csv(output_file, index_label="idx")
+else:
+df = pd.DataFrame(repo_result[repo_slug])
+df.sort_index(inplace=True)
+df.to_csv(output_file, index_label="idx")
n_total_merges += len(df)
n_total_merges_parents_pass += len(df[df["parents pass"]])
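Read with the indentation this view strips, the post-change control flow is plausibly the following sketch (the branch that reuses an existing output file is hidden in the collapsed context above, so its body is an assumption):

```python
if output_file.exists():
    ...  # hidden: reuse the existing output file, or `continue` if it has no merges
else:
    df = pd.DataFrame(repo_result[repo_slug])
    df.sort_index(inplace=True)
    df.to_csv(output_file, index_label="idx")
# The counters now run once for both branches instead of being duplicated.
n_total_merges += len(df)
n_total_merges_parents_pass += len(df[df["parents pass"]])
```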

src/python/merge_tools_comparator.py (23 changes: 13 additions & 10 deletions)
@@ -28,14 +28,14 @@
from repo import Repository, MERGE_TOOL, MERGE_STATE
from tqdm import tqdm
from cache_utils import set_in_cache, lookup_in_cache, slug_repo_name
-from write_head_hashes import compute_num_process_used
-from variables import TIMEOUT_MERGING, N_MERGES
+from write_head_hashes import num_processes
+from variables import TIMEOUT_MERGING, N_REPETITIONS

if os.getenv("TERM", "dumb") == "dumb":
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) # type: ignore


-def is_merge_sucess(merge_state: str) -> bool:
+def is_merge_success(merge_state: str) -> bool:
"""Returns true if the merge state indicates success."""
return merge_state == MERGE_STATE.Merge_success.name

@@ -74,8 +74,8 @@ def merger( # pylint: disable=too-many-locals
merge_tool.name,
)
cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []}
-for i in range(N_MERGES):
+# TODO: I suggest abstracting out the body of this loop as a function.
+for i in range(N_REPETITIONS):
repo = Repository(repo_slug, cache_prefix=cache_prefix)
(
merge_status,
@@ -211,10 +211,11 @@ def merger( # pylint: disable=too-many-locals
random.shuffle(merger_arguments)

print("merge_tools_comparator: Finished Constructing Inputs")
+# New merges are merges whose analysis does not appear in the output folder.
print("merge_tools_comparator: Number of new merges:", len(merger_arguments))

print("merge_tools_comparator: Started Merging")
-with multiprocessing.Pool(processes=compute_num_process_used()) as pool:
+with multiprocessing.Pool(processes=num_processes()) as pool:
merger_results = list(
tqdm(pool.imap(merger, merger_arguments), total=len(merger_arguments))
)
@@ -223,7 +224,6 @@ def merger( # pylint: disable=too-many-locals
repo_result = {repo_slug: [] for repo_slug in repos["repository"]}
print("merge_tools_comparator: Constructing Output")
n_new_compared = 0
-n_new_tested = 0
for i in tqdm(range(len(merger_arguments))):
repo_slug = merger_arguments[i][0]
merge_data = merger_arguments[i][1]
@@ -232,13 +232,13 @@
# TODO: I suggest abstracting this loop into a function. That will also make it easier to return early, which can be done just with "return". (Currently it makes more iterations even after discovering that two merge tools do differ.)
for merge_tool1 in MERGE_TOOL:
for merge_tool2 in MERGE_TOOL:
-if is_merge_sucess(
+if is_merge_success(
cache_data[merge_tool1.name]["results"][0]
-) and is_merge_sucess(cache_data[merge_tool2.name]["results"][0]):
+) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
two_merge_tools_differ = True
-if is_merge_sucess(
+if is_merge_success(
cache_data[merge_tool1.name]["results"][0]
-) and is_merge_sucess(cache_data[merge_tool2.name]["results"][0]):
+) and is_merge_success(cache_data[merge_tool2.name]["results"][0]):
if (
cache_data[merge_tool1.name]["merge_fingerprint"]
!= cache_data[merge_tool2.name]["merge_fingerprint"]
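The hunk above is truncated mid-condition. As a sketch of the refactoring the TODO on this loop suggests, the pair comparison could be hoisted into a helper that returns early; the helper name and the exact differ-criterion are assumptions based on the visible half of the hunk:

```python
def merge_tools_differ(cache_data: dict) -> bool:
    """Returns True as soon as two successfully-merging tools
    produce different merge fingerprints."""
    for merge_tool1 in MERGE_TOOL:
        for merge_tool2 in MERGE_TOOL:
            if not (
                is_merge_success(cache_data[merge_tool1.name]["results"][0])
                and is_merge_success(cache_data[merge_tool2.name]["results"][0])
            ):
                continue
            if (
                cache_data[merge_tool1.name]["merge_fingerprint"]
                != cache_data[merge_tool2.name]["merge_fingerprint"]
            ):
                return True  # Early exit; the current code keeps iterating.
    return False
```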
@@ -286,10 +286,13 @@ def merger( # pylint: disable=too-many-locals
df.to_csv(output_file, index_label="idx")
n_total_compared += len(df)

+# This is the number of merges whose "two merge tools differ" bit has been set (to true or
+# false).
print(
"merge_tools_comparator: Number of merge tool outputs that have been newly compared:",
n_new_compared,
)
+# This is the number of merges whose "two merge tools differ" bit has been set to true.
print(
"merge_tools_comparator: Total number of merge tool outputs that have been compared:",
n_total_compared,
src/python/repo.py (20 changes: 11 additions & 9 deletions)
@@ -63,16 +63,14 @@ def compute_explanation(
command: List[str],
source: Union[subprocess.TimeoutExpired, subprocess.CompletedProcess],
) -> str:
"""Poduces the explanation string of a timedout process or
"""Produces the explanation string of a timedout process or
a completed process.
"""
explanation = "Run Command: " + " ".join(command) + "\nTimed out"
-explanation += (
-"\nstdout:\n" + source.stdout.decode("utf-8") if source.stdout else ""
-)
-explanation += (
-"\nstderr:\n" + source.stderr.decode("utf-8") if source.stderr else ""
-)
+if source.stdout:
+explanation += "\nstdout:\n" + source.stdout.decode("utf-8")
+if source.stderr:
+explanation += "\nstderr:\n" + source.stderr.decode("utf-8")
return explanation


@@ -186,7 +184,7 @@ def _merge_and_test( # pylint: disable=too-many-arguments
left_commit (str): The left commit to merge.
right_commit (str): The right commit to merge.
timeout (int): The timeout limit, in seconds.
-n_tests (int): The number of times to perform the test.
+n_tests (int): The number of times to run the test.
Returns:
TEST_STATE: The result of the test.
str: The tree fingerprint of the merge result.
@@ -402,6 +400,8 @@ def merge( # pylint: disable=too-many-locals

def compute_tree_fingerprint(self) -> str:
"""Computes the tree fingerprint of the repository.
+This function must never be run after running tests,
+since running tests might write output files.
Returns:
str: The tree fingerprint.
"""
@@ -467,7 +467,9 @@ def _checkout_and_test(
timeout: int,
n_tests: int,
) -> TEST_STATE:
"""Checks out the given commit and tests the repository.
"""Helper function for `checkout_and_test`,
which checks out the given commit and tests the repository.
This function does not check the cache.
Args:
commit (str): The commit to checkout.
timeout (int): The timeout limit, in seconds.
src/python/split_repos.py (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@
parser.add_argument("--output_file", type=Path)
args = parser.parse_args()
df: pd.DataFrame = pd.read_csv(args.repos_csv, index_col="idx")
-# Shuffle the dataframe so the ordering of the list doesn't bias the output
+# Shuffle the dataframe so the ordering of the list doesn't bias the output.
df = df.sample(frac=1, random_state=42)
df = np.array_split(df, args.num_machines)[args.machine_id]
df.sort_index(inplace=True)
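For illustration, `np.array_split` (unlike `np.split`) accepts a shard count that does not divide the frame evenly, so shard sizes differ by at most one row; a small self-contained sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"repository": [f"owner/repo{i}" for i in range(10)]})
shards = np.array_split(df, 3)  # same idiom as the script above
print([len(s) for s in shards])  # [4, 3, 3]
```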