Skip to content

Commit

Permalink
Further changes
Browse files Browse the repository at this point in the history
  • Loading branch information
benedikt-schesch committed Sep 25, 2023
1 parent 606b16c commit 40e90af
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 140 deletions.
5 changes: 0 additions & 5 deletions src/python/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def set_in_cache(
cache_value: Union[str, dict, None],
repo_slug: str,
cache_directory: Path,
overwrite: bool = True,
acquire_lock: bool = True,
) -> None:
"""Puts an entry in the cache, then writes the cache to disk.
Expand All @@ -45,16 +44,12 @@ def set_in_cache(
cache_value (dict): The value to write.
repo_slug (str): The slug of the repository, which is "owner/reponame".
cache_directory (Path): The path to the cache directory.
overwrite (bool, optional) = True: Whether to overwrite an existing cache entry.
If False, attempting to overwrite an existing cache entry throws an exception.
"""
if acquire_lock:
lock = get_cache_lock(repo_slug, cache_directory)
lock.acquire()
cache_path = get_cache_path(repo_slug, cache_directory)
cache = load_cache(repo_slug, cache_directory)
if cache_key in cache and not overwrite:
raise ValueError("Cache key already exists")
cache[cache_key] = cache_value
output = json.dumps(cache, indent=4)
with open(cache_path, "w") as f:
Expand Down
2 changes: 0 additions & 2 deletions src/python/merge_differ.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,6 @@ def merge_differ(args: Tuple[pd.Series, Path]) -> None:
command.append(str(repo2.repo_path))
with open(diff_file, "w") as f:
subprocess.run(command, stdout=f, stderr=f)
del repo2
del repo1


def diff_file_name(sha1: str, sha2: str) -> Path:
Expand Down
89 changes: 51 additions & 38 deletions src/python/merge_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
print("merge_tester: Started ", repo_slug, merge_data["left"], merge_data["right"])

merge_data["parents pass"] = False
# TODO: It's not clear to me whether "left" and "right" are keys for branches, or shas, or something else. How about improving the key name? (We can change the find-merge-conflicts program if it is the source of the problem.)
for branch in ["left", "right"]:
commit_sha = merge_data[branch]
repo = Repository(repo_slug, cache_directory=cache_directory)
repo.checkout(merge_data[branch])
repo.checkout(commit_sha)
tree_fingerprint = repo.compute_tree_fingerprint()
if tree_fingerprint != merge_data[f"{branch}_tree_fingerprint"]:
raise Exception(
Expand Down Expand Up @@ -112,6 +112,54 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
return merge_data


def build_arguments(
    args: argparse.Namespace,
    repo_slug: str,
) -> list:
    """Builds the arguments for the merge_tester function.

    Args:
        args (argparse.Namespace): The arguments of the script.
        repo_slug (str): The slug of the repository, i.e. "owner/reponame".

    Returns:
        list: A list of (repo_slug, merge_data, cache_dir) tuples, one per
            merge to test; empty when the repository has no merge list, is
            already computed, or its merge list contains no merges.
    """
    # pathlib's "/" operator replaces the redundant Path(os.path.join(...)).
    merge_list_file = Path(args.merges_path) / (slug_repo_name(repo_slug) + ".csv")
    output_file = Path(args.output_dir) / (slug_repo_name(repo_slug) + ".csv")
    if not merge_list_file.exists():
        print(
            "merge_tester:",
            repo_slug,
            "does not have a list of merges. Missing file: ",
            merge_list_file,
        )
        return []

    if output_file.exists():
        print(
            "merge_tester: Skipping",
            repo_slug,
            "because it is already computed.",
        )
        return []
    try:
        merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
    except pd.errors.EmptyDataError:
        # An empty CSV is an expected condition, not an error: skip the repo.
        print(
            "merge_tester: Skipping",
            repo_slug,
            "because it does not contain any merges.",
        )
        return []
    return [
        (repo_slug, merge_data, Path(args.cache_dir))
        for _, merge_data in merges.iterrows()
    ]


def main(): # pylint: disable=too-many-locals,too-many-statements
"""Main function"""
print("merge_tester: Start")
Expand All @@ -126,46 +174,11 @@ def main(): # pylint: disable=too-many-locals,too-many-statements

repos = pd.read_csv(args.repos_head_passes_csv, index_col="idx")

# TODO: I suggest abstracting this loop into a separate function (through printing "number of merges to test").
print("merge_tester: Started collecting merges to test")
merge_tester_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
repo_slug = repository_data["repository"]
merge_list_file = Path(
os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
)
output_file = Path(
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
print(
"merge_tester:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)
continue

if output_file.exists():
print(
"merge_tester: Skipping",
repo_slug,
"because it is already computed.",
)
continue
try:
merges = pd.read_csv(merge_list_file, header=0, index_col="idx")
except pd.errors.EmptyDataError:
print(
"merge_tester: Skipping",
repo_slug,
"because it does not contain any merges.",
)
continue
merge_tester_arguments += [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
]
merge_tester_arguments += build_arguments(args, repo_slug)

# Shuffle input to reduce cache contention
random.seed(42)
Expand Down
163 changes: 80 additions & 83 deletions src/python/merge_tools_comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,18 @@ def merger( # pylint: disable=too-many-locals
merge_data["right"],
merge_tool.name,
)
cache_data[merge_tool.name] = {"results": [], "log_files": [], "run_time": []}
# TODO: I suggest abstracting out the body of this loop as a function.
cache_data[merge_tool.name] = {"results": [], "run_time": []}
log_file: Path = (
cache_directory
/ "merge_logs"
/ slug_repo_name(repo_slug)
/ merge_data["left"]
/ merge_data["right"]
/ (merge_tool.name + f".log")
)
log_file.parent.mkdir(parents=True, exist_ok=True)
log_file.unlink(missing_ok=True)

for i in range(N_REPETITIONS):
repo = Repository(repo_slug, cache_directory=cache_directory)
(
Expand All @@ -86,48 +96,25 @@ def merger( # pylint: disable=too-many-locals
right_commit=merge_data["right"],
timeout=TIMEOUT_MERGING,
)
log_file: Path = (
cache_directory
/ "merge_logs"
/ slug_repo_name(repo_slug)
/ merge_data["left"]
/ merge_data["right"]
/ (merge_tool.name + f".log")
)
log_file.parent.mkdir(parents=True, exist_ok=True)
if log_file.exists():
log_file.unlink()
with open(log_file, "w") as f:
f.write(explanation)
if i == 0: # Only write the log file once.
with open(log_file, "w") as f:
f.write(explanation)

cache_data[merge_tool.name]["results"].append(merge_status.name)
cache_data[merge_tool.name]["log_file"] = str(log_file)
cache_data[merge_tool.name]["run_time"].append(run_time)
if "merge_fingerprint" not in cache_data[merge_tool.name]:
cache_data[merge_tool.name]["merge_fingerprint"] = merge_fingerprint
else:
if (
cache_data[merge_tool.name]["merge_fingerprint"]
!= merge_fingerprint
):
raise Exception(
"merge_tools_comparator: Merge fingerprint mismatch",
i,
repo_slug,
merge_data["left"],
merge_data["right"],
merge_tool.name,
merge_status,
cache_data[merge_tool.name]["merge_fingerprint"],
merge_fingerprint,
cache_data,
)

if "left_tree_fingerprint" not in cache_data:
cache_data["left_tree_fingerprint"] = left_fingerprint
cache_data["right_tree_fingerprint"] = right_fingerprint
else:
assert (
cache_data[merge_tool.name]["merge_fingerprint"]
== merge_fingerprint
)
assert cache_data["left_tree_fingerprint"] == left_fingerprint
assert cache_data["right_tree_fingerprint"] == right_fingerprint

set_in_cache(cache_key, cache_data, repo_slug, merge_cache_directory)
return cache_data

Expand Down Expand Up @@ -156,6 +143,64 @@ def check_if_two_merges_differ(cache_data: dict) -> bool:
return False


def build_merge_arguments(args: argparse.Namespace, repo_slug: str) -> list:
    """
    Creates the arguments for the merger function.

    Args:
        args (argparse.Namespace): The arguments to the script.
        repo_slug (str): The repository slug, i.e. "owner/reponame".

    Returns:
        list: A list of (repo_slug, merge_data, cache_dir) tuples, one per
            merge to compare; empty when the repository is already computed.

    Raises:
        Exception: If the repository does not have a list-of-merges file.
    """
    # pathlib's "/" operator replaces the redundant Path(os.path.join(...)).
    merge_list_file = Path(args.merges_path) / (slug_repo_name(repo_slug) + ".csv")
    output_file = Path(args.output_dir) / (slug_repo_name(repo_slug) + ".csv")
    if not merge_list_file.exists():
        raise Exception(
            "merge_tools_comparator:",
            repo_slug,
            "does not have a list of merges. Missing file: ",
            merge_list_file,
        )

    if output_file.exists():
        print(
            "merge_tools_comparator: Skipping",
            repo_slug,
            "because it is already computed.",
        )
        return []

    merges = pd.read_csv(
        merge_list_file,
        names=["idx", "branch_name", "merge", "left", "right", "notes"],
        dtype={
            "idx": int,
            "branch_name": str,
            "merge": str,
            "left": str,
            "right": str,
            "notes": str,
        },
        header=0,
        index_col="idx",
    )
    # Assign the result instead of `.replace(np.nan, "", inplace=True)` on the
    # column: chained in-place mutation is deprecated in pandas and may act on
    # a copy without modifying `merges`.
    merges["notes"] = merges["notes"].fillna("")

    # "Trivial" merges are flagged in the notes column by the merge finder.
    if args.only_trivial_merges:
        merges = merges[merges["notes"].str.contains("a parent is the base")]
    elif not args.include_trivial_merges:
        merges = merges[~merges["notes"].str.contains("a parent is the base")]
    return [
        (repo_slug, merge_data, Path(args.cache_dir))
        for _, merge_data in merges.iterrows()
    ]


if __name__ == "__main__":
print("merge_tools_comparator: Start")
parser = argparse.ArgumentParser()
Expand All @@ -174,56 +219,8 @@ def check_if_two_merges_differ(cache_data: dict) -> bool:
print("merge_tools_comparator: Constructing Inputs")
merger_arguments = []
for _, repository_data in tqdm(repos.iterrows(), total=len(repos)):
# TODO: I suggest abstracting out the body of this loop as a function.
merges_repo = []
repo_slug = repository_data["repository"]
merge_list_file = Path(
os.path.join(args.merges_path, slug_repo_name(repo_slug) + ".csv")
)
output_file = Path(
os.path.join(args.output_dir, slug_repo_name(repo_slug) + ".csv")
)
if not merge_list_file.exists():
raise Exception(
"merge_tools_comparator:",
repo_slug,
"does not have a list of merges. Missing file: ",
merge_list_file,
)

if output_file.exists():
print(
"merge_tools_comparator: Skipping",
repo_slug,
"because it is already computed.",
)
continue

merges = pd.read_csv(
merge_list_file,
names=["idx", "branch_name", "merge", "left", "right", "notes"],
dtype={
"idx": int,
"branch_name": str,
"merge": str,
"left": str,
"right": str,
"notes": str,
},
header=0,
index_col="idx",
)
merges["notes"].replace(np.nan, "", inplace=True)

if args.only_trivial_merges:
merges = merges[merges["notes"].str.contains("a parent is the base")]
elif not args.include_trivial_merges:
merges = merges[~merges["notes"].str.contains("a parent is the base")]

merger_arguments += [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
]
merger_arguments += build_merge_arguments(args, repo_slug)

# Shuffle input to reduce cache contention
random.seed(42)
Expand Down
Loading

0 comments on commit 40e90af

Please sign in to comment.