Skip to content

Commit

Permalink
Make merge_idx a field of the Repository class, for diagnostics (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
mernst authored Jul 10, 2024
1 parent 65e2e2f commit 18e99be
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 28 deletions.
50 changes: 34 additions & 16 deletions src/python/merge_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def get_diff_files(


def diff_merge_analyzer(
merge_idx: str,
repo_slug: str,
left_sha: str,
right_sha: str,
Expand All @@ -75,6 +76,7 @@ def diff_merge_analyzer(
Computes the diff between two branches using git diff.
Args:
        merge_idx (str): The merge index, such as 42-123.
repo_slug (str): The repository slug.
left_sha (str): The left sha.
right_sha (str): The right sha.
Expand All @@ -91,6 +93,7 @@ def diff_merge_analyzer(
return cache_data

repo = Repository(
merge_idx,
repo_slug,
cache_directory=cache_dir,
workdir_id=repo_slug + "/diff-" + left_sha + "-" + right_sha,
Expand Down Expand Up @@ -149,34 +152,41 @@ def diff_merge_analyzer(


def merge_analyzer(
args: Tuple[str, pd.Series, Path],
args: Tuple[str, str, pd.Series, Path],
) -> pd.Series:
"""
Merges two branches and returns the result.
Args:
args (Tuple[str,pd.Series,Path]): A tuple containing the repo slug,
args (Tuple[str,str,pd.Series,Path]): A tuple containing the merge index, the repo slug,
the merge data (which is side-effected), and the cache path.
Returns:
dict: A dictionary containing the merge result.
"""
repo_slug, merge_data, cache_directory = args
merge_idx, repo_slug, merge_data, cache_directory = args

left_sha = merge_data["left"]
right_sha = merge_data["right"]

logger.info(f"merge_analyzer: Analyzing {repo_slug} {left_sha} {right_sha}")
logger.info(
f"merge_analyzer: Analyzing {merge_idx} {repo_slug} {left_sha} {right_sha}"
)

# Compute diff size in lines between left and right
cache_data = diff_merge_analyzer(repo_slug, left_sha, right_sha, cache_directory)
cache_data = diff_merge_analyzer(
merge_idx, repo_slug, left_sha, right_sha, cache_directory
)

if cache_data["diff contains java file"] in (False, None):
merge_data["test merge"] = False
merge_data["diff contains java file"] = False
logger.info(f"merge_analyzer: Analyzed {repo_slug} {left_sha} {right_sha}")
logger.info(
f"merge_analyzer: Analyzed {merge_idx} {repo_slug} {left_sha} {right_sha}"
)
return merge_data

# Checkout left parent
repo_left = Repository(
merge_idx,
repo_slug,
cache_directory=cache_directory,
workdir_id=repo_slug + "/left-" + left_sha + "-" + right_sha,
Expand All @@ -185,6 +195,7 @@ def merge_analyzer(

# Checkout right parent
repo_right = Repository(
merge_idx,
repo_slug,
cache_directory=cache_directory,
workdir_id=repo_slug + "/right-" + left_sha + "-" + right_sha,
Expand Down Expand Up @@ -214,15 +225,20 @@ def merge_analyzer(
merge_data["parents pass"] and merge_data["diff contains java file"] is True
)

logger.info(f"merge_analyzer: Analyzed {repo_slug} {left_sha} {right_sha}")
logger.info(
f"merge_analyzer: Analyzed {merge_idx} {repo_slug} {left_sha} {right_sha}"
)

return merge_data


def build_merge_analyzer_arguments(args: argparse.Namespace, repo_slug: str):
def build_merge_analyzer_arguments(
repo_idx: str, args: argparse.Namespace, repo_slug: str
):
"""
Creates the arguments for the merger function.
Args:
        repo_idx (str): The repository index.
args (argparse.Namespace): The arguments to the script.
repo_slug (str): The repository slug.
Returns:
Expand Down Expand Up @@ -255,8 +271,8 @@ def build_merge_analyzer_arguments(args: argparse.Namespace, repo_slug: str):
merges["notes"] = merges["notes"].fillna("")

arguments = [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
(f"{repo_idx}-{idx}", repo_slug, merge_data, Path(args.cache_dir))
for idx, merge_data in merges.iterrows()
]
return arguments

Expand Down Expand Up @@ -297,9 +313,11 @@ def plot_vertical_histogram(data, title, ax):
TimeRemainingColumn(),
) as progress:
task = progress.add_task("[green]Constructing Input...", total=len(repos))
for _, repository_data in repos.iterrows():
for repo_idx, repository_data in repos.iterrows():
repo_slug = repository_data["repository"]
merger_arguments += build_merge_analyzer_arguments(args, repo_slug)
merger_arguments += build_merge_analyzer_arguments(
repo_idx, args, repo_slug
)
progress.update(task, advance=1)

# Shuffle input to reduce cache contention
Expand Down Expand Up @@ -339,10 +357,10 @@ def plot_vertical_histogram(data, title, ax):
TimeRemainingColumn(),
) as progress:
task = progress.add_task("[green]Processing...", total=len(merger_arguments))
for idx, merge_data in enumerate(merger_arguments):
repo_slug = merge_data[0]
results_data = merger_results[idx]
repo_result[repo_slug].append(merger_results[idx])
for new_merges_idx, merge_data in enumerate(merger_arguments):
repo_slug = merge_data[1]
results_data = merger_results[new_merges_idx]
repo_result[repo_slug].append(merger_results[new_merges_idx])
n_new_analyzed += 1
if "test merge" in results_data and results_data["test merge"]:
n_new_candidates_to_test += 1
Expand Down
7 changes: 4 additions & 3 deletions src/python/merge_runtime_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def main():
merges = merges.sample(frac=1, random_state=42)
if len(merges) > args.n_sampled_timing:
merges = merges.iloc[: args.n_sampled_timing]
for idx, merge_data in merges.iterrows():
for merge_idx, merge_data in merges.iterrows():
for merge_tool in MERGE_TOOL:
left_hash, right_hash = (
merge_data["left"],
Expand All @@ -75,7 +75,7 @@ def main():
)

if cache_entry is not None:
merges.at[idx, f"{merge_tool.name}_run_time"] = np.median(
merges.at[merge_idx, f"{merge_tool.name}_run_time"] = np.median(
cache_entry["run_time"] # type: ignore
)
else:
Expand All @@ -86,6 +86,7 @@ def main():
run_times = []
for _ in range(args.n_timings):
repo = Repository(
merge_idx,
repo_slug,
workdir_id=repo_slug
+ f"/merge-tester-{merge_tool.name}-"
Expand Down Expand Up @@ -118,7 +119,7 @@ def main():
acquire_lock=False,
)

merges.at[idx, f"{merge_tool.name}_run_time"] = np.median(
merges.at[merge_idx, f"{merge_tool.name}_run_time"] = np.median(
cache_entry["run_time"] # type: ignore
)
out_file = args.output_dir / f"{repo_slug}.csv"
Expand Down
21 changes: 13 additions & 8 deletions src/python/merge_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,25 @@
)


def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:
def merge_tester(args: Tuple[str, str, pd.Series, Path]) -> pd.Series:
"""Tests a merge with each merge tool.
Args:
args (Tuple[str,pd.Series,Path]): A tuple containing the repository slug,
the repository info, and the cache path.
args (Tuple[str,str,pd.Series,Path]): A tuple containing the merge index,
the repository slug, the repository info, and the cache path.
Returns:
pd.Series: The result of the test.
"""
repo_slug, merge_data, cache_directory = args
merge_idx, repo_slug, merge_data, cache_directory = args
print(
f"merge_tester: Started {merge_idx} {repo_slug} {merge_data['left']} {merge_data['right']}"
)
logger.info(
f"merge_tester: Started {repo_slug} {merge_data['left']} {merge_data['right']}"
f"merge_tester: Started {merge_idx} {repo_slug} {merge_data['left']} {merge_data['right']}"
)
while psutil.cpu_percent() > 90 or psutil.virtual_memory().percent > 85:
logger.trace(
"merge_tester: Waiting for CPU or memory to be available."
+ merge_idx
+ repo_slug
+ merge_data["left"]
+ merge_data["right"]
Expand All @@ -57,6 +61,7 @@ def merge_tester(args: Tuple[str, pd.Series, Path]) -> pd.Series:

for merge_tool in MERGE_TOOL:
repo = Repository(
merge_idx,
repo_slug,
cache_directory=cache_directory,
workdir_id=repo_slug
Expand Down Expand Up @@ -138,8 +143,8 @@ def build_arguments(
return []
merges = merges[merges["sampled for testing"]]
return [
(repo_slug, merge_data, Path(args.cache_dir))
for _, merge_data in merges.iterrows()
(merge_idx, repo_slug, merge_data, Path(args.cache_dir))
for merge_idx, merge_data in merges.iterrows()
]


Expand Down Expand Up @@ -214,7 +219,7 @@ def main():
)
for idx, merge_results in enumerate(merge_tester_arguments):
progress.update(task, advance=1)
repo_slug = merge_results[0]
repo_slug = merge_results[1]
repo_result[repo_slug].append(merge_tester_results[idx])

n_total_merges = 0
Expand Down
5 changes: 5 additions & 0 deletions src/python/replay_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def merge_replay(
)
if not (WORKDIR_DIRECTORY / workdir).exists():
repo = Repository(
merge_idx,
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
Expand All @@ -127,6 +128,7 @@ def merge_replay(
)
if not (WORKDIR_DIRECTORY / workdir).exists():
repo = Repository(
merge_idx,
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
Expand All @@ -141,6 +143,7 @@ def merge_replay(
)
if not (WORKDIR_DIRECTORY / workdir).exists():
repo = Repository(
merge_idx,
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
Expand All @@ -164,6 +167,7 @@ def merge_replay(
)
if not (WORKDIR_DIRECTORY / workdir).exists():
repo = Repository(
merge_idx,
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
Expand Down Expand Up @@ -209,6 +213,7 @@ def merge_replay(
continue
try:
repo = Repository(
merge_idx,
repo_slug,
cache_directory=Path("no_cache/"),
workdir_id=workdir,
Expand Down
6 changes: 5 additions & 1 deletion src/python/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,13 @@ def repo_test(wcopy_dir: Path, timeout: int) -> Tuple[TEST_STATE, str]:


class Repository:
"""A class that represents a repository."""
"""A class that represents a repository.
merge_idx is purely for diagnostic purposes.
"""

def __init__(
self,
merge_idx: str,
repo_slug: str,
cache_directory: Path = Path(""),
workdir_id: str = uuid.uuid4().hex, # uuid4 is a random UID
Expand All @@ -197,6 +200,7 @@ def __init__(
            merge_idx (str): The merge index, used purely for diagnostics.
            repo_slug (str): The slug of the repository, which is "owner/reponame".
cache_directory (Path): The prefix of the cache.
"""
self.merge_idx = merge_idx
self.repo_slug = repo_slug.lower()
self.owner, self.name = self.repo_slug.split("/")
self.repo_path = REPOS_PATH / repo_slug
Expand Down
1 change: 1 addition & 0 deletions src/python/test_repo_heads.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def head_passes_tests(args: Tuple[pd.Series, Path]) -> pd.Series:
# Load repo
try:
repo = Repository(
"HEAD",
repo_slug,
cache_directory=cache,
workdir_id=repo_slug + "/head-" + repo_info["repository"],
Expand Down
1 change: 1 addition & 0 deletions src/python/write_head_hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def get_latest_hash(args):
try:
logger.info("write_head_hashes " + repo_slug + " : Cloning repo")
repo = Repository(
"HEAD",
repo_slug,
workdir_id=repo_slug + "/head-" + repo_slug,
lazy_clone=False,
Expand Down

0 comments on commit 18e99be

Please sign in to comment.