Skip to content

Commit

Permalink
Renaming prep
Browse files Browse the repository at this point in the history
  • Loading branch information
benedikt-schesch committed Dec 3, 2023
1 parent 6134cb2 commit 42d3245
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/python/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
Returns:
Path: The path to the cache file.
"""
cache_file_name = slug_repo_name(repo_slug) + ".json"
cache_file_name = repo_slug + ".json"
cache_path = cache_directory / cache_file_name
cache_path.parent.mkdir(parents=True, exist_ok=True)
return cache_path
Expand Down
53 changes: 53 additions & 0 deletions src/python/renaming_caches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
""" Cache renaming."""
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from cache_utils import slug_repo_name, write_cache


def old_path(repo_slug: str, cache_directory: Path):
    """Build the location of a repository's JSON cache under the old naming.

    Args:
        repo_slug: Repository identifier, passed through ``slug_repo_name``
            to obtain the old file stem.
        cache_directory: Directory that holds the cache files.

    Returns:
        Path: ``cache_directory`` joined with ``<slug_repo_name(repo_slug)>.json``.
    """
    legacy_stem = slug_repo_name(repo_slug)
    return cache_directory / f"{legacy_stem}.json"


if __name__ == "__main__":
cache_root = Path("cache")
cache_dirs = [
cache_root / cache_dir
for cache_dir in [
"merge_analysis",
"repos_head_info",
"sha_cache_entry",
"test_cache",
]
]
repos_df = pd.read_csv("input_data/repos.csv")
for cache_directory in tqdm(cache_dirs):
for idx, row in tqdm(repos_df.iterrows(), total=len(repos_df)):
analyze = True
# Check if name conflict exists
for idx2, row2 in repos_df.iterrows():
if idx2 == idx:
continue
if slug_repo_name(row["repository"]) == slug_repo_name(
row2["repository"]
):
print("Name conflict exists", row["repository"], row2["repository"])
analyze = False
if not analyze:
continue

repo_slug = row["repository"]
old_cache_path = old_path(repo_slug, cache_directory)
try:
with open(old_cache_path, "r", encoding="utf-8") as f:
cache = json.load(f)
except FileNotFoundError:
continue

write_cache(cache, repo_slug, cache_directory)
old_cache_path.unlink()
48 changes: 48 additions & 0 deletions src/python/renaming_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
"""Renames the cache files to the new naming scheme."""
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from cache_utils import slug_repo_name


def old_path(repo_slug: str, cache_directory: Path):
    """Build the location of a repository's CSV file under the old naming.

    Args:
        repo_slug: Repository identifier, passed through ``slug_repo_name``
            to obtain the old file stem.
        cache_directory: Directory that holds the CSV files.

    Returns:
        Path: ``cache_directory`` joined with ``<slug_repo_name(repo_slug)>.csv``.
    """
    legacy_stem = slug_repo_name(repo_slug)
    return cache_directory / f"{legacy_stem}.csv"


if __name__ == "__main__":
cache_root = Path("results")
cache_dirs = [
cache_root / cache_dir
for cache_dir in ["merges", "merges_analyzed", "merges_sampled", "merges_tests"]
]
repos_df = pd.read_csv("input_data/repos.csv")
for cache_directory in tqdm(cache_dirs):
for idx, row in tqdm(repos_df.iterrows(), total=len(repos_df)):
analyze = True
# Check if name conflict exists
for idx2, row2 in repos_df.iterrows():
if idx2 == idx:
continue
if slug_repo_name(row["repository"]) == slug_repo_name(
row2["repository"]
):
print("Name conflict exists", row["repository"], row2["repository"])
analyze = False
if not analyze:
continue

repo_slug = row["repository"]
old_cache_path = old_path(repo_slug, cache_directory)
try:
res = pd.read_csv(old_cache_path)
except FileNotFoundError:
continue

new_path = cache_directory / (repo_slug + ".csv")
new_path.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(new_path, index=False)
old_cache_path.unlink()

0 comments on commit 42d3245

Please sign in to comment.