Renaming prep

benedikt-schesch · Dec 3, 2023 · 42d3245 · 42d3245
1 parent 6134cb2
commit 42d3245
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 1 deletion.
diff --git a/src/python/cache_utils.py b/src/python/cache_utils.py
@@ -127,7 +127,7 @@ def get_cache_path(repo_slug: str, cache_directory: Path) -> Path:
     Returns:
         Path: The path to the cache file.
     """
-    cache_file_name = slug_repo_name(repo_slug) + ".json"
+    cache_file_name = repo_slug + ".json"
     cache_path = cache_directory / cache_file_name
     cache_path.parent.mkdir(parents=True, exist_ok=True)
     return cache_path

diff --git a/src/python/renaming_caches.py b/src/python/renaming_caches.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+""" Cache renaming."""
+import json
+from pathlib import Path
+from tqdm import tqdm
+import pandas as pd
+from cache_utils import slug_repo_name, write_cache
+
+
+def old_path(repo_slug: str, cache_directory: Path):
+    """Returns the old path to the cache file."""
+    cache_file_name = slug_repo_name(repo_slug) + ".json"  # old scheme: slug_repo_name() output plus ".json"
+    cache_path = cache_directory / cache_file_name  # path is only built here; existence is not checked
+    return cache_path
+
+
+if __name__ == "__main__":
+    cache_root = Path("cache")  # paths are relative to the current working directory
+    cache_dirs = [
+        cache_root / cache_dir
+        for cache_dir in [
+            "merge_analysis",
+            "repos_head_info",
+            "sha_cache_entry",
+            "test_cache",
+        ]
+    ]
+    repos_df = pd.read_csv("input_data/repos.csv")  # only the "repository" column is read below
+    for cache_directory in tqdm(cache_dirs):
+        for idx, row in tqdm(repos_df.iterrows(), total=len(repos_df)):
+            analyze = True
+            # Check if name conflict exists: another row whose slug_repo_name() equals this row's
+            for idx2, row2 in repos_df.iterrows():  # NOTE(review): full rescan per row and per directory
+                if idx2 == idx:
+                    continue
+                if slug_repo_name(row["repository"]) == slug_repo_name(
+                    row2["repository"]
+                ):
+                    print("Name conflict exists", row["repository"], row2["repository"])
+                    analyze = False  # no break: every conflicting partner gets printed
+            if not analyze:
+                continue  # old file name is ambiguous between repositories, so leave it untouched
+
+            repo_slug = row["repository"]
+            old_cache_path = old_path(repo_slug, cache_directory)
+            try:
+                with open(old_cache_path, "r", encoding="utf-8") as f:
+                    cache = json.load(f)
+            except FileNotFoundError:
+                continue  # no file under the old name in this directory: nothing to migrate
+
+            write_cache(cache, repo_slug, cache_directory)  # presumably writes under the new naming scheme - confirm in cache_utils
+            old_cache_path.unlink()  # NOTE(review): deletes the fresh file if write_cache targets this same path - confirm
diff --git a/src/python/renaming_files.py b/src/python/renaming_files.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+"""Renames the cache files to the new naming scheme."""
+from pathlib import Path
+import pandas as pd
+from tqdm import tqdm
+from cache_utils import slug_repo_name
+
+
+def old_path(repo_slug: str, cache_directory: Path):
+    """Returns the old path to the cache file."""
+    cache_file_name = slug_repo_name(repo_slug) + ".csv"  # old scheme: slug_repo_name() output plus ".csv"
+    cache_path = cache_directory / cache_file_name  # path is only built here; existence is not checked
+    return cache_path
+
+
+if __name__ == "__main__":
+    cache_root = Path("results")  # paths are relative to the current working directory
+    cache_dirs = [
+        cache_root / cache_dir
+        for cache_dir in ["merges", "merges_analyzed", "merges_sampled", "merges_tests"]
+    ]
+    repos_df = pd.read_csv("input_data/repos.csv")  # only the "repository" column is read below
+    for cache_directory in tqdm(cache_dirs):
+        for idx, row in tqdm(repos_df.iterrows(), total=len(repos_df)):
+            analyze = True
+            # Check if name conflict exists: another row whose slug_repo_name() equals this row's
+            for idx2, row2 in repos_df.iterrows():  # NOTE(review): full rescan per row and per directory
+                if idx2 == idx:
+                    continue
+                if slug_repo_name(row["repository"]) == slug_repo_name(
+                    row2["repository"]
+                ):
+                    print("Name conflict exists", row["repository"], row2["repository"])
+                    analyze = False  # no break: every conflicting partner gets printed
+            if not analyze:
+                continue  # old file name is ambiguous between repositories, so leave it untouched
+
+            repo_slug = row["repository"]
+            old_cache_path = old_path(repo_slug, cache_directory)
+            try:
+                res = pd.read_csv(old_cache_path)  # doubles as the existence check for the old file
+            except FileNotFoundError:
+                continue  # no file under the old name in this directory: nothing to migrate
+
+            new_path = cache_directory / (repo_slug + ".csv")  # new scheme: the raw repository value plus ".csv"
+            new_path.parent.mkdir(parents=True, exist_ok=True)  # creates any missing parent directories of new_path
+            res.to_csv(new_path, index=False)  # re-serialized via pandas, not moved; NOTE(review): confirm round trip is lossless
+            old_cache_path.unlink()  # NOTE(review): deletes the fresh file if new_path == old_cache_path - confirm