chore: add simple script to pull wandb run data for easy downloading …

…of metrics.zip
EdanToledo · Feb 10, 2025 · 56beb38 · 56beb38
1 parent f6534f9
commit 56beb38
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ The current code in Stoix was initially **largely** taken and subsequently adapt
 
 3. **Hydra Config System:** Leverage the Hydra configuration system for efficient and consistent management of experiments, network architectures, and environments. Hydra facilitates the easy addition of new hyperparameters and supports multi-runs and Optuna hyperparameter optimization. No more need to create large bash scripts to run a series of experiments with differing hyperparameters, network architectures or environments.
 
-4. **Advanced Logging:** Stoix features advanced and configurable logging, ready for output to the terminal, TensorBoard, and other ML tracking dashboards (WandB and Neptune). It also supports logging experiments in JSON format ready for statistical tests and generating RLiable plots (see the plotting notebook). This enables statistically confident comparisons of algorithms natively.
+4. **Advanced Logging:** Stoix features advanced and configurable logging, ready for output to the terminal, TensorBoard, and other ML tracking dashboards (WandB and Neptune). It also supports logging experiments in JSON format ready for statistical tests and generating RLiable plots (see the notebook in the plotting folder). This enables statistically confident comparisons of algorithms natively.
 
 Stoix currently offers the following building blocks for Single-Agent RL research:
 

diff --git a/plotting.ipynb → plotting/plotting.ipynb b/plotting.ipynb → plotting/plotting.ipynb
diff --git a/plotting/pull_wandb_files.py b/plotting/pull_wandb_files.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import zipfile
+from pathlib import Path
+
+import wandb
+
+
+def parse_args():
+    """
+    Parses command-line arguments for this script.
+    """
+    parser = argparse.ArgumentParser(
+        description="""Download and optionally unzip files from W&B runs based on given filters.
+        This is mainly a helper function to pull the JSON results files from different wandb runs
+        logged when using stoix systems."""
+    )
+    parser.add_argument(
+        "--entity",
+        type=str,
+        default="",
+        help="W&B entity (user or organization) from which to download.",
+    )
+    parser.add_argument(
+        "--project", type=str, default="", help="Name of the W&B project from which to download."
+    )
+    parser.add_argument(
+        "--tags",
+        type=str,
+        nargs="+",
+        default=["stoix"],
+        help=(
+            "One or more tags used to filter runs (logical OR). "
+            "For example, '--tags tag1 tag2' matches runs that have tag1 OR tag2."
+        ),
+    )
+    parser.add_argument(
+        "--filename",
+        type=str,
+        default="metrics.zip",
+        help="Name of the file to look for and download from each run.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="unzipped_files",
+        help="Directory where downloaded files (and unzipped contents) will be stored.",
+    )
+    parser.add_argument(
+        "--finished_only",
+        action="store_true",
+        help="If set, only download from runs that have finished (state=finished).",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """
+    Main entry point: queries W&B for runs with specified tags, downloads the desired file,
+    and (if it's a zip) unzips it into a dedicated subdirectory.
+
+    :return: Exit code (0 if everything succeeds).
+    """
+    args = parse_args()
+
+    # Create the output directory if it doesn't exist
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Initialize W&B API
+    api = wandb.Api()
+
+    # Build filters for runs based on tags and possibly state
+    filters = {"tags": {"$in": args.tags}}
+    if args.finished_only:
+        filters["state"] = "finished"
+
+    print(f"Querying runs from '{args.entity}/{args.project}' with filters: {filters}")
+    runs = api.runs(path=f"{args.entity}/{args.project}", filters=filters)
+
+    if not runs:
+        print("No runs found with the given filters.")
+        return 0
+
+    # Loop over each run that matches the filters
+    for run in runs:
+        run_name = run.name or run.id  # Some runs may not have a name
+        run_id = run.id
+        print(f"Processing run '{run_name}' (ID: {run_id})")
+
+        # Check if the desired file is in this run
+        desired_file = None
+        for wandb_file in run.files():
+            # We do a simple substring check here:
+            if args.filename in wandb_file.name:
+                desired_file = wandb_file
+                break
+
+        if not desired_file:
+            print(f" - File '{args.filename}' NOT FOUND in this run. Skipping.\n")
+            continue
+
+        # Create a unique subdirectory for this run.
+        # Combining run.id and run.name ensures uniqueness, even if names repeat.
+        safe_run_name = run_name.replace(" ", "_")
+        run_output_dir = Path(args.output_dir) / f"{run_id}_{safe_run_name}"
+        run_output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Download the file into the run's subdirectory
+        print(f" - Downloading '{args.filename}' to '{run_output_dir}'...")
+        downloaded_path = desired_file.download(root=str(run_output_dir), replace=True)
+
+        # Check if it's a ZIP
+        if not downloaded_path.name.endswith(".zip"):
+            print(
+                f" - The downloaded file '{downloaded_path.name}' is not a ZIP. Skipping unzip.\n"
+            )
+            continue
+
+        # Unzip the file into the same subdirectory
+        print(f" - Unzipping '{downloaded_path.name}' in '{run_output_dir}'...")
+        try:
+            # Resolve the full path to avoid confusion in relative directories
+            zip_full_path = Path(downloaded_path.name).resolve()
+            with zipfile.ZipFile(zip_full_path, "r") as zip_ref:
+                zip_ref.extractall(run_output_dir)
+            print(" - Extraction complete.\n")
+        except zipfile.BadZipFile:
+            print(f" - ERROR: '{downloaded_path.name}' is not a valid zip file.\n")
+
+    print("All matching runs processed.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())