From ed2a73590853965dd081ed091ec88895120437a3 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Thu, 16 Sep 2021 14:42:25 -0700
Subject: [PATCH 1/2] [WIP] A combined script to run all experiments

---
 examples/mlsys_experiments.json |  9 +++++
 examples/mlsys_experiments.py   | 46 ++++++++++++++++++++++
 examples/mlsys_experiments.sh   | 68 +++++++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+)
 create mode 100644 examples/mlsys_experiments.json
 create mode 100644 examples/mlsys_experiments.sh

diff --git a/examples/mlsys_experiments.json b/examples/mlsys_experiments.json
new file mode 100644
index 00000000..0610463c
--- /dev/null
+++ b/examples/mlsys_experiments.json
@@ -0,0 +1,9 @@
+[
+    {
+        "model": "mlp",
+        "model_size": "mlp-xs",
+        "max_world_size": 4,
+        "min_batch_size": 16,
+        "max_batch_size": 64
+    }
+]
\ No newline at end of file
diff --git a/examples/mlsys_experiments.py b/examples/mlsys_experiments.py
index e4b49486..43130fd8 100644
--- a/examples/mlsys_experiments.py
+++ b/examples/mlsys_experiments.py
@@ -1,5 +1,7 @@
 import argparse
+from itertools import count, takewhile
 import json
+import math
 import os
 
 import pandas as pd
@@ -8,6 +10,9 @@
     calibrate_network_bandwidth,
     calibrate_allreduce_parameters,
 )
+from .mlp_grid_search import MLPGridSearch
+from .gpt2_grid_search import GPTGridSearch
+from .grid_search import run_grid_search
 
 
 def calibrate_parameters(args):
@@ -71,7 +76,47 @@ def calibrate_parameters(args):
         json.dump(simulation_parameters, f)
 
 
+def simulate_from_file(args):
+    with open(args.experiments_file, "r") as f:
+        experiments = json.load(f)
+
+    for experiment in experiments:
+        grid_search_class = {
+            "mlp": MLPGridSearch,
+            "gpt": GPTGridSearch,
+        }
+
+        all_model_sizes = [experiment["model_size"]]
+        # 1, 2, 4, ..., experiment['max_world_size']
+        all_world_sizes = list(
+            takewhile(
+                lambda x: x <= experiment["max_world_size"], (2 ** i for i in count())
+            )
+        )
+        # experiment['min_batch_size'], 2**i, 2**(i+1), ..., experiment['max_batch_size']
+        all_batch_sizes = list(
+            takewhile(
+                lambda x: x <= experiment["max_batch_size"],
+                (2 ** i for i in count(int(math.log(experiment["min_batch_size"], 2)))),
+            )
+        )
+        grid_search_args = argparse.Namespace(
+            simulation_parameters_file=args.simulation_parameters_file,
+            backend="simulate",
+            mode="grid",
+            all_model_sizes=all_model_sizes,
+            all_world_sizes=all_world_sizes,
+            all_batch_sizes=all_batch_sizes,
+            output_file=args.output_file,
+        )
+        if experiment["model"] == "gpt":
+            grid_search_args.model_path = "gpt2-10.onnx"
+
+        run_grid_search(grid_search_args, grid_search_class[experiment["model"]])
+
+
 def prepare_best_grid_search_configs(args):
+    # TODO for both of these, remove throughput/latency columns so it's clear they are input files
     if args.simulation_file is None:
         raise ValueError("Simulation file must be provided")
     # TODO handle files containing multiple model(-size)s
@@ -82,6 +127,7 @@ def prepare_best_grid_search_configs(args):
 
 
 def prepare_accuracy_sample_configs(args):
+    # TODO generate configs again from the experiments file to avoid having to simulate
     if args.simulation_file is None:
         raise ValueError("Simulation file must be provided")
     df = pd.read_csv(args.simulation_file)
diff --git a/examples/mlsys_experiments.sh b/examples/mlsys_experiments.sh
new file mode 100644
index 00000000..1ab186d7
--- /dev/null
+++ b/examples/mlsys_experiments.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# A script to run all MLSys experiments
+
+# TODO parse args etc
+machine_name=$1
+world_size=$2
+
+
+print_header () {
+    printf '=%.0s' {1..100}
+    echo "$1"
+    printf '=%.0s' {1..100}
+    echo
+}
+
+
+params_file="${machine_name}.params.json"
+
+# 1. Run calibration on hardware
+
+print_header "Calibrating simulator"
+python -m examples.mlsys_experiments --mode calibrate \
+    --calibrate_device_parameters --calibrate_allreduce_parameters \
+    --calibrate_network_bandwidth \
+    --output_file $params_file
+
+## TODO For each model:
+
+model="gpt"
+model_size="gpt3-6.7B"
+backend_file="${model_size}_backend.csv"
+simulated_file="${model_size}_simulated.csv"
+best_file="${model_size}_best.csv"
+sample_file="${model_size}_sample.csv"
+
+# 2. Run pure baselines on hardware
+
+print_header "Running pure baselines"
+for strategy in DP HP PP; do
+    ./examples/run_pure_baseline.sh $model $model_size $strategy $world_size \
+        $backend_file
+done
+
+# TODO
+# mlsys_experiments.py reads a JSON containing batch sizes etc. and runs the simulator(?)
+# also outputs the list of inputs to the pure baseline runner above
+# prep-best and prep-sample should be updated to work for files containing multiple models
+
+
+# 3. Run grid search using simulation to find estimated best strategies
+
+python -m examples.gpt2_grid_search --model_path gpt2-10.onnx \
+    --backend simulate --simulation_parameters_file $params_file \
+    --mode grid --all_world_sizes 1 2 4 --all_batch_sizes 512 1024 2048 4096 8192 16384 32768 --all_model_sizes $model_size \
+    --output_file $simulated_file
+
+# 4. Run best strategies on hardware
+
+python -m examples.mlsys_experiments --mode prep-best --simulation_file $simulated_file --output_file $best_file
+
+./examples/run_grid_search.sh $model $best_file $backend_file
+
+# 5. Run (small/random subset of) grid search on hardware for simulator accuracy
+
+python -m examples.mlsys_experiments --mode prep-sample --simulation_file $simulated_file --output_file $sample_file
+
+./examples/run_grid_search.sh $model $sample_file $backend_file

From 665c3abd9b7e85b277cecec53dc79802bf5959b5 Mon Sep 17 00:00:00 2001
From: Keshav Santhanam
Date: Fri, 17 Sep 2021 09:22:37 -0700
Subject: [PATCH 2/2] Add Amulet yaml file

---
 mlsys_experiments.yaml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 mlsys_experiments.yaml

diff --git a/mlsys_experiments.yaml b/mlsys_experiments.yaml
new file mode 100644
index 00000000..ad0bd94e
--- /dev/null
+++ b/mlsys_experiments.yaml
@@ -0,0 +1,29 @@
+description: Run DistIR MLSys experiments
+
+
+
+target:
+  service: amlk8s
+  name: v100-32gb-eus
+  vc: aml-ds
+
+
+environment:
+  image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+  registry: docker.io
+  setup:
+    - pip install -r requirements.txt
+
+
+code:
+  # local directory of the code. this will be uploaded to the server.
+  # $CONFIG_DIR is expanded to the directory of this config file
+  local_dir: $CONFIG_DIR
+
+
+jobs:
+  - name: dist_ir_mlsys_experiments
+    sku: G4
+    command:
+      - python -m examples.mlsys_experiments --mode calibrate --output_file v100.simulation_parameters.json --calibrate_device_parameters --calibrate_allreduce_parameters --calibrate_network_bandwidth
+      - mv v100.simulation_parameters.json $$AMLT_OUTPUT_DIR/
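
For reference, a quick standalone check of the power-of-two enumeration that the new simulate_from_file performs (a sketch only; the numeric values below mirror the sample entry in examples/mlsys_experiments.json):

import math
from itertools import count, takewhile

# Sample experiment, copied from examples/mlsys_experiments.json.
experiment = {"max_world_size": 4, "min_batch_size": 16, "max_batch_size": 64}

# World sizes: 1, 2, 4, ... up to max_world_size.
all_world_sizes = list(
    takewhile(lambda x: x <= experiment["max_world_size"], (2 ** i for i in count()))
)

# Batch sizes: min_batch_size, 2 * min_batch_size, ... up to max_batch_size.
all_batch_sizes = list(
    takewhile(
        lambda x: x <= experiment["max_batch_size"],
        (2 ** i for i in count(int(math.log(experiment["min_batch_size"], 2)))),
    )
)

print(all_world_sizes)  # [1, 2, 4]
print(all_batch_sizes)  # [16, 32, 64]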