Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiments for MLSys submission #37

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/mlsys_experiments.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[
{
"model": "mlp",
"model_size": "mlp-xs",
"max_world_size": 4,
"min_batch_size": 16,
"max_batch_size": 64
}
]
46 changes: 46 additions & 0 deletions examples/mlsys_experiments.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
from itertools import count, takewhile
import json
import math
import os
import pandas as pd

Expand All @@ -8,6 +10,9 @@
calibrate_network_bandwidth,
calibrate_allreduce_parameters,
)
from .mlp_grid_search import MLPGridSearch
from .gpt2_grid_search import GPTGridSearch
from .grid_search import run_grid_search


def calibrate_parameters(args):
Expand Down Expand Up @@ -71,7 +76,47 @@ def calibrate_parameters(args):
json.dump(simulation_parameters, f)


def _powers_of_two(start, stop):
    """Return consecutive powers of two ``start, 2*start, ...`` that are
    <= ``stop``. ``start`` is assumed to be a power of two."""
    sizes = []
    value = start
    while value <= stop:
        sizes.append(value)
        value *= 2
    return sizes


def simulate_from_file(args):
    """Run a simulated grid search for every experiment in a JSON file.

    ``args.experiments_file`` names a JSON list of experiment dicts with keys
    ``model`` ("mlp" or "gpt"), ``model_size``, ``max_world_size``,
    ``min_batch_size``, and ``max_batch_size``
    (see examples/mlsys_experiments.json). Each experiment is forwarded to
    ``run_grid_search`` with ``backend="simulate"``; results go to
    ``args.output_file``.

    Raises:
        ValueError: if an experiment's ``model`` is not "mlp" or "gpt".
    """
    # Model name -> grid search driver. Built once instead of per-experiment.
    grid_search_classes = {
        "mlp": MLPGridSearch,
        "gpt": GPTGridSearch,
    }

    with open(args.experiments_file, "r") as f:
        experiments = json.load(f)

    for experiment in experiments:
        model = experiment["model"]
        if model not in grid_search_classes:
            raise ValueError(
                "Unknown model {!r}; expected one of {}".format(
                    model, sorted(grid_search_classes)
                )
            )

        all_model_sizes = [experiment["model_size"]]

        # World sizes: 1, 2, 4, ..., up to experiment["max_world_size"].
        all_world_sizes = _powers_of_two(1, experiment["max_world_size"])

        # Batch sizes: min_batch_size, 2*min_batch_size, ..., up to
        # max_batch_size. (n - 1).bit_length() is the exponent of the
        # smallest power of two >= n; unlike int(math.log(n, 2)) it is
        # exact for large powers of two and never starts below
        # min_batch_size when it is not itself a power of two.
        min_batch_size = experiment["min_batch_size"]
        all_batch_sizes = _powers_of_two(
            2 ** (min_batch_size - 1).bit_length(), experiment["max_batch_size"]
        )

        grid_search_args = argparse.Namespace(
            simulation_parameters_file=args.simulation_parameters_file,
            backend="simulate",
            mode="grid",
            all_model_sizes=all_model_sizes,
            all_world_sizes=all_world_sizes,
            all_batch_sizes=all_batch_sizes,
            output_file=args.output_file,
        )
        # The GPT grid search additionally needs a base ONNX model to rewrite.
        if model == "gpt":
            grid_search_args.model_path = "gpt2-10.onnx"

        run_grid_search(grid_search_args, grid_search_classes[model])


def prepare_best_grid_search_configs(args):
# TODO for both these, remove throughput/latency columns so it's clear they are input files
if args.simulation_file is None:
raise ValueError("Simulation file must be provided")
# TODO handle files containing multiple model(-size)s
Expand All @@ -82,6 +127,7 @@ def prepare_best_grid_search_configs(args):


def prepare_accuracy_sample_configs(args):
# TODO generate configs again from experiments file to avoid having to simulate
if args.simulation_file is None:
raise ValueError("Simulation file must be provided")
df = pd.read_csv(args.simulation_file)
Expand Down
67 changes: 67 additions & 0 deletions examples/mlsys_experiments.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#! /bin/bash

# A script to run all MLSys experiments on a given machine.
#
# Usage: ./examples/mlsys_experiments.sh <machine_name> <world_size>
#   machine_name  label used to name the calibration parameters file
#   world_size    number of workers passed to the pure-baseline runs

set -e  # abort on the first failing command

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <machine_name> <world_size>" >&2
    exit 1
fi

machine_name=$1
world_size=$2

print_header () {
    # Print $1 flanked by 100-character '=' rules for log readability.
    # The first rule has no trailing newline, so the title continues it.
    printf '=%.0s' {1..100}
    echo "$1"  # quoted to avoid word splitting / globbing of the title
    printf '=%.0s' {1..100}
    echo
}


params_file="${machine_name}.params.json"

# 1. Run calibration on hardware

print_header "Calibrating simulator"
python -m examples.mlsys_experiments --mode calibrate \
    --calibrate_device_parameters --calibrate_allreduce_parameters \
    --calibrate_network_bandwidth \
    --output_file "$params_file"

## TODO For each model:

model="gpt"
model_size="gpt3-6.7B"

# These names depend on model_size, so they must be assigned after it
# (previously they were expanded before model_size was defined and all
# came out as "_backend.csv" etc.).
backend_file="${model_size}_backend.csv"
simulated_file="${model_size}_simulated.csv"
best_file="${model_size}_best.csv"
sample_file="${model_size}_sample.csv"

# 2. Run pure baselines on hardware

print_header "Running pure baselines"
for strategy in DP HP PP; do
    ./examples/run_pure_baseline.sh "$model" "$model_size" "$strategy" \
        "$world_size" "$backend_file"
done

# TODO
# mlsys_experiments.py reads a JSON containing BS etc and runs sim(?)
# also outputs list of inputs to pure baseline runner above
# prep-best and prep-sample should be updated to work for file containing multiple models

# 3. Run grid search using simulation to find estimated best strategies

print_header "Running simulated grid search"
python -m examples.gpt2_grid_search --model_path gpt2-10.onnx \
    --backend simulate --simulation_parameters_file "$params_file" \
    --mode grid --all_world_sizes 1 2 4 \
    --all_batch_sizes 512 1024 2048 4096 8192 16384 32768 \
    --all_model_sizes "$model_size" \
    --output_file "$simulated_file"

# 4. Run best strategies on hardware

print_header "Running best strategies on hardware"
python -m examples.mlsys_experiments --mode prep-best \
    --simulation_file "$simulated_file" --output_file "$best_file"

./examples/run_grid_search.sh "$model" "$best_file" "$backend_file"

# 5. Run (small/random subset of) grid search on hardware for simulator accuracy

print_header "Running accuracy sample on hardware"
python -m examples.mlsys_experiments --mode prep-sample \
    --simulation_file "$simulated_file" --output_file "$sample_file"

./examples/run_grid_search.sh "$model" "$sample_file" "$backend_file"
29 changes: 29 additions & 0 deletions mlsys_experiments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
description: Run DistIR MLSys experiments

# Cluster target for the job.
target:
  service: amlk8s
  name: v100-32gb-eus
  vc: aml-ds

# Docker environment the job runs in.
environment:
  image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
  registry: docker.io
  setup:
    - pip install -r requirements.txt

code:
  # local directory of the code. this will be uploaded to the server.
  # $CONFIG_DIR is expanded to the directory of this config file
  local_dir: $CONFIG_DIR

jobs:
  - name: dist_ir_mlsys_experiments
    sku: G4
    command:
      - python -m examples.mlsys_experiments --mode calibrate --output_file v100.simulation_parameters.json --calibrate_device_parameters --calibrate_allreduce_parameters --calibrate_network_bandwidth
      # $$ escapes the dollar sign so the variable is expanded at job
      # runtime rather than at config-parse time.
      - mv v100.simulation_parameters.json $$AMLT_OUTPUT_DIR/