From ed2a73590853965dd081ed091ec88895120437a3 Mon Sep 17 00:00:00 2001
From: Siddharth Krishna
Date: Thu, 16 Sep 2021 14:42:25 -0700
Subject: [PATCH 1/2] [WIP] A combined script to run all experiments

---
 examples/mlsys_experiments.json |  9 +++++
 examples/mlsys_experiments.py   | 46 ++++++++++++++++++++++
 examples/mlsys_experiments.sh   | 68 +++++++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+)
 create mode 100644 examples/mlsys_experiments.json
 create mode 100644 examples/mlsys_experiments.sh

diff --git a/examples/mlsys_experiments.json b/examples/mlsys_experiments.json
new file mode 100644
index 00000000..0610463c
--- /dev/null
+++ b/examples/mlsys_experiments.json
@@ -0,0 +1,9 @@
+[
+    {
+        "model": "mlp",
+        "model_size": "mlp-xs",
+        "max_world_size": 4,
+        "min_batch_size": 16,
+        "max_batch_size": 64
+    }
+]
\ No newline at end of file
diff --git a/examples/mlsys_experiments.py b/examples/mlsys_experiments.py
index e4b49486..43130fd8 100644
--- a/examples/mlsys_experiments.py
+++ b/examples/mlsys_experiments.py
@@ -1,5 +1,7 @@
 import argparse
+from itertools import count, takewhile
 import json
+import math
 import os
 
 import pandas as pd
@@ -8,6 +10,9 @@
     calibrate_network_bandwidth,
     calibrate_allreduce_parameters,
 )
+from .mlp_grid_search import MLPGridSearch
+from .gpt2_grid_search import GPTGridSearch
+from .grid_search import run_grid_search
 
 
 def calibrate_parameters(args):
@@ -71,7 +76,47 @@ def calibrate_parameters(args):
         json.dump(simulation_parameters, f)
 
 
+def simulate_from_file(args):
+    with open(args.experiments_file, "r") as f:
+        experiments = json.load(f)
+
+    for experiment in experiments:
+        grid_search_class = {
+            "mlp": MLPGridSearch,
+            "gpt": GPTGridSearch,
+        }
+
+        all_model_sizes = [experiment["model_size"]]
+        # 1, 2, 4, ..., experiment['max_world_size']
+        all_world_sizes = list(
+            takewhile(
+                lambda x: x <= experiment["max_world_size"], (2 ** i for i in count())
+            )
+        )
+        # experiment['min_batch_size'], 2**i, 2**(i+1), ..., experiment['max_batch_size']
+        all_batch_sizes = list(
+            takewhile(
+                lambda x: x <= experiment["max_batch_size"],
+                (2 ** i for i in count(int(math.log(experiment["min_batch_size"], 2)))),
+            )
+        )
+        grid_search_args = argparse.Namespace(
+            simulation_parameters_file=args.simulation_parameters_file,
+            backend="simulate",
+            mode="grid",
+            all_model_sizes=all_model_sizes,
+            all_world_sizes=all_world_sizes,
+            all_batch_sizes=all_batch_sizes,
+            output_file=args.output_file,
+        )
+        if experiment["model"] == "gpt":
+            grid_search_args.model_path = "gpt2-10.onnx"
+
+        run_grid_search(grid_search_args, grid_search_class[experiment["model"]])
+
+
 def prepare_best_grid_search_configs(args):
+    # TODO for both of these, remove throughput/latency columns so it's clear they are input files
     if args.simulation_file is None:
         raise ValueError("Simulation file must be provided")
     # TODO handle files containing multiple model(-size)s
@@ -82,6 +127,7 @@ def prepare_best_grid_search_configs(args):
 
 
 def prepare_accuracy_sample_configs(args):
+    # TODO generate configs again from the experiments file to avoid having to simulate
     if args.simulation_file is None:
         raise ValueError("Simulation file must be provided")
     df = pd.read_csv(args.simulation_file)
diff --git a/examples/mlsys_experiments.sh b/examples/mlsys_experiments.sh
new file mode 100644
index 00000000..1ab186d7
--- /dev/null
+++ b/examples/mlsys_experiments.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# A script to run all MLSys experiments
+
+# TODO parse args etc
+machine_name=$1
+world_size=$2
+
+
+print_header () {
+    printf '=%.0s' {1..100}
+    echo "$1"
+    printf '=%.0s' {1..100}
+    echo
+}
+
+
+params_file="${machine_name}.params.json"
+
+# 1. Run calibration on hardware
+
+print_header "Calibrating simulator"
+python -m examples.mlsys_experiments --mode calibrate \
+    --calibrate_device_parameters --calibrate_allreduce_parameters \
+    --calibrate_network_bandwidth \
+    --output_file $params_file
+
+## TODO For each model:
+
+model="gpt"
+model_size="gpt3-6.7B"
+backend_file="${model_size}_backend.csv"
+simulated_file="${model_size}_simulated.csv"
+best_file="${model_size}_best.csv"
+sample_file="${model_size}_sample.csv"
+
+# 2. Run pure baselines on hardware
+
+print_header "Running pure baselines"
+for strategy in DP HP PP; do
+    ./examples/run_pure_baseline.sh $model $model_size $strategy $world_size \
+        $backend_file
+done
+
+# TODO
+# mlsys_experiments.py reads a JSON containing batch sizes etc. and runs the simulator(?)
+# also outputs the list of inputs to the pure baseline runner above
+# prep-best and prep-sample should be updated to work for files containing multiple models
+
+
+# 3. Run grid search using simulation to find estimated best strategies
+
+python -m examples.gpt2_grid_search --model_path gpt2-10.onnx \
+    --backend simulate --simulation_parameters_file $params_file \
+    --mode grid --all_world_sizes 1 2 4 --all_batch_sizes 512 1024 2048 4096 8192 16384 32768 --all_model_sizes $model_size \
+    --output_file $simulated_file
+
+# 4. Run best strategies on hardware
+
+python -m examples.mlsys_experiments --mode prep-best --simulation_file $simulated_file --output_file $best_file
+
+./examples/run_grid_search.sh $model $best_file $backend_file
+
+# 5. Run (small/random subset of) grid search on hardware for simulator accuracy
+
+python -m examples.mlsys_experiments --mode prep-sample --simulation_file $simulated_file --output_file $sample_file
+
+./examples/run_grid_search.sh $model $sample_file $backend_file

From 665c3abd9b7e85b277cecec53dc79802bf5959b5 Mon Sep 17 00:00:00 2001
From: Keshav Santhanam
Date: Fri, 17 Sep 2021 09:22:37 -0700
Subject: [PATCH 2/2] Add Amulet yaml file

---
 mlsys_experiments.yaml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 mlsys_experiments.yaml

diff --git a/mlsys_experiments.yaml b/mlsys_experiments.yaml
new file mode 100644
index 00000000..ad0bd94e
--- /dev/null
+++ b/mlsys_experiments.yaml
@@ -0,0 +1,29 @@
+description: Run DistIR MLSys experiments
+
+
+
+target:
+  service: amlk8s
+  name: v100-32gb-eus
+  vc: aml-ds
+
+
+environment:
+  image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+  registry: docker.io
+  setup:
+    - pip install -r requirements.txt
+
+
+code:
+  # local directory of the code. this will be uploaded to the server.
+  # $CONFIG_DIR is expanded to the directory of this config file
+  local_dir: $CONFIG_DIR
+
+
+jobs:
+  - name: dist_ir_mlsys_experiments
+    sku: G4
+    command:
+      - python -m examples.mlsys_experiments --mode calibrate --output_file v100.simulation_parameters.json --calibrate_device_parameters --calibrate_allreduce_parameters --calibrate_network_bandwidth
+      - mv v100.simulation_parameters.json $$AMLT_OUTPUT_DIR/
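
For reference, a quick standalone check of the power-of-two enumeration that the new simulate_from_file performs (a sketch only; the numeric values below mirror the sample entry in examples/mlsys_experiments.json):

import math
from itertools import count, takewhile

# Sample experiment, copied from examples/mlsys_experiments.json.
experiment = {"max_world_size": 4, "min_batch_size": 16, "max_batch_size": 64}

# World sizes: 1, 2, 4, ... up to max_world_size.
all_world_sizes = list(
    takewhile(lambda x: x <= experiment["max_world_size"], (2 ** i for i in count()))
)

# Batch sizes: min_batch_size, 2 * min_batch_size, ... up to max_batch_size.
all_batch_sizes = list(
    takewhile(
        lambda x: x <= experiment["max_batch_size"],
        (2 ** i for i in count(int(math.log(experiment["min_batch_size"], 2)))),
    )
)

print(all_world_sizes)  # [1, 2, 4]
print(all_batch_sizes)  # [16, 32, 64]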