Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiments for MLSys submission #37

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/mlsys_experiments.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[
{
"model": "mlp",
"model_size": "mlp-xs",
"max_world_size": 4,
"min_batch_size": 16,
"max_batch_size": 64
}
]
46 changes: 46 additions & 0 deletions examples/mlsys_experiments.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import argparse
from itertools import count, takewhile
import json
import math
import os
import pandas as pd

Expand All @@ -8,6 +10,9 @@
calibrate_network_bandwidth,
calibrate_allreduce_parameters,
)
from .mlp_grid_search import MLPGridSearch
from .gpt2_grid_search import GPTGridSearch
from .grid_search import run_grid_search


def calibrate_parameters(args):
Expand Down Expand Up @@ -71,7 +76,47 @@ def calibrate_parameters(args):
json.dump(simulation_parameters, f)


def _powers_of_two(start, stop):
    """Return consecutive powers of two ``start, 2*start, ...`` that are
    <= ``stop``. ``start`` is assumed to be a power of two."""
    sizes = []
    value = start
    while value <= stop:
        sizes.append(value)
        value *= 2
    return sizes


def simulate_from_file(args):
    """Run a simulated grid search for every experiment in a JSON file.

    ``args.experiments_file`` names a JSON list of experiment dicts with keys
    ``model`` ("mlp" or "gpt"), ``model_size``, ``max_world_size``,
    ``min_batch_size``, and ``max_batch_size``
    (see examples/mlsys_experiments.json). Each experiment is forwarded to
    ``run_grid_search`` with ``backend="simulate"``; results go to
    ``args.output_file``.

    Raises:
        ValueError: if an experiment's ``model`` is not "mlp" or "gpt".
    """
    # Model name -> grid search driver. Built once instead of per-experiment.
    grid_search_classes = {
        "mlp": MLPGridSearch,
        "gpt": GPTGridSearch,
    }

    with open(args.experiments_file, "r") as f:
        experiments = json.load(f)

    for experiment in experiments:
        model = experiment["model"]
        if model not in grid_search_classes:
            raise ValueError(
                "Unknown model {!r}; expected one of {}".format(
                    model, sorted(grid_search_classes)
                )
            )

        all_model_sizes = [experiment["model_size"]]

        # World sizes: 1, 2, 4, ..., up to experiment["max_world_size"].
        all_world_sizes = _powers_of_two(1, experiment["max_world_size"])

        # Batch sizes: min_batch_size, 2*min_batch_size, ..., up to
        # max_batch_size. (n - 1).bit_length() is the exponent of the
        # smallest power of two >= n; unlike int(math.log(n, 2)) it is
        # exact for large powers of two and never starts below
        # min_batch_size when it is not itself a power of two.
        min_batch_size = experiment["min_batch_size"]
        all_batch_sizes = _powers_of_two(
            2 ** (min_batch_size - 1).bit_length(), experiment["max_batch_size"]
        )

        grid_search_args = argparse.Namespace(
            simulation_parameters_file=args.simulation_parameters_file,
            backend="simulate",
            mode="grid",
            all_model_sizes=all_model_sizes,
            all_world_sizes=all_world_sizes,
            all_batch_sizes=all_batch_sizes,
            output_file=args.output_file,
        )
        # The GPT grid search additionally needs a base ONNX model to rewrite.
        if model == "gpt":
            grid_search_args.model_path = "gpt2-10.onnx"

        run_grid_search(grid_search_args, grid_search_classes[model])


def prepare_best_grid_search_configs(args):
# TODO for both these, remove throughput/latency columns so it's clear they are input files
if args.simulation_file is None:
raise ValueError("Simulation file must be provided")
# TODO handle files containing multiple model(-size)s
Expand All @@ -82,6 +127,7 @@ def prepare_best_grid_search_configs(args):


def prepare_accuracy_sample_configs(args):
# TODO generate configs again from experiments file to avoid having to simulate
if args.simulation_file is None:
raise ValueError("Simulation file must be provided")
df = pd.read_csv(args.simulation_file)
Expand Down
67 changes: 67 additions & 0 deletions examples/mlsys_experiments.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#! /bin/bash

# A script to run all MLSys experiments on a given machine.
#
# Usage: ./examples/mlsys_experiments.sh <machine_name> <world_size>
#   machine_name  label used to name the calibration parameters file
#   world_size    number of workers passed to the pure-baseline runs

set -e  # abort on the first failing command

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <machine_name> <world_size>" >&2
    exit 1
fi

machine_name=$1
world_size=$2

print_header () {
    # Print $1 flanked by 100-character '=' rules for log readability.
    # The first rule has no trailing newline, so the title continues it.
    printf '=%.0s' {1..100}
    echo "$1"  # quoted to avoid word splitting / globbing of the title
    printf '=%.0s' {1..100}
    echo
}


params_file="${machine_name}.params.json"

# 1. Run calibration on hardware

print_header "Calibrating simulator"
python -m examples.mlsys_experiments --mode calibrate \
    --calibrate_device_parameters --calibrate_allreduce_parameters \
    --calibrate_network_bandwidth \
    --output_file "$params_file"

## TODO For each model:

model="gpt"
model_size="gpt3-6.7B"

# These names depend on model_size, so they must be assigned after it
# (previously they were expanded before model_size was defined and all
# came out as "_backend.csv" etc.).
backend_file="${model_size}_backend.csv"
simulated_file="${model_size}_simulated.csv"
best_file="${model_size}_best.csv"
sample_file="${model_size}_sample.csv"

# 2. Run pure baselines on hardware

print_header "Running pure baselines"
for strategy in DP HP PP; do
    ./examples/run_pure_baseline.sh "$model" "$model_size" "$strategy" \
        "$world_size" "$backend_file"
done

# TODO
# mlsys_experiments.py reads a JSON containing BS etc and runs sim(?)
# also outputs list of inputs to pure baseline runner above
# prep-best and prep-sample should be updated to work for file containing multiple models

# 3. Run grid search using simulation to find estimated best strategies

print_header "Running simulated grid search"
python -m examples.gpt2_grid_search --model_path gpt2-10.onnx \
    --backend simulate --simulation_parameters_file "$params_file" \
    --mode grid --all_world_sizes 1 2 4 \
    --all_batch_sizes 512 1024 2048 4096 8192 16384 32768 \
    --all_model_sizes "$model_size" \
    --output_file "$simulated_file"

# 4. Run best strategies on hardware

print_header "Running best strategies on hardware"
python -m examples.mlsys_experiments --mode prep-best \
    --simulation_file "$simulated_file" --output_file "$best_file"

./examples/run_grid_search.sh "$model" "$best_file" "$backend_file"

# 5. Run (small/random subset of) grid search on hardware for simulator accuracy

print_header "Running accuracy sample on hardware"
python -m examples.mlsys_experiments --mode prep-sample \
    --simulation_file "$simulated_file" --output_file "$sample_file"

./examples/run_grid_search.sh "$model" "$sample_file" "$backend_file"
29 changes: 29 additions & 0 deletions mlsys_experiments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
description: Run DistIR MLSys experiments

# Cluster target for the job.
target:
  service: amlk8s
  name: v100-32gb-eus
  vc: aml-ds

# Docker environment the job runs in.
environment:
  image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
  registry: docker.io
  setup:
    - pip install -r requirements.txt

code:
  # local directory of the code. this will be uploaded to the server.
  # $CONFIG_DIR is expanded to the directory of this config file
  local_dir: $CONFIG_DIR

jobs:
  - name: dist_ir_mlsys_experiments
    sku: G4
    command:
      - python -m examples.mlsys_experiments --mode calibrate --output_file v100.simulation_parameters.json --calibrate_device_parameters --calibrate_allreduce_parameters --calibrate_network_bandwidth
      # $$ escapes the dollar sign so the variable is expanded at job
      # runtime rather than at config-parse time.
      - mv v100.simulation_parameters.json $$AMLT_OUTPUT_DIR/