Make supervisor a server and introduce modyn client (#329)
Addresses issues #302, #317
jenny011 authored Jan 18, 2024
1 parent f4cb0fe commit 15ad8d9
Showing 100 changed files with 4,950 additions and 2,167 deletions.
4 changes: 2 additions & 2 deletions .github/actions/mamba/action.yml
@@ -4,7 +4,7 @@ runs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.11'

- name: Install prerequisites for pystack
run: sudo apt-get install --no-install-recommends -qy libdw-dev libelf-dev
@@ -16,7 +16,7 @@ runs:

# increase to reset cache manually
- name: Set cache number
run: echo "CACHE_NUMBER=1" >> $GITHUB_ENV
run: echo "CACHE_NUMBER=2" >> $GITHUB_ENV
shell: bash

- name: Setup micromamba
1 change: 1 addition & 0 deletions .pylintrc
@@ -63,6 +63,7 @@ ignore-paths=^modyn/trainer_server/internal/grpc/generated/.*$,
^modyn/models/dlrm/utils/.*$,
^modyn/models/dlrm/nn/.*$,
^modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/.*$,
^modyn/supervisor/internal/grpc/generated/.*$,

# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths. The default value ignores Emacs file
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp0_finetune.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp5_current_day_only.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: False
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp6_retrain_keep_model.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp7_retrain_new_model.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: False # Same amount of computation (retraining on all data), but on different starting weights
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/mnist/mnist.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 96
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -16,8 +16,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
3 changes: 3 additions & 0 deletions docker-compose.yml
@@ -126,6 +126,7 @@ services:
# - .:/modyn_host
# shm_size: 4gb
supervisor:
restart: on-failure
depends_on:
- storage
- metadata-db
@@ -144,6 +145,8 @@
- 3000:50062
tests:
depends_on:
supervisor:
condition: service_started
storage:
condition: service_started
selector:
5 changes: 4 additions & 1 deletion docker/Supervisor/Dockerfile
@@ -1,3 +1,6 @@
FROM modynbase:latest

CMD tail -f /dev/null
RUN chmod a+x /src/modyn/supervisor/modyn-supervisor

# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
CMD mamba run -n modyn --no-capture-output ./modyn/supervisor/modyn-supervisor ./modyn/config/examples/modyn_config.yaml
13 changes: 9 additions & 4 deletions docs/EXAMPLE.md
@@ -26,13 +26,18 @@ This is required for some trainings, e.g., for Criteo training, as you run into
Optionally, you can uncomment the `.:/modyn_host` mount for all services to enable faster development cycles.
This is not required if you do not iterate.
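
For illustration, a hypothetical excerpt of one service entry in `docker-compose.yml` with this development mount uncommented might look as follows; the actual service definitions (and additional settings such as `shm_size`) may differ, and the same mount can be enabled for the other services analogously.

```yaml
# Hypothetical excerpt: the supervisor service with the development bind
# mount enabled, so local source changes are visible at /modyn_host
# inside the container.
supervisor:
  volumes:
    - .:/modyn_host
```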

### Starting the containers and the pipeline
### Starting the containers
Next, run `./scripts/run_modyn.sh` to build the containers and start them.
This may take several minutes for the first time.
After building the containers, run `tmuxp load tmuxp.yaml` to have access to all container shells and logs.
Switch to the supervisor pane (using regular tmux bindings).
There, you can now submit a pipeline using the `modyn-supervisor` command.
For example, you can run `modyn-supervisor --start-replay-at 0 benchmark/mnist/mnist.yaml modyn/config/examples/modyn_config.yaml`.

### Starting the pipeline
You can now submit a pipeline to the supervisor container using the `modyn-client` command provided in `modynclient/client/`.
For example, you can run `modyn-client --start-replay-at 0 --maximum-triggers 1 <pipeline config file> <modyn client config file> .`
An example pipeline config file is `modynclient/config/examples/mnist.yaml`.
Example modyn client config files:
- on your local machine: `modynclient/config/examples/modyn_client_config.yaml`
- in one of the containers: `modynclient/config/examples/modyn_client_config_container.yaml`
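
For instance, assuming the containers are up and you are in the repository root, a concrete invocation using the example configs listed above might look like this (the trailing `.` mirrors the usage line shown above):

```bash
# Replay the dataset from the beginning, stop after one trigger, and use
# the example MNIST pipeline together with the local client config.
modyn-client --start-replay-at 0 --maximum-triggers 1 \
    modynclient/config/examples/mnist.yaml \
    modynclient/config/examples/modyn_client_config.yaml \
    .
```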

### Iterating (for development)
Since we copy the Modyn sources into the containers, if we change something locally outside of the containers, this does not get reflected in the containers.
2 changes: 1 addition & 1 deletion environment.yml
@@ -14,7 +14,7 @@ channels:
- huggingface

dependencies:
- python>=3.9
- python>=3.11
- pip
- tqdm
- conda-forge::enlighten
@@ -0,0 +1,119 @@
pipeline:
  name: 8workers_8prefetch_8parallel
  description: DLRM/Criteo Training. Finetuning, i.e., updating model over time.
  version: 1.0.0
model:
  id: DLRM
  config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM
    embedding_dim: 128
    interaction_op: "cuda_dot"
    hash_indices: False
    bottom_mlp_sizes: [512, 256, 128]
    top_mlp_sizes: [1024, 1024, 512, 256, 1]
    embedding_type: "joint_fused"
    num_numerical_features: 13
    use_cpp_mlp: True
    categorical_features_info:
      cat_0: 7912889
      cat_1: 33823
      cat_2: 17139
      cat_3: 7339
      cat_4: 20046
      cat_5: 4
      cat_6: 7105
      cat_7: 1382
      cat_8: 63
      cat_9: 5554114
      cat_10: 582469
      cat_11: 245828
      cat_12: 11
      cat_13: 2209
      cat_14: 10667
      cat_15: 104
      cat_16: 4
      cat_17: 968
      cat_18: 15
      cat_19: 8165896
      cat_20: 2675940
      cat_21: 7156453
      cat_22: 302516
      cat_23: 12022
      cat_24: 97
      cat_25: 35
training:
  gpus: 1
  device: "cuda:0"
  amp: True
  dataloader_workers: 8
  num_prefetched_partitions: 8
  parallel_prefetch_requests: 8
  use_previous_model: True
  initial_model: random
  batch_size: 65536
  optimizers:
    - name: "mlp"
      algorithm: "FusedSGD"
      source: "APEX"
      param_groups:
        - module: "model.top_model"
          config:
            lr: 24
        - module: "model.bottom_model.mlp"
          config:
            lr: 24
    - name: "opt_1"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model.bottom_model.embeddings"
          config:
            lr: 24
  lr_scheduler:
    name: "DLRMScheduler"
    source: "Custom"
    optimizers: ["mlp", "opt_1"]
    config:
      base_lrs: [[24, 24], [24]]
      warmup_steps: 8000
      warmup_factor: 0
      decay_steps: 24000
      decay_start_step: 48000
      decay_power: 2
      end_lr_factor: 0
  optimization_criterion:
    name: "BCEWithLogitsLoss"
  grad_scaler_config:
    growth_interval: 1000000000
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 2000000
    config:
      limit: -1
      reset_after_trigger: True
data:
  dataset_id: criteo
  bytes_parser_function: |
    import torch
    import numpy as np
    def bytes_parser_function(x: bytes) -> dict:
        num_features = x[:52]
        cat_features = x[52:]
        num_features_array = np.frombuffer(num_features, dtype=np.float32)
        cat_features_array = np.frombuffer(cat_features, dtype=np.int32)
        return {
            "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32),
            "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long)
        }
  label_transformer_function: |
    import torch
    # we need to convert our integer-type labels to floats,
    # since the BCEWithLogitsLoss function does not work with integers.
    def label_transformer_function(x: torch.Tensor) -> torch.Tensor:
        return x.to(torch.float32)
trigger:
  id: DataAmountTrigger
  trigger_config:
    data_points_for_trigger: 20000000

@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 1
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 1
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 1
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 4
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 4
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 4
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"