Make supervisor a server and introduce modyn client (#329)
Addresses issues #302, #317
jenny011 authored Jan 18, 2024
1 parent f4cb0fe commit 15ad8d9
Showing 100 changed files with 4,950 additions and 2,167 deletions.
4 changes: 2 additions & 2 deletions .github/actions/mamba/action.yml
@@ -4,7 +4,7 @@ runs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.11'

- name: Install prerequisites for pystack
run: sudo apt-get install --no-install-recommends -qy libdw-dev libelf-dev
@@ -16,7 +16,7 @@ runs:

# increase to reset cache manually
- name: Set cache number
run: echo "CACHE_NUMBER=1" >> $GITHUB_ENV
run: echo "CACHE_NUMBER=2" >> $GITHUB_ENV
shell: bash

- name: Setup micromamba
1 change: 1 addition & 0 deletions .pylintrc
@@ -63,6 +63,7 @@ ignore-paths=^modyn/trainer_server/internal/grpc/generated/.*$,
^modyn/models/dlrm/utils/.*$,
^modyn/models/dlrm/nn/.*$,
^modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/.*$,
^modyn/supervisor/internal/grpc/generated/.*$,

# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths. The default value ignores Emacs file
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp0_finetune.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp5_current_day_only.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: False
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp6_retrain_keep_model.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/criteo_1TB/pipelines/exp7_retrain_new_model.yml
@@ -52,8 +52,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: False # Same amount of computation (retraining on all data), but on different starting weights
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
2 changes: 0 additions & 2 deletions benchmark/mnist/mnist.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 96
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml
@@ -15,8 +15,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
2 changes: 0 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -16,8 +16,6 @@ training:
dataloader_workers: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 64
optimizers:
- name: "default"
3 changes: 3 additions & 0 deletions docker-compose.yml
@@ -126,6 +126,7 @@ services:
# - .:/modyn_host
# shm_size: 4gb
supervisor:
restart: on-failure
depends_on:
- storage
- metadata-db
@@ -144,6 +145,8 @@
- 3000:50062
tests:
depends_on:
supervisor:
condition: service_started
storage:
condition: service_started
selector:
5 changes: 4 additions & 1 deletion docker/Supervisor/Dockerfile
@@ -1,3 +1,6 @@
FROM modynbase:latest

CMD tail -f /dev/null
RUN chmod a+x /src/modyn/supervisor/modyn-supervisor

# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
CMD mamba run -n modyn --no-capture-output ./modyn/supervisor/modyn-supervisor ./modyn/config/examples/modyn_config.yaml
13 changes: 9 additions & 4 deletions docs/EXAMPLE.md
@@ -26,13 +26,18 @@ This is required for some trainings, e.g., for Criteo training, as you run into
Optionally, you can uncomment the `.:/modyn_host` mount for all services to enable faster development cycles.
This is not required if you do not iterate.
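
For illustration, a hypothetical excerpt of one service entry in `docker-compose.yml` with this development mount uncommented might look as follows; the actual service definitions (and additional settings such as `shm_size`) may differ, and the same mount can be enabled for the other services analogously.

```yaml
# Hypothetical excerpt: the supervisor service with the development bind
# mount enabled, so local source changes are visible at /modyn_host
# inside the container.
supervisor:
  volumes:
    - .:/modyn_host
```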

### Starting the containers and the pipeline
### Starting the containers
Next, run `./scripts/run_modyn.sh` to build the containers and start them.
This may take several minutes for the first time.
After building the containers, run `tmuxp load tmuxp.yaml` to have access to all container shells and logs.
Switch to the supervisor pane (using regular tmux bindings).
There, you can now submit a pipeline using the `modyn-supervisor` command.
For example, you can run `modyn-supervisor --start-replay-at 0 benchmark/mnist/mnist.yaml modyn/config/examples/modyn_config.yaml`.

### Starting the pipeline
You can now submit a pipeline to the supervisor container using the `modyn-client` command provided in `modynclient/client/`.
For example, you can run `modyn-client --start-replay-at 0 --maximum-triggers 1 <pipeline config file> <modyn client config file> .`
An example pipeline config file is `modynclient/config/examples/mnist.yaml`.
Example modyn client config files:
- on your local machine: `modynclient/config/examples/modyn_client_config.yaml`
- in one of the containers: `modynclient/config/examples/modyn_client_config_container.yaml`
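
For instance, assuming the containers are up and you are in the repository root, a concrete invocation using the example configs listed above might look like this (the trailing `.` mirrors the usage line shown above):

```bash
# Replay the dataset from the beginning, stop after one trigger, and use
# the example MNIST pipeline together with the local client config.
modyn-client --start-replay-at 0 --maximum-triggers 1 \
    modynclient/config/examples/mnist.yaml \
    modynclient/config/examples/modyn_client_config.yaml \
    .
```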

### Iterating (for development)
Since we copy the Modyn sources into the containers, if we change something locally outside of the containers, this does not get reflected in the containers.
2 changes: 1 addition & 1 deletion environment.yml
@@ -14,7 +14,7 @@ channels:
- huggingface

dependencies:
- python>=3.9
- python>=3.11
- pip
- tqdm
- conda-forge::enlighten
@@ -0,0 +1,119 @@
pipeline:
  name: 8workers_8prefetch_8parallel
  description: DLRM/Criteo Training. Finetuning, i.e., updating model over time.
  version: 1.0.0
model:
  id: DLRM
  config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM
    embedding_dim: 128
    interaction_op: "cuda_dot"
    hash_indices: False
    bottom_mlp_sizes: [512, 256, 128]
    top_mlp_sizes: [1024, 1024, 512, 256, 1]
    embedding_type: "joint_fused"
    num_numerical_features: 13
    use_cpp_mlp: True
    categorical_features_info:
      cat_0: 7912889
      cat_1: 33823
      cat_2: 17139
      cat_3: 7339
      cat_4: 20046
      cat_5: 4
      cat_6: 7105
      cat_7: 1382
      cat_8: 63
      cat_9: 5554114
      cat_10: 582469
      cat_11: 245828
      cat_12: 11
      cat_13: 2209
      cat_14: 10667
      cat_15: 104
      cat_16: 4
      cat_17: 968
      cat_18: 15
      cat_19: 8165896
      cat_20: 2675940
      cat_21: 7156453
      cat_22: 302516
      cat_23: 12022
      cat_24: 97
      cat_25: 35
training:
  gpus: 1
  device: "cuda:0"
  amp: True
  dataloader_workers: 8
  num_prefetched_partitions: 8
  parallel_prefetch_requests: 8
  use_previous_model: True
  initial_model: random
  batch_size: 65536
  optimizers:
    - name: "mlp"
      algorithm: "FusedSGD"
      source: "APEX"
      param_groups:
        - module: "model.top_model"
          config:
            lr: 24
        - module: "model.bottom_model.mlp"
          config:
            lr: 24
    - name: "opt_1"
      algorithm: "SGD"
      source: "PyTorch"
      param_groups:
        - module: "model.bottom_model.embeddings"
          config:
            lr: 24
  lr_scheduler:
    name: "DLRMScheduler"
    source: "Custom"
    optimizers: ["mlp", "opt_1"]
    config:
      base_lrs: [[24, 24], [24]]
      warmup_steps: 8000
      warmup_factor: 0
      decay_steps: 24000
      decay_start_step: 48000
      decay_power: 2
      end_lr_factor: 0
  optimization_criterion:
    name: "BCEWithLogitsLoss"
  grad_scaler_config:
    growth_interval: 1000000000
  checkpointing:
    activated: False
  selection_strategy:
    name: NewDataStrategy
    maximum_keys_in_memory: 2000000
    config:
      limit: -1
      reset_after_trigger: True
data:
  dataset_id: criteo
  bytes_parser_function: |
    import torch
    import numpy as np
    def bytes_parser_function(x: bytes) -> dict:
        num_features = x[:52]
        cat_features = x[52:]
        num_features_array = np.frombuffer(num_features, dtype=np.float32)
        cat_features_array = np.frombuffer(cat_features, dtype=np.int32)
        return {
            "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32),
            "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long)
        }
  label_transformer_function: |
    import torch
    # we need to convert our integer-type labels to floats,
    # since the BCEWithLogitsLoss function does not work with integers.
    def label_transformer_function(x: torch.Tensor) -> torch.Tensor:
        return x.to(torch.float32)
trigger:
  id: DataAmountTrigger
  trigger_config:
    data_points_for_trigger: 20000000

@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 1
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 1
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 1
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 4
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 4
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"
@@ -53,8 +53,6 @@ training:
parallel_prefetch_requests: 4
use_previous_model: True
initial_model: random
initial_pass:
activated: False
batch_size: 65536
optimizers:
- name: "mlp"