Merge branch 'main' into feature/MaxiBoether/fast-selector-insertion

eth-easl · Oct 25, 2023 · d9a6f10
2 parents ea8779a + 98bd787
commit d9a6f10
Showing 97 changed files with 3,378 additions and 666 deletions.
diff --git a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml
@@ -40,6 +40,9 @@ model:
       cat_23: 12022
       cat_24: 97
       cat_25: 35
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml
@@ -40,6 +40,9 @@ model:
       cat_23: 12022
       cat_24: 97
       cat_25: 35
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml
@@ -40,6 +40,9 @@ model:
       cat_23: 12022
       cat_24: 97
       cat_25: 35
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml
@@ -40,6 +40,9 @@ model:
       cat_23: 12022
       cat_24: 97
       cat_25: 35
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml
@@ -40,6 +40,9 @@ model:
       cat_23: 12022
       cat_24: 97
       cat_25: 35
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml
@@ -6,6 +6,9 @@ model:
   id: ResNet18
   config:
     num_classes: 10
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"
@@ -43,7 +46,6 @@ data:
     import io
     def bytes_parser_function(data: bytes) -> Image:
       return Image.open(io.BytesIO(data)).convert("RGB")
-
 trigger:
   id: DataAmountTrigger
   trigger_config:

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml
@@ -6,6 +6,9 @@ model:
   id: ArticleNet
   config:
     num_classes: 172
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml
@@ -6,6 +6,9 @@ model:
   id: FmowNet
   config:
     num_classes: 62
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml
@@ -6,6 +6,9 @@ model:
   id: ArticleNet
   config:
     num_classes: 55
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -7,6 +7,9 @@ model:
   config:
     num_input_channels: 1
     num_classes: 2
+model_storage:
+  full_model_strategy:
+    name: "PyTorchFullModel"
 training:
   gpus: 1
   device: "cuda:0"

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -71,6 +71,8 @@ services:
     build:
       context: .
       dockerfile: docker/Model_Storage/Dockerfile
+    volumes:
+      - model_storage-data:/tmp/models
   evaluator:
     restart: on-failure
     depends_on:
@@ -85,6 +87,7 @@ services:
       - storage
       - selector
       - model_storage
+      - metadata-db
     build:
       context: .
       dockerfile: docker/Trainer_Server/Dockerfile
@@ -159,4 +162,5 @@ services:
 volumes:
   storage-data:
   selector-data:
-  downsampling-data:
+  downsampling-data:
+  model_storage-data:
diff --git a/docker/Model_Storage/Dockerfile b/docker/Model_Storage/Dockerfile
@@ -1,6 +1,8 @@
 FROM modynbase:latest
 
 RUN chmod a+x /src/modyn/model_storage/modyn-model-storage
+RUN mkdir -p /tmp/models
+RUN chown appuser /tmp/models
 
 # During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
 CMD mamba run -n modyn --no-capture-output ./modyn/model_storage/modyn-model-storage ./modyn/config/examples/modyn_config.yaml
diff --git a/environment.yml b/environment.yml
@@ -27,6 +27,7 @@ dependencies:
   - pyaml
   - numpy
   - pandas
+  - bitstring
   - tensorboard
   - scipy
   - pyftpdlib

diff --git a/integrationtests/metadata_processor/integrationtest_metadata_processor.py b/integrationtests/metadata_processor/integrationtest_metadata_processor.py
@@ -6,6 +6,7 @@
 import yaml
 from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection
 from modyn.metadata_database.models import SampleTrainingMetadata, TriggerTrainingMetadata
+from modyn.metadata_database.utils import ModelStorageStrategyConfig
 
 # pylint: disable-next=no-name-in-module
 from modyn.metadata_processor.internal.grpc.generated.metadata_processor_pb2 import (  # noqa: E402, E501
@@ -49,7 +50,9 @@ def get_grpc_channel(config: dict, component: str) -> grpc.Channel:
 
 def send_metadata_and_check_database(processor_client: MetadataProcessorClient, config: dict) -> int:
     with MetadataDatabaseConnection(config) as database:
-        pipeline_id = database.register_pipeline(2)
+        pipeline_id = database.register_pipeline(
+            2, "ResNet18", "{}", False, ModelStorageStrategyConfig("PyTorchFullModel")
+        )
 
     req = TrainingMetadataRequest(
         pipeline_id=pipeline_id,
@@ -27,6 +27,7 @@ dependencies:
       - pyaml
       - numpy
       - pandas
+      - bitstring
       - tensorboard
       - scipy
       - pyftpdlib