diff --git a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml index c8d0a1275..3b65773ed 100644 --- a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml +++ b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml index e6957ce5b..66ea23ea8 100644 --- a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml +++ b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml index c5697972c..8fe7031d1 100644 --- a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml +++ b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml index e4a51eff4..a10bfeb05 100644 --- a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml +++ b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml index eadfb1341..8e334a3a5 100644 --- a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml +++ b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml index 4b45d946d..f8a9be21c 100644 --- a/benchmark/mnist/mnist.yaml +++ b/benchmark/mnist/mnist.yaml @@ -6,6 +6,9 @@ model: id: ResNet18 config: num_classes: 10 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" @@ -43,7 +46,6 @@ data: import io def bytes_parser_function(data: bytes) -> Image: return Image.open(io.BytesIO(data)).convert("RGB") - trigger: id: DataAmountTrigger trigger_config: diff --git a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml index 0415e6a61..d2161e813 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml @@ -6,6 +6,9 @@ model: id: ArticleNet config: num_classes: 172 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml index 80bd1aa28..bf25c56ec 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml @@ -6,6 +6,9 @@ model: id: FmowNet config: 
num_classes: 62 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml index 667522f18..556e92fed 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml @@ -6,6 +6,9 @@ model: id: ArticleNet config: num_classes: 55 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml index d35dd09b5..2dd650266 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml @@ -7,6 +7,9 @@ model: config: num_input_channels: 1 num_classes: 2 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/docker-compose.yml b/docker-compose.yml index fcf14bf03..965867f92 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,8 @@ services: build: context: . dockerfile: docker/Model_Storage/Dockerfile + volumes: + - model_storage-data:/tmp/models evaluator: restart: on-failure depends_on: @@ -85,6 +87,7 @@ services: - storage - selector - model_storage + - metadata-db build: context: . dockerfile: docker/Trainer_Server/Dockerfile @@ -159,4 +162,5 @@ services: volumes: storage-data: selector-data: - downsampling-data: \ No newline at end of file + downsampling-data: + model_storage-data: \ No newline at end of file diff --git a/docker/Model_Storage/Dockerfile b/docker/Model_Storage/Dockerfile index 6b26823dd..144555e9e 100644 --- a/docker/Model_Storage/Dockerfile +++ b/docker/Model_Storage/Dockerfile @@ -1,6 +1,8 @@ FROM modynbase:latest RUN chmod a+x /src/modyn/model_storage/modyn-model-storage +RUN mkdir -p /tmp/models +RUN chown appuser /tmp/models # During debugging, this entry point will be overridden. 
For more information, please refer to https://aka.ms/vscode-docker-python-debug CMD mamba run -n modyn --no-capture-output ./modyn/model_storage/modyn-model-storage ./modyn/config/examples/modyn_config.yaml \ No newline at end of file diff --git a/environment.yml b/environment.yml index 09c921b32..d89af153c 100644 --- a/environment.yml +++ b/environment.yml @@ -27,6 +27,7 @@ dependencies: - pyaml - numpy - pandas + - bitstring - tensorboard - scipy - pyftpdlib diff --git a/integrationtests/metadata_processor/integrationtest_metadata_processor.py b/integrationtests/metadata_processor/integrationtest_metadata_processor.py index 096105d2f..6b4fba996 100644 --- a/integrationtests/metadata_processor/integrationtest_metadata_processor.py +++ b/integrationtests/metadata_processor/integrationtest_metadata_processor.py @@ -6,6 +6,7 @@ import yaml from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import SampleTrainingMetadata, TriggerTrainingMetadata +from modyn.metadata_database.utils import ModelStorageStrategyConfig # pylint: disable-next=no-name-in-module from modyn.metadata_processor.internal.grpc.generated.metadata_processor_pb2 import ( # noqa: E402, E501 @@ -49,7 +50,9 @@ def get_grpc_channel(config: dict, component: str) -> grpc.Channel: def send_metadata_and_check_database(processor_client: MetadataProcessorClient, config: dict) -> int: with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2) + pipeline_id = database.register_pipeline( + 2, "ResNet18", "{}", False, "{}", ModelStorageStrategyConfig("PyTorchFullModel") + ) req = TrainingMetadataRequest( pipeline_id=pipeline_id, diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index f1e7e64d1..e142788c1 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -1,12 +1,18 @@ # end-to-end testing of the model storage component +import io +import json +import logging import pathlib import shutil +from typing import Optional import grpc +import torch from integrationtests.utils import get_modyn_config -from modyn.common.ftp import delete_file, download_file, upload_file +from modyn.common.ftp import delete_file, download_trained_model, upload_file from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.models import Trigger +from modyn.metadata_database.models import Pipeline, Trigger +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( DeleteModelRequest, DeleteModelResponse, @@ -16,19 +22,23 @@ RegisterModelResponse, ) from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageStub -from modyn.utils import grpc_connection_established +from modyn.models import ResNet18 +from modyn.utils import calculate_checksum, grpc_connection_established TEST_MODELS_PATH = pathlib.Path("/app") / "model_storage" / "test_models" -TEST_FILE_NAME_LOCAL = "test_model_local.txt" -TEST_FILE_NAME_LOCAL_RESP = "test_model_local_response.txt" -TEST_FILE_NAME_REMOTE = "test_model_remote.txt" + +FILE_NAME_PARENT = "test_parent.modyn" +MODEL_PARENT = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + +FILE_NAME_CHILD = "test_child.modyn" +MODEL_CHILD =
ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) def create_dummy_file(): pathlib.Path(TEST_MODELS_PATH).mkdir(parents=True, exist_ok=True) - with open(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL, "w") as f: - f.write("Test model storage component") + for model, file_name in [(MODEL_PARENT, FILE_NAME_PARENT), (MODEL_CHILD, FILE_NAME_CHILD)]: + torch.save({"model": model.model.state_dict(), "metadata": True}, TEST_MODELS_PATH / file_name) def cleanup_models_dir() -> None: @@ -45,60 +55,96 @@ def connect_to_model_storage(config: dict) -> grpc.Channel: return model_storage_channel -def upload_dummy_file_to_trainer(config: dict): - upload_file( - config["trainer_server"]["hostname"], - int(config["trainer_server"]["ftp_port"]), - "modyn", - "modyn", - local_file_path=TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL, - remote_file_path=pathlib.Path(TEST_FILE_NAME_REMOTE), - ) - - -def delete_dummy_file_from_trainer(config: dict): - delete_file( - config["trainer_server"]["hostname"], - int(config["trainer_server"]["ftp_port"]), - "modyn", - "modyn", - pathlib.Path(TEST_FILE_NAME_REMOTE), - ) - - -def insert_trigger_into_database(config: dict) -> (int, int): - with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2, "{}") - - trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) - database.session.add(trigger) +def upload_dummy_files_to_trainer(config: dict): + for file_name in [FILE_NAME_PARENT, FILE_NAME_CHILD]: + upload_file( + config["trainer_server"]["hostname"], + int(config["trainer_server"]["ftp_port"]), + "modyn", + "modyn", + local_file_path=TEST_MODELS_PATH / file_name, + remote_file_path=pathlib.Path(file_name), + ) + + +def delete_dummy_files_from_trainer(config: dict): + for file_name in [FILE_NAME_PARENT, FILE_NAME_CHILD]: + delete_file( + config["trainer_server"]["hostname"], + int(config["trainer_server"]["ftp_port"]), + "modyn", + "modyn", + pathlib.Path(file_name), + ) + + +def insert_triggers_into_database( + modyn_config: dict, + full_strategy: ModelStorageStrategyConfig, + inc_strategy: Optional[ModelStorageStrategyConfig], + full_model_interval: Optional[int], +) -> tuple[int, int, int]: + with MetadataDatabaseConnection(modyn_config) as database: + pipeline_id = database.register_pipeline( + 2, + "ResNet18", + json.dumps({"num_classes": 10}), + False, + "{}", + full_strategy, + inc_strategy, + full_model_interval, + ) + + trigger_parent = Trigger(trigger_id=0, pipeline_id=pipeline_id) + trigger_child = Trigger(trigger_id=1, pipeline_id=pipeline_id) + database.session.add(trigger_parent) + database.session.add(trigger_child) database.session.commit() - return trigger.pipeline_id, trigger.trigger_id + return pipeline_id, trigger_parent.trigger_id, trigger_child.trigger_id -def delete_data_from_database(config: dict, pipeline_id: int, trigger_id: int): - with MetadataDatabaseConnection(config) as database: +def delete_data_from_database(modyn_config: dict, pipeline_id: int): + with MetadataDatabaseConnection(modyn_config) as database: database.session.query(Trigger).filter( - Trigger.pipeline_id == pipeline_id and Trigger.trigger_id == trigger_id + Trigger.pipeline_id == pipeline_id, ).delete() + database.session.query(Pipeline).filter(Pipeline.pipeline_id == pipeline_id).delete() database.session.commit() -def test_model_storage(config: dict): - # register pipeline and trigger - pipeline_id, trigger_id = insert_trigger_into_database(config) +def check_loaded_model(path: pathlib.Path, original_model_state: dict) -> None:
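+    # Load the downloaded checkpoint, restore it into a freshly constructed ResNet18, and compare every tensor of the restored state dict against the original weights (torch.allclose with a small tolerance). +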
with open(path, "rb") as state_file: + checkpoint = torch.load(io.BytesIO(state_file.read())) - model_storage_channel = connect_to_model_storage(config) - model_storage = ModelStorageStub(model_storage_channel) + assert "model" in checkpoint, "Model state is not stored in file" + resnet = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + resnet.model.load_state_dict(checkpoint["model"]) + + assert checkpoint["metadata"] + + loaded_state = resnet.model.state_dict() + for layer_name, _ in resnet.model.state_dict().items(): + assert torch.allclose(loaded_state[layer_name], original_model_state[layer_name], rtol=1e-04, atol=1e-05) - # try to register a new model in the model storage + +def download_and_check_model( + pipeline_id: int, + trigger_id: int, + modyn_config: dict, + model_storage: ModelStorageStub, + file_name: str, + original_model_state: dict, +) -> int: + # try to register a new model at model storage request_register = RegisterModelRequest( pipeline_id=pipeline_id, trigger_id=trigger_id, - hostname=config["trainer_server"]["hostname"], - port=int(config["trainer_server"]["ftp_port"]), - model_path=str(TEST_FILE_NAME_REMOTE), + hostname=modyn_config["trainer_server"]["hostname"], + port=int(modyn_config["trainer_server"]["ftp_port"]), + model_path=file_name, + checksum=calculate_checksum(TEST_MODELS_PATH / file_name), ) response_register: RegisterModelResponse = model_storage.RegisterModel(request_register) @@ -106,56 +152,123 @@ def test_model_storage(config: dict): model_id = response_register.model_id # try to fetch the registered model - request_fetch = FetchModelRequest(model_id=model_id) + request_fetch = FetchModelRequest(model_id=model_id, load_metadata=True) response_fetch: FetchModelResponse = model_storage.FetchModel(request_fetch) - model_path = pathlib.Path(response_fetch.model_path) assert response_fetch.success, "Could not find model with this id" # download the model (dummy file) from model storage - download_file( - config["model_storage"]["hostname"], - int(config["model_storage"]["ftp_port"]), - "modyn", - "modyn", - remote_file_path=model_path, - local_file_path=TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, + downloaded_path = download_trained_model( + logging.getLogger(__name__), + modyn_config["model_storage"], + remote_path=pathlib.Path(response_fetch.model_path), + checksum=response_fetch.checksum, + identifier=42, + base_directory=TEST_MODELS_PATH, + ) + + assert downloaded_path is not None + + # compare if content matches initial dummy file & delete it + check_loaded_model(downloaded_path, original_model_state) + downloaded_path.unlink() + + return model_id + + +def test_model_storage( + modyn_config: dict, + full_strategy: ModelStorageStrategyConfig, + inc_strategy: Optional[ModelStorageStrategyConfig], + full_model_interval: Optional[int], +): + # register pipeline and trigger + pipeline_id, parent_trigger, child_trigger = insert_triggers_into_database( + modyn_config, full_strategy, inc_strategy, full_model_interval + ) + + with MetadataDatabaseConnection(modyn_config) as database: + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) + + assert model_class_name == "ResNet18" + assert json.loads(model_config) == {"num_classes": 10} + assert not amp + + model_storage_channel = connect_to_model_storage(modyn_config) + model_storage = ModelStorageStub(model_storage_channel) + + parent_id = download_and_check_model( + pipeline_id, parent_trigger, modyn_config, model_storage, FILE_NAME_PARENT, 
MODEL_PARENT.model.state_dict() ) - # compare if content matches initial dummy file - with open(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, "r") as resp_file: - assert resp_file.read() == "Test model storage component", "File contents do not match" + child_id = download_and_check_model( + pipeline_id, child_trigger, modyn_config, model_storage, FILE_NAME_CHILD, MODEL_CHILD.model.state_dict() + ) + + if inc_strategy is not None: + # try to delete parent on model storage + request_delete = DeleteModelRequest(model_id=parent_id) + response_delete: DeleteModelResponse = model_storage.DeleteModel(request_delete) + + assert not response_delete.success - # delete model on model storage component - request_delete = DeleteModelRequest(model_id=model_id) + # delete child on model storage + request_delete = DeleteModelRequest(model_id=child_id) response_delete: DeleteModelResponse = model_storage.DeleteModel(request_delete) assert response_delete.success # fetch a (now) invalid model - request_invalid_fetch = FetchModelRequest(model_id=model_id) + request_invalid_fetch = FetchModelRequest(model_id=child_id) response_invalid_fetch: FetchModelResponse = model_storage.FetchModel(request_invalid_fetch) assert not response_invalid_fetch.success # delete a (now) invalid model - request_invalid_delete = DeleteModelRequest(model_id=model_id) + request_invalid_delete = DeleteModelRequest(model_id=child_id) response_invalid_delete: DeleteModelResponse = model_storage.DeleteModel(request_invalid_delete) assert not response_invalid_delete.success + # delete parent on model storage + request_delete = DeleteModelRequest(model_id=parent_id) + response_delete: DeleteModelResponse = model_storage.DeleteModel(request_delete) + + assert response_delete.success + # clean-up database - delete_data_from_database(config, pipeline_id, trigger_id) + delete_data_from_database(modyn_config, pipeline_id) def main() -> None: modyn_config = get_modyn_config() + + pytorch_full = ModelStorageStrategyConfig("PyTorchFullModel") + + compressed_full = ModelStorageStrategyConfig("BinaryFullModel") + compressed_full.zip = True + compressed_full.zip_algorithm = "ZIP_LZMA" + + sub_delta_inc = ModelStorageStrategyConfig("WeightsDifference") + sub_delta_inc.config = json.dumps({"operator": "sub"}) + + xor_full = ModelStorageStrategyConfig("WeightsDifference") + xor_full.zip = True + xor_full.config = json.dumps({"operator": "xor", "split_exponent": True, "rle": True}) + + policies = [ + (pytorch_full, None, None), + (compressed_full, sub_delta_inc, 5), + (pytorch_full, xor_full, 5), + ] try: create_dummy_file() - upload_dummy_file_to_trainer(modyn_config) - test_model_storage(modyn_config) + upload_dummy_files_to_trainer(modyn_config) + + for policy in policies: + test_model_storage(modyn_config, *policy) finally: - delete_dummy_file_from_trainer(modyn_config) + delete_dummy_files_from_trainer(modyn_config) cleanup_models_dir() diff --git a/integrationtests/selector/integrationtest_selector.py b/integrationtests/selector/integrationtest_selector.py index c88bbed83..5d8410dd9 100644 --- a/integrationtests/selector/integrationtest_selector.py +++ b/integrationtests/selector/integrationtest_selector.py @@ -8,8 +8,10 @@ GetNumberOfPartitionsRequest, GetSamplesRequest, JsonString, + ModelStoragePolicyInfo, RegisterPipelineRequest, SamplesResponse, + StrategyConfig, ) from modyn.selector.internal.grpc.generated.selector_pb2_grpc import SelectorStub from modyn.utils import grpc_connection_established @@ -29,6 +31,10 @@ def 
connect_to_selector_servicer() -> grpc.Channel: return selector_channel +def get_model_storage_policy() -> ModelStoragePolicyInfo: + return ModelStoragePolicyInfo(full_model_strategy_config=StrategyConfig(name="PyTorchFullModel")) + + def test_label_balanced_presampling_huge() -> None: selector_channel = connect_to_selector_servicer() selector = SelectorStub(selector_channel) @@ -44,7 +50,14 @@ def test_label_balanced_presampling_huge() -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id trigger_id = selector.inform_data_and_trigger( @@ -124,7 +137,14 @@ def test_label_balanced_force_same_size(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id # now we just have 2 classes with 4 samples each @@ -208,7 +228,14 @@ def test_label_balanced_force_all_samples(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id # same classes as before @@ -298,7 +325,14 @@ def test_newdata() -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id selector.inform_data( @@ -437,7 +471,14 @@ def test_abstract_downsampler(reset_after_trigger) -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id selector.inform_data( @@ -586,7 +627,14 @@ def test_empty_triggers() -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id selector.inform_data( @@ -754,7 +802,14 @@ def test_many_samples_evenly_distributed(): } 
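# As in the previous tests, pipeline registration now also carries the model class name, its configuration, the amp flag, and a model storage policy (the plain PyTorchFullModel strategy built by get_model_storage_policy()).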
pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id selector.inform_data( @@ -824,7 +879,14 @@ def test_many_samples_unevenly_distributed(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id selector.inform_data( @@ -895,7 +957,14 @@ def test_get_available_labels(reset_after_trigger: bool): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_class_name="ResNet10", + model_configuration=JsonString(value="{}"), + amp=False, + model_storage_policy=get_model_storage_policy(), + ) ).pipeline_id selector.inform_data( diff --git a/modyn/common/ftp/__init__.py b/modyn/common/ftp/__init__.py index d9f091f01..de047c5ea 100644 --- a/modyn/common/ftp/__init__.py +++ b/modyn/common/ftp/__init__.py @@ -1,7 +1,13 @@ import os from .ftp_server import FTPServer # noqa: F401 -from .ftp_utils import delete_file, download_file, get_pretrained_model_callback, upload_file # noqa: F401 +from .ftp_utils import ( # noqa: F401 + delete_file, + download_file, + download_trained_model, + get_pretrained_model_callback, + upload_file, +) files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") diff --git a/modyn/common/ftp/ftp_utils.py b/modyn/common/ftp/ftp_utils.py index 18afc8487..5e675752f 100644 --- a/modyn/common/ftp/ftp_utils.py +++ b/modyn/common/ftp/ftp_utils.py @@ -1,10 +1,11 @@ # Utils file containing functions in order to simplify FTP server interactions. +import logging import pathlib from ftplib import FTP from logging import Logger from typing import Any, Callable, Optional -from modyn.utils import EMIT_MESSAGE_PERCENTAGES +from modyn.utils import EMIT_MESSAGE_PERCENTAGES, calculate_checksum def download_file( @@ -15,7 +16,8 @@ def download_file( remote_file_path: pathlib.Path, local_file_path: pathlib.Path, callback: Optional[Callable[[float], None]] = None, -) -> None: + checksum: Optional[bytes] = None, +) -> bool: """Downloads a file from a given host to the local filesystem. If the file already exists, it gets overwritten. Args: @@ -26,9 +28,9 @@ def download_file( remote_file_path: path to the remote file. local_file_path: local path to the file. callback(float): function called every block of data with the current progress in [0, 1]. - + checksum: the expected hash of the file. Returns: - + bool: whether the file was successfully downloaded. 
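+ + Example (illustrative sketch; the host, port, and checksum value are placeholders): + ok = download_file("localhost", 2121, "modyn", "modyn", + remote_file_path=pathlib.Path("model.modyn"), + local_file_path=pathlib.Path("/tmp/model.modyn"), + checksum=expected_checksum)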
""" ftp = FTP() ftp.connect(hostname, port, timeout=3) @@ -54,6 +56,11 @@ def write_callback(data: Any) -> None: ftp.close() + if checksum: + local_hash = calculate_checksum(local_file_path) + return local_hash == checksum + return True + def upload_file( hostname: str, port: int, user: str, password: str, local_file_path: pathlib.Path, remote_file_path: pathlib.Path @@ -121,3 +128,41 @@ def download_callback(current_progress: float) -> None: last_progress = current_progress return download_callback + + +def download_trained_model( + logger: logging.Logger, + model_storage_config: dict, + remote_path: pathlib.Path, + checksum: bytes, + identifier: int, + base_directory: pathlib.Path, +) -> Optional[pathlib.Path]: + model_path = base_directory / f"trained_model_{identifier}.modyn" + + success = download_file( + hostname=model_storage_config["hostname"], + port=int(model_storage_config["ftp_port"]), + user="modyn", + password="modyn", + remote_file_path=remote_path, + local_file_path=model_path, + callback=get_pretrained_model_callback(logger), + checksum=checksum, + ) + + if not success: + logger.error("Checksums did not match, evaluation cannot be started.") + return None + + delete_file( + hostname=model_storage_config["hostname"], + port=int(model_storage_config["ftp_port"]), + user="modyn", + password="modyn", + remote_file_path=pathlib.Path(remote_path), + ) + + logger.info(f"Successfully downloaded trained model to {model_path}.") + + return model_path diff --git a/modyn/config/examples/example-pipeline.yaml b/modyn/config/examples/example-pipeline.yaml index 10e194c86..66e664093 100644 --- a/modyn/config/examples/example-pipeline.yaml +++ b/modyn/config/examples/example-pipeline.yaml @@ -6,6 +6,16 @@ model: id: ResNet18 config: num_classes: 10 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" + incremental_model_strategy: + name: "WeightsDifference" + zip: True + zip_algorithm: ZIP_DEFLATED + config: + operator: xor + full_model_interval: 10 training: gpus: 1 device: "cpu" @@ -63,7 +73,6 @@ trigger: data_points_for_trigger: 100 evaluation: device: "cpu" - amp: False result_writers: ["json", "tensorboard"] datasets: - dataset_id: mnist diff --git a/modyn/config/examples/modyn_config.yaml b/modyn/config/examples/modyn_config.yaml index b2e80a1f1..1177384aa 100644 --- a/modyn/config/examples/modyn_config.yaml +++ b/modyn/config/examples/modyn_config.yaml @@ -173,6 +173,7 @@ model_storage: hostname: "model_storage" port: "50059" ftp_port: "50060" + models_directory: "/tmp/models" evaluator: hostname: "evaluator" diff --git a/modyn/config/schema/modyn_config_schema.yaml b/modyn/config/schema/modyn_config_schema.yaml index a5e9d4497..5227ad24b 100644 --- a/modyn/config/schema/modyn_config_schema.yaml +++ b/modyn/config/schema/modyn_config_schema.yaml @@ -214,6 +214,10 @@ properties: type: string description: | The port of the FDP server used by the model_storage component. + models_directory: + type: string + description: | + The directory where we store the trained models. 
required: - hostname - port @@ -363,6 +367,8 @@ properties: required: - project - storage + - evaluator + - model_storage - metadata_database - selector - trainer_server \ No newline at end of file diff --git a/modyn/config/schema/pipeline-schema.yaml b/modyn/config/schema/pipeline-schema.yaml index 8bbdcf792..42d77713e 100644 --- a/modyn/config/schema/pipeline-schema.yaml +++ b/modyn/config/schema/pipeline-schema.yaml @@ -37,6 +37,61 @@ properties: Configuration dictionary that will be passed to the model on initialization. required: - id + model_storage: + type: object + properties: + full_model_strategy: + type: object + description: | + Which full model strategy is used. + properties: + name: + type: string + description: | + Name of the full model strategy. We currently support PyTorchFullModel and BinaryFullModel. + config: + type: object + description: | + Configuration dictionary that will be passed to the strategy. + zip: + type: boolean + description: | + Whether to zip the file in the end. Defaults to False. + zip_algorithm: + type: string + description: | + Which zip algorithm to use. Default is ZIP_DEFLATED. + required: + - name + incremental_model_strategy: + type: object + description: | + Which incremental model strategy is used. + properties: + name: + type: string + description: | + Name of the incremental model strategy. We currently support WeightsDifference. + config: + type: object + description: | + Configuration dictionary that will be passed to the strategy. + zip: + type: boolean + description: | + Whether to zip the file in the end. Defaults to False. + zip_algorithm: + type: string + description: | + Which zip algorithm to use. Default is ZIP_DEFLATED. + full_model_interval: + type: number + description: | + The interval at which the full model strategy is applied (every n-th model is stored in full). + required: + - name + required: + - full_model_strategy training: type: object properties: @@ -365,10 +420,6 @@ properties: description: | The device the model should be put on. In the future (#131), we might want this to be either "cpu" or "gpu" and let the evaluator figure out the exact device, but for now, this really is the identifier of the device. - amp: - type: boolean - description: | - If True, automatic mixed precision will be used.
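+      # Note: amp is no longer configured per evaluation; the evaluator now reads + # the amp flag from the pipeline's model configuration stored in the metadata database.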
result_writers: type: array description: | @@ -452,6 +503,7 @@ properties: required: - pipeline - model + - model_storage - training - data - trigger diff --git a/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py b/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py index fabe505af..1ffc517d2 100644 --- a/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py +++ b/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py @@ -8,7 +8,7 @@ from typing import Any, Optional import grpc -from modyn.common.ftp import download_file, get_pretrained_model_callback +from modyn.common.ftp import download_trained_model # pylint: disable-next=no-name-in-module from modyn.evaluator.internal.grpc.generated.evaluator_pb2 import ( @@ -26,6 +26,8 @@ from modyn.evaluator.internal.metrics import AbstractEvaluationMetric from modyn.evaluator.internal.pytorch_evaluator import evaluate from modyn.evaluator.internal.utils import EvaluationInfo, EvaluationProcessInfo, EvaluatorMessages +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import TrainedModel # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest, FetchModelResponse @@ -87,19 +89,28 @@ def connect_to_storage(storage_address: str) -> StorageStub: raise ConnectionError(f"Could not establish gRPC connection to storage at address {storage_address}.") return StorageStub(storage_channel) + # pylint: disable=too-many-locals def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerContext) -> EvaluateModelResponse: logger.info("Received evaluate model request.") - if not hasattr(dynamic_module_import("modyn.models"), request.model_id): - logger.error(f"Model {request.model_id} not available!") + with MetadataDatabaseConnection(self._config) as database: + trained_model: Optional[TrainedModel] = database.session.get(TrainedModel, request.model_id) + + if not trained_model: + logger.error(f"Trained model {request.model_id} does not exist!") + return EvaluateModelResponse(evaluation_started=False) + model_class_name, model_config, amp = database.get_model_configuration(trained_model.pipeline_id) + + if not hasattr(dynamic_module_import("modyn.models"), model_class_name): + logger.error(f"Model {model_class_name} not available!") return EvaluateModelResponse(evaluation_started=False) - fetch_request = FetchModelRequest(model_id=request.trained_model_id) + fetch_request = FetchModelRequest(model_id=request.model_id, load_metadata=False) fetch_resp: FetchModelResponse = self._model_storage_stub.FetchModel(fetch_request) if not fetch_resp.success: logger.error( - f"Trained model {request.trained_model_id} cannot be fetched from model storage. " + f"Trained model {request.model_id} cannot be fetched from model storage. " f"Evaluation cannot be started." 
) return EvaluateModelResponse(evaluation_started=False) @@ -118,10 +129,29 @@ def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerCo evaluation_id = self._next_evaluation_id self._next_evaluation_id += 1 - local_model_path = self._download_trained_model(fetch_resp, evaluation_id) + trained_model_path = download_trained_model( + logger=logger, + model_storage_config=self._config["model_storage"], + remote_path=pathlib.Path(fetch_resp.model_path), + checksum=fetch_resp.checksum, + identifier=evaluation_id, + base_directory=self._base_dir, + ) + + if not trained_model_path: + return EvaluateModelResponse(evaluation_started=False) metrics = self._setup_metrics(request.metrics) - evaluation_info = EvaluationInfo(request, evaluation_id, self._storage_address, metrics, local_model_path) + evaluation_info = EvaluationInfo( + request, + evaluation_id, + model_class_name, + model_config, + amp, + self._storage_address, + metrics, + trained_model_path, + ) self._evaluation_dict[evaluation_id] = evaluation_info self._run_evaluation(evaluation_id) @@ -130,23 +160,6 @@ def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerCo evaluation_started=True, evaluation_id=evaluation_id, dataset_size=dataset_size_response.num_keys ) - def _download_trained_model(self, fetch_resp: FetchModelResponse, evaluation_id: int) -> pathlib.Path: - local_model_path = self._base_dir / f"trained_model_{evaluation_id}.modyn" - - download_file( - hostname=self._config["model_storage"]["hostname"], - port=int(self._config["model_storage"]["ftp_port"]), - user="modyn", - password="modyn", - remote_file_path=pathlib.Path(fetch_resp.model_path), - local_file_path=local_model_path, - callback=get_pretrained_model_callback(logger), - ) - - logger.info(f"Successfully downloaded trained model to {local_model_path}.") - - return local_model_path - @staticmethod def _setup_metrics(metric_configurations: list[MetricConfiguration]) -> list[AbstractEvaluationMetric]: metrics = [] diff --git a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py index 35f6f4eef..1b4ea738a 100644 --- a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py +++ b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65valuator.proto\x12\x0fmodyn.evaluator\":\n\x0b\x44\x61tasetInfo\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x8f\x01\n\x13MetricConfiguration\x12\x0c\n\x04name\x18\x01 \x01(\t\x12+\n\x06\x63onfig\x18\x02 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12=\n\x16\x65valuation_transformer\x18\x03 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"\x9f\x03\n\x14\x45valuateModelRequest\x12\x18\n\x10trained_model_id\x18\x01 \x01(\x05\x12\x32\n\x0c\x64\x61taset_info\x18\x02 \x01(\x0b\x32\x1c.modyn.evaluator.DatasetInfo\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x12\n\nbatch_size\x18\x05 \x01(\x05\x12\x35\n\x07metrics\x18\x06 \x03(\x0b\x32$.modyn.evaluator.MetricConfiguration\x12\x10\n\x08model_id\x18\x07 \x01(\t\x12\x38\n\x13model_configuration\x18\x08 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12\x16\n\x0etransform_list\x18\t \x03(\t\x12\x33\n\x0c\x62ytes_parser\x18\n \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\x12\x38\n\x11label_transformer\x18\x0b 
\x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"`\n\x15\x45valuateModelResponse\x12\x1a\n\x12\x65valuation_started\x18\x01 \x01(\x08\x12\x15\n\revaluation_id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64\x61taset_size\x18\x03 \x01(\x03\"0\n\x17\x45valuationStatusRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"\xe5\x01\n\x18\x45valuationStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x17\n\x0fstate_available\x18\x03 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x04 \x01(\x08\x12\x16\n\texception\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x06 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x07 \x01(\x03H\x02\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seen\"0\n\x0e\x45valuationData\x12\x0e\n\x06metric\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x02\"0\n\x17\x45valuationResultRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"c\n\x18\x45valuationResultResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x38\n\x0f\x65valuation_data\x18\x02 \x03(\x0b\x32\x1f.modyn.evaluator.EvaluationData2\xce\x02\n\tEvaluator\x12\x61\n\x0e\x65valuate_model\x12%.modyn.evaluator.EvaluateModelRequest\x1a&.modyn.evaluator.EvaluateModelResponse\"\x00\x12n\n\x15get_evaluation_status\x12(.modyn.evaluator.EvaluationStatusRequest\x1a).modyn.evaluator.EvaluationStatusResponse\"\x00\x12n\n\x15get_evaluation_result\x12(.modyn.evaluator.EvaluationResultRequest\x1a).modyn.evaluator.EvaluationResultResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65valuator.proto\x12\x0fmodyn.evaluator\":\n\x0b\x44\x61tasetInfo\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x8f\x01\n\x13MetricConfiguration\x12\x0c\n\x04name\x18\x01 \x01(\t\x12+\n\x06\x63onfig\x18\x02 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12=\n\x16\x65valuation_transformer\x18\x03 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"\xbe\x02\n\x14\x45valuateModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\x12\x32\n\x0c\x64\x61taset_info\x18\x02 \x01(\x0b\x32\x1c.modyn.evaluator.DatasetInfo\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\x05\x12\x35\n\x07metrics\x18\x05 \x03(\x0b\x32$.modyn.evaluator.MetricConfiguration\x12\x16\n\x0etransform_list\x18\x06 \x03(\t\x12\x33\n\x0c\x62ytes_parser\x18\x07 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\x12\x38\n\x11label_transformer\x18\x08 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"`\n\x15\x45valuateModelResponse\x12\x1a\n\x12\x65valuation_started\x18\x01 \x01(\x08\x12\x15\n\revaluation_id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64\x61taset_size\x18\x03 \x01(\x03\"0\n\x17\x45valuationStatusRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"\xe5\x01\n\x18\x45valuationStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x17\n\x0fstate_available\x18\x03 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x04 \x01(\x08\x12\x16\n\texception\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x06 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x07 \x01(\x03H\x02\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seen\"0\n\x0e\x45valuationData\x12\x0e\n\x06metric\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x02\"0\n\x17\x45valuationResultRequest\x12\x15\n\revaluation_id\x18\x01 
\x01(\x05\"c\n\x18\x45valuationResultResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x38\n\x0f\x65valuation_data\x18\x02 \x03(\x0b\x32\x1f.modyn.evaluator.EvaluationData2\xce\x02\n\tEvaluator\x12\x61\n\x0e\x65valuate_model\x12%.modyn.evaluator.EvaluateModelRequest\x1a&.modyn.evaluator.EvaluateModelResponse\"\x00\x12n\n\x15get_evaluation_status\x12(.modyn.evaluator.EvaluationStatusRequest\x1a).modyn.evaluator.EvaluationStatusResponse\"\x00\x12n\n\x15get_evaluation_result\x12(.modyn.evaluator.EvaluationResultRequest\x1a).modyn.evaluator.EvaluationResultResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'evaluator_pb2', globals()) @@ -30,19 +30,19 @@ _METRICCONFIGURATION._serialized_start=157 _METRICCONFIGURATION._serialized_end=300 _EVALUATEMODELREQUEST._serialized_start=303 - _EVALUATEMODELREQUEST._serialized_end=718 - _EVALUATEMODELRESPONSE._serialized_start=720 - _EVALUATEMODELRESPONSE._serialized_end=816 - _EVALUATIONSTATUSREQUEST._serialized_start=818 - _EVALUATIONSTATUSREQUEST._serialized_end=866 - _EVALUATIONSTATUSRESPONSE._serialized_start=869 - _EVALUATIONSTATUSRESPONSE._serialized_end=1098 - _EVALUATIONDATA._serialized_start=1100 - _EVALUATIONDATA._serialized_end=1148 - _EVALUATIONRESULTREQUEST._serialized_start=1150 - _EVALUATIONRESULTREQUEST._serialized_end=1198 - _EVALUATIONRESULTRESPONSE._serialized_start=1200 - _EVALUATIONRESULTRESPONSE._serialized_end=1299 - _EVALUATOR._serialized_start=1302 - _EVALUATOR._serialized_end=1636 + _EVALUATEMODELREQUEST._serialized_end=621 + _EVALUATEMODELRESPONSE._serialized_start=623 + _EVALUATEMODELRESPONSE._serialized_end=719 + _EVALUATIONSTATUSREQUEST._serialized_start=721 + _EVALUATIONSTATUSREQUEST._serialized_end=769 + _EVALUATIONSTATUSRESPONSE._serialized_start=772 + _EVALUATIONSTATUSRESPONSE._serialized_end=1001 + _EVALUATIONDATA._serialized_start=1003 + _EVALUATIONDATA._serialized_end=1051 + _EVALUATIONRESULTREQUEST._serialized_start=1053 + _EVALUATIONRESULTREQUEST._serialized_end=1101 + _EVALUATIONRESULTRESPONSE._serialized_start=1103 + _EVALUATIONRESULTRESPONSE._serialized_end=1202 + _EVALUATOR._serialized_start=1205 + _EVALUATOR._serialized_end=1539 # @@protoc_insertion_point(module_scope) diff --git a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi index efb9d14c2..b6b1803f7 100644 --- a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi +++ b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi @@ -93,28 +93,21 @@ global___MetricConfiguration = MetricConfiguration class EvaluateModelRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - TRAINED_MODEL_ID_FIELD_NUMBER: builtins.int + MODEL_ID_FIELD_NUMBER: builtins.int DATASET_INFO_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int - AMP_FIELD_NUMBER: builtins.int BATCH_SIZE_FIELD_NUMBER: builtins.int METRICS_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: builtins.int - MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int TRANSFORM_LIST_FIELD_NUMBER: builtins.int BYTES_PARSER_FIELD_NUMBER: builtins.int LABEL_TRANSFORMER_FIELD_NUMBER: builtins.int - trained_model_id: builtins.int + model_id: builtins.int @property def dataset_info(self) -> global___DatasetInfo: ... device: builtins.str - amp: builtins.bool batch_size: builtins.int @property def metrics(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___MetricConfiguration]: ... 
- model_id: builtins.str - @property - def model_configuration(self) -> global___JsonString: ... @property def transform_list(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... @property @@ -124,20 +117,17 @@ class EvaluateModelRequest(google.protobuf.message.Message): def __init__( self, *, - trained_model_id: builtins.int = ..., + model_id: builtins.int = ..., dataset_info: global___DatasetInfo | None = ..., device: builtins.str = ..., - amp: builtins.bool = ..., batch_size: builtins.int = ..., metrics: collections.abc.Iterable[global___MetricConfiguration] | None = ..., - model_id: builtins.str = ..., - model_configuration: global___JsonString | None = ..., transform_list: collections.abc.Iterable[builtins.str] | None = ..., bytes_parser: global___PythonString | None = ..., label_transformer: global___PythonString | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "label_transformer", b"label_transformer", "model_configuration", b"model_configuration"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "device", b"device", "label_transformer", b"label_transformer", "metrics", b"metrics", "model_configuration", b"model_configuration", "model_id", b"model_id", "trained_model_id", b"trained_model_id", "transform_list", b"transform_list"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "label_transformer", b"label_transformer"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "device", b"device", "label_transformer", b"label_transformer", "metrics", b"metrics", "model_id", b"model_id", "transform_list", b"transform_list"]) -> None: ... 
global___EvaluateModelRequest = EvaluateModelRequest diff --git a/modyn/evaluator/internal/pytorch_evaluator.py b/modyn/evaluator/internal/pytorch_evaluator.py index 68dd6b4d9..7115966d3 100644 --- a/modyn/evaluator/internal/pytorch_evaluator.py +++ b/modyn/evaluator/internal/pytorch_evaluator.py @@ -89,7 +89,7 @@ def _load_state(self, path: pathlib.Path) -> None: self._model.model.load_state_dict(checkpoint["model"]) # delete trained model from disk - os.remove(path) + path.unlink() def send_status_to_server(self, batch_number: int) -> None: self._status_response_queue.put({"num_batches": batch_number, "num_samples": self._num_samples}) diff --git a/modyn/evaluator/internal/utils/evaluation_info.py b/modyn/evaluator/internal/utils/evaluation_info.py index 2ab27459b..a7c804d8e 100644 --- a/modyn/evaluator/internal/utils/evaluation_info.py +++ b/modyn/evaluator/internal/utils/evaluation_info.py @@ -17,23 +17,26 @@ def __init__( self, request: EvaluateModelRequest, evaluation_id: int, + model_class_name: str, + model_config: str, + amp: bool, storage_address: str, metrics: list[AbstractEvaluationMetric], model_path: pathlib.Path, ) -> None: - self.trained_model_id = request.trained_model_id + self.model_id = request.model_id self.dataset_id = request.dataset_info.dataset_id self.num_dataloaders = request.dataset_info.num_dataloaders self.device = request.device - self.amp = request.amp + self.amp = amp self.batch_size = request.batch_size self.metrics = metrics - self.model_id = request.model_id + self.model_class_name = model_class_name model_module = dynamic_module_import("modyn.models") - self.model_handler = getattr(model_module, self.model_id) - self.model_configuration_dict = json.loads(request.model_configuration.value) + self.model_handler = getattr(model_module, self.model_class_name) + self.model_configuration_dict = json.loads(model_config) self.transform_list = list(request.transform_list) self.bytes_parser = request.bytes_parser.value diff --git a/modyn/metadata_database/metadata_database_connection.py b/modyn/metadata_database/metadata_database_connection.py index 311a8c3f0..fd1d620f3 100644 --- a/modyn/metadata_database/metadata_database_connection.py +++ b/modyn/metadata_database/metadata_database_connection.py @@ -3,12 +3,14 @@ from __future__ import annotations import logging +from typing import Optional from modyn.database.abstract_database_connection import AbstractDatabaseConnection from modyn.metadata_database.metadata_base import MetadataBase from modyn.metadata_database.models import Pipeline from modyn.metadata_database.models.selector_state_metadata import SelectorStateMetadata from modyn.metadata_database.models.trained_models import TrainedModel +from modyn.metadata_database.utils import ModelStorageStrategyConfig from sqlalchemy import func logger = logging.getLogger(__name__) @@ -67,17 +69,50 @@ def create_tables(self) -> None: """ MetadataBase.metadata.create_all(self.engine) - def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: + def register_pipeline( + self, + num_workers: int, + model_class_name: str, + model_config: str, + amp: bool, + selection_strategy: str, + full_model_strategy: ModelStorageStrategyConfig, + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None, + full_model_interval: Optional[int] = None, + ) -> int: """Register a new pipeline in the database. Args: num_workers (int): Number of workers in the pipeline. + model_class_name (str): the model class name that is used by the pipeline. 
+ model_config (str): the serialized model configuration options. + amp (bool): whether amp is enabled for the model. selection_strategy (str): The selection strategy to use - + full_model_strategy: the strategy used to store full models. + incremental_model_strategy: the (optional) strategy used to store models incrementally. + full_model_interval: the (optional) interval at which the full model strategy is applied. If not set, + the first model is stored according to the full model strategy, and all remaining + models by using the incremental model strategy. Returns: int: Id of the newly created pipeline. """ - pipeline = Pipeline(num_workers=num_workers, selection_strategy=selection_strategy) + pipeline = Pipeline( + num_workers=num_workers, + model_class_name=model_class_name, + model_config=model_config, + amp=amp, + selection_strategy=selection_strategy, + full_model_strategy_name=full_model_strategy.name, + full_model_strategy_zip=full_model_strategy.zip, + full_model_strategy_zip_algorithm=full_model_strategy.zip_algorithm, + full_model_strategy_config=full_model_strategy.config, + ) + if incremental_model_strategy: + pipeline.inc_model_strategy_name = incremental_model_strategy.name + pipeline.inc_model_strategy_zip = incremental_model_strategy.zip + pipeline.inc_model_strategy_zip_algorithm = incremental_model_strategy.zip_algorithm + pipeline.inc_model_strategy_config = incremental_model_strategy.config + pipeline.full_model_interval = full_model_interval self.session.add(pipeline) self.session.commit() pipeline_id = pipeline.pipeline_id @@ -96,19 +131,46 @@ def add_selector_state_metadata_trigger(self, pipeline_id: int, trigger_id: int) pipeline_id, trigger_id, self.session, self.engine, self.hash_partition_modulus ) - def add_trained_model(self, pipeline_id: int, trigger_id: int, model_path: str) -> int: - """Add a trained model to the database. + def add_trained_model( + self, + pipeline_id: int, + trigger_id: int, + model_path: str, + metadata_path: str, + parent_model: Optional[int] = None, + ) -> int: + """Add a trained model to the database. If the parent model is not specified, the model is expected to be + fully stored, i.e., by applying a full model strategy. Args: pipeline_id: id of the pipeline it was created from. trigger_id: id of the trigger that created it. model_path: path on the local filesystem on which the model is stored. - + metadata_path: the path on the local filesystem where model metadata is stored. + parent_model: (optional) id of the parent model. Returns: int: Id of the registered model """ - trained_model = TrainedModel(pipeline_id=pipeline_id, trigger_id=trigger_id, model_path=model_path) + trained_model = TrainedModel( + pipeline_id=pipeline_id, + trigger_id=trigger_id, + model_path=model_path, + metadata_path=metadata_path, + parent_model=parent_model, + ) self.session.add(trained_model) self.session.commit() model_id = trained_model.model_id return model_id + + def get_model_configuration(self, pipeline_id: int) -> tuple[str, str, bool]: + """Get the model class name and its configuration options for a given pipeline. + + Args: + pipeline_id: id of the pipeline from which we want to extract the model. + + Returns: + (str, str, bool): the model class name, its configuration options, and whether amp is enabled.
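+ + For example, a pipeline registered with ("ResNet18", '{"num_classes": 10}', amp=False) + yields exactly this triple.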
+ """ + pipeline: Pipeline = self.session.query(Pipeline).get(pipeline_id) + return pipeline.model_class_name, pipeline.model_config, pipeline.amp diff --git a/modyn/metadata_database/models/pipelines.py b/modyn/metadata_database/models/pipelines.py index cd8370c7e..8683f115e 100644 --- a/modyn/metadata_database/models/pipelines.py +++ b/modyn/metadata_database/models/pipelines.py @@ -1,7 +1,7 @@ """Pipeline model.""" from modyn.metadata_database.metadata_base import MetadataBase -from sqlalchemy import Column, Integer, Text +from sqlalchemy import Boolean, Column, Integer, String, Text class Pipeline(MetadataBase): @@ -13,6 +13,22 @@ class Pipeline(MetadataBase): pipeline_id = Column("pipeline_id", Integer, primary_key=True) num_workers = Column("num_workers", Integer, nullable=False) selection_strategy = Column("selection_strategy", Text, nullable=False) + model_class_name = Column("model_class_name", String(length=50), nullable=False) + model_config = Column("model_config", String(length=500), nullable=False) + amp = Column("amp", Boolean, nullable=False) + full_model_strategy_name = Column("full_model_strategy_name", String(length=50), nullable=False) + full_model_strategy_zip = Column("full_model_strategy_zip", Boolean, default=False) + full_model_strategy_zip_algorithm = Column( + "full_model_strategy_zip_algorithm", String(length=50), default=None, nullable=True + ) + full_model_strategy_config = Column("full_model_strategy_config", String(length=500), default=None, nullable=True) + inc_model_strategy_name = Column("inc_model_strategy_name", String(length=50), default=None, nullable=True) + inc_model_strategy_zip = Column("inc_model_strategy_zip", Boolean, default=False) + inc_model_strategy_zip_algorithm = Column( + "inc_model_strategy_zip_algorithm", String(length=50), default=None, nullable=True + ) + inc_model_strategy_config = Column("inc_model_strategy_config", String(length=500), default=None, nullable=True) + full_model_interval = Column("full_model_interval", Integer, default=None, nullable=True) def __repr__(self) -> str: """Return string representation.""" diff --git a/modyn/metadata_database/models/trained_models.py b/modyn/metadata_database/models/trained_models.py index 27484c416..c2da41b22 100644 --- a/modyn/metadata_database/models/trained_models.py +++ b/modyn/metadata_database/models/trained_models.py @@ -3,7 +3,8 @@ from modyn.metadata_database.metadata_base import MetadataBase from modyn.metadata_database.models.triggers import Trigger -from sqlalchemy import TIMESTAMP, Column, ForeignKeyConstraint, Integer, String +from sqlalchemy import TIMESTAMP, Column, ForeignKey, ForeignKeyConstraint, Integer, String +from sqlalchemy.orm import relationship class TrainedModel(MetadataBase): @@ -16,6 +17,9 @@ class TrainedModel(MetadataBase): trigger_id = Column("trigger_id", Integer) timestamp = Column("timestamp", TIMESTAMP(timezone=False), default=datetime.now()) model_path = Column("model_path", String(length=200), nullable=False) + metadata_path = Column("metadata_path", String(length=200), nullable=False) + parent_model = Column("parent_model", Integer, ForeignKey(f"{__tablename__}.model_id"), nullable=True, default=None) + children = relationship("TrainedModel") __table_args__ = ( ForeignKeyConstraint([pipeline_id, trigger_id], [Trigger.pipeline_id, Trigger.trigger_id]), {"extend_existing": True}, diff --git a/modyn/metadata_database/utils/__init__.py b/modyn/metadata_database/utils/__init__.py new file mode 100644 index 000000000..45e3fbcf4 --- /dev/null +++ 
b/modyn/metadata_database/utils/__init__.py @@ -0,0 +1,12 @@ +"""This package contains utility classes for the metadata database module. + +The utilities are used, for example, to describe model storage strategies +when registering a pipeline in the metadata database. +""" +import os + +from .model_storage_strategy_config import ModelStorageStrategyConfig # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/metadata_database/utils/model_storage_strategy_config.py b/modyn/metadata_database/utils/model_storage_strategy_config.py new file mode 100644 index 000000000..f109ab978 --- /dev/null +++ b/modyn/metadata_database/utils/model_storage_strategy_config.py @@ -0,0 +1,32 @@ +from dataclasses import dataclass +from typing import Optional + +# pylint: disable=no-name-in-module +from modyn.selector.internal.grpc.generated.selector_pb2 import StrategyConfig + + +@dataclass +class ModelStorageStrategyConfig: + """ + This class holds all information of a generic model storage strategy. + It is used to insert a given strategy into the metadata database. + """ + + name: str + zip: bool = False + zip_algorithm: Optional[str] = None + config: Optional[str] = None + + def __init__(self, name: str): + self.name = name + + @classmethod + def from_config(cls, strategy_config: StrategyConfig): # type: ignore[no-untyped-def] + strategy = cls(strategy_config.name) + if strategy_config.HasField("zip") and strategy_config.zip is not None: + strategy.zip = strategy_config.zip + if strategy_config.HasField("zip_algorithm") and strategy_config.zip_algorithm is not None: + strategy.zip_algorithm = strategy_config.zip_algorithm + if strategy_config.HasField("config") and strategy_config.config is not None: + strategy.config = strategy_config.config.value + return strategy diff --git a/modyn/model_storage/internal/__init__.py b/modyn/model_storage/internal/__init__.py index 3dc30aa40..005c390c4 100644 --- a/modyn/model_storage/internal/__init__.py +++ b/modyn/model_storage/internal/__init__.py @@ -6,6 +6,8 @@ import os +from .model_storage_manager import ModelStorageManager # noqa: F401 + files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") __all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py index 439e4cee3..8a4083551 100644 --- a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py +++ b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py @@ -14,25 +14,25 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13model_storage.proto\x12\x13modyn.model_storage\"s\n\x14RegisterModelRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x0c\n\x04port\x18\x04 \x01(\x05\x12\x12\n\nmodel_path\x18\x05 \x01(\t\":\n\x15RegisterModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\"%\n\x11\x46\x65tchModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\"9\n\x12\x46\x65tchModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t\"&\n\x12\x44\x65leteModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\"&\n\x13\x44\x65leteModelResponse\x12\x0f\n\x07success\x18\x01
\x01(\x08\x32\xbd\x02\n\x0cModelStorage\x12h\n\rRegisterModel\x12).modyn.model_storage.RegisterModelRequest\x1a*.modyn.model_storage.RegisterModelResponse\"\x00\x12_\n\nFetchModel\x12&.modyn.model_storage.FetchModelRequest\x1a\'.modyn.model_storage.FetchModelResponse\"\x00\x12\x62\n\x0b\x44\x65leteModel\x12\'.modyn.model_storage.DeleteModelRequest\x1a(.modyn.model_storage.DeleteModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13model_storage.proto\x12\x13modyn.model_storage\"\x85\x01\n\x14RegisterModelRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x0c\n\x04port\x18\x04 \x01(\x05\x12\x12\n\nmodel_path\x18\x05 \x01(\t\x12\x10\n\x08\x63hecksum\x18\x06 \x01(\x0c\":\n\x15RegisterModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\"<\n\x11\x46\x65tchModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\x12\x15\n\rload_metadata\x18\x02 \x01(\x08\"K\n\x12\x46\x65tchModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t\x12\x10\n\x08\x63hecksum\x18\x03 \x01(\x0c\"&\n\x12\x44\x65leteModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\"&\n\x13\x44\x65leteModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xbd\x02\n\x0cModelStorage\x12h\n\rRegisterModel\x12).modyn.model_storage.RegisterModelRequest\x1a*.modyn.model_storage.RegisterModelResponse\"\x00\x12_\n\nFetchModel\x12&.modyn.model_storage.FetchModelRequest\x1a\'.modyn.model_storage.FetchModelResponse\"\x00\x12\x62\n\x0b\x44\x65leteModel\x12\'.modyn.model_storage.DeleteModelRequest\x1a(.modyn.model_storage.DeleteModelResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'model_storage_pb2', globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _REGISTERMODELREQUEST._serialized_start=44 - _REGISTERMODELREQUEST._serialized_end=159 - _REGISTERMODELRESPONSE._serialized_start=161 - _REGISTERMODELRESPONSE._serialized_end=219 - _FETCHMODELREQUEST._serialized_start=221 - _FETCHMODELREQUEST._serialized_end=258 - _FETCHMODELRESPONSE._serialized_start=260 - _FETCHMODELRESPONSE._serialized_end=317 - _DELETEMODELREQUEST._serialized_start=319 - _DELETEMODELREQUEST._serialized_end=357 - _DELETEMODELRESPONSE._serialized_start=359 - _DELETEMODELRESPONSE._serialized_end=397 - _MODELSTORAGE._serialized_start=400 - _MODELSTORAGE._serialized_end=717 + _REGISTERMODELREQUEST._serialized_start=45 + _REGISTERMODELREQUEST._serialized_end=178 + _REGISTERMODELRESPONSE._serialized_start=180 + _REGISTERMODELRESPONSE._serialized_end=238 + _FETCHMODELREQUEST._serialized_start=240 + _FETCHMODELREQUEST._serialized_end=300 + _FETCHMODELRESPONSE._serialized_start=302 + _FETCHMODELRESPONSE._serialized_end=377 + _DELETEMODELREQUEST._serialized_start=379 + _DELETEMODELREQUEST._serialized_end=417 + _DELETEMODELRESPONSE._serialized_start=419 + _DELETEMODELRESPONSE._serialized_end=457 + _MODELSTORAGE._serialized_start=460 + _MODELSTORAGE._serialized_end=777 # @@protoc_insertion_point(module_scope) diff --git a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi index 9143824a0..b03e736f3 100644 --- a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi +++ b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi @@ -23,11 +23,13 @@ class 
RegisterModelRequest(google.protobuf.message.Message): HOSTNAME_FIELD_NUMBER: builtins.int PORT_FIELD_NUMBER: builtins.int MODEL_PATH_FIELD_NUMBER: builtins.int + CHECKSUM_FIELD_NUMBER: builtins.int pipeline_id: builtins.int trigger_id: builtins.int hostname: builtins.str port: builtins.int model_path: builtins.str + checksum: builtins.bytes def __init__( self, *, @@ -36,8 +38,9 @@ class RegisterModelRequest(google.protobuf.message.Message): hostname: builtins.str = ..., port: builtins.int = ..., model_path: builtins.str = ..., + checksum: builtins.bytes = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["hostname", b"hostname", "model_path", b"model_path", "pipeline_id", b"pipeline_id", "port", b"port", "trigger_id", b"trigger_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["checksum", b"checksum", "hostname", b"hostname", "model_path", b"model_path", "pipeline_id", b"pipeline_id", "port", b"port", "trigger_id", b"trigger_id"]) -> None: ... global___RegisterModelRequest = RegisterModelRequest @@ -55,7 +58,7 @@ class RegisterModelResponse(google.protobuf.message.Message): success: builtins.bool = ..., model_id: builtins.int = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id", "success", b"success"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id", "success", b"success"]) -> None: ... global___RegisterModelResponse = RegisterModelResponse @@ -64,13 +67,16 @@ class FetchModelRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor MODEL_ID_FIELD_NUMBER: builtins.int + LOAD_METADATA_FIELD_NUMBER: builtins.int model_id: builtins.int + load_metadata: builtins.bool def __init__( self, *, model_id: builtins.int = ..., + load_metadata: builtins.bool = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["load_metadata", b"load_metadata", "model_id", b"model_id"]) -> None: ... global___FetchModelRequest = FetchModelRequest @@ -80,15 +86,18 @@ class FetchModelResponse(google.protobuf.message.Message): SUCCESS_FIELD_NUMBER: builtins.int MODEL_PATH_FIELD_NUMBER: builtins.int + CHECKSUM_FIELD_NUMBER: builtins.int success: builtins.bool model_path: builtins.str + checksum: builtins.bytes def __init__( self, *, success: builtins.bool = ..., model_path: builtins.str = ..., + checksum: builtins.bytes = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_path", b"model_path", "success", b"success"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["checksum", b"checksum", "model_path", b"model_path", "success", b"success"]) -> None: ... global___FetchModelResponse = FetchModelResponse @@ -103,7 +112,7 @@ class DeleteModelRequest(google.protobuf.message.Message): *, model_id: builtins.int = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id"]) -> None: ... global___DeleteModelRequest = DeleteModelRequest
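Taken together, a hypothetical client-side fetch against these stubs could look as follows (host, port and model id are assumptions; ModelStorageStub is the stub that grpcio generates for the ModelStorage service):

import grpc
from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest
from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageStub

stub = ModelStorageStub(grpc.insecure_channel("localhost:50059"))  # address is an assumption
response = stub.FetchModel(FetchModelRequest(model_id=42, load_metadata=True))
if response.success:
    # model_path is relative to the component's FTP directory; the checksum lets
    # the caller verify the file after downloading it.
    print(response.model_path, response.checksum.hex())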
diff --git a/modyn/model_storage/internal/grpc/grpc_server.py b/modyn/model_storage/internal/grpc/grpc_server.py index 8580c6434..5d60e5fd2 100644 --- a/modyn/model_storage/internal/grpc/grpc_server.py +++ b/modyn/model_storage/internal/grpc/grpc_server.py @@ -15,14 +15,17 @@ class GRPCServer: """GRPC server context manager.""" - def __init__(self, modyn_config: dict, storage_dir: pathlib.Path) -> None: + def __init__(self, modyn_config: dict, storage_dir: pathlib.Path, ftp_directory: pathlib.Path) -> None: """Initialize the GRPC server. Args: modyn_config (dict): Configuration of the storage module. + storage_dir (path): Path to the model storage directory. + ftp_directory (path): Path to the FTP directory. """ self.modyn_config = modyn_config self.storage_dir = storage_dir + self.ftp_directory = ftp_directory self.server = grpc.server( futures.ThreadPoolExecutor( max_workers=10, @@ -39,7 +42,9 @@ def __enter__(self) -> grpc.Server: Returns: grpc.Server: GRPC server """ - add_ModelStorageServicer_to_server(ModelStorageGRPCServicer(self.modyn_config, self.storage_dir), self.server) + add_ModelStorageServicer_to_server( + ModelStorageGRPCServicer(self.modyn_config, self.storage_dir, self.ftp_directory), self.server + ) port = self.modyn_config["model_storage"]["port"] logger.info(f"Starting GRPC server. Listening on port {port}") self.server.add_insecure_port("[::]:" + port) diff --git a/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py b/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py index 62a13694a..63152a21a 100644 --- a/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py +++ b/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py @@ -5,9 +5,9 @@ import pathlib import grpc -from modyn.common.ftp.ftp_utils import download_file -from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.models.trained_models import TrainedModel +import torch +from modyn.common.ftp.ftp_utils import download_file, get_pretrained_model_callback +from modyn.model_storage.internal import ModelStorageManager # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( @@ -19,7 +19,7 @@ RegisterModelResponse, ) from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageServicer -from modyn.utils import EMIT_MESSAGE_PERCENTAGES, current_time_millis +from modyn.utils import calculate_checksum, current_time_millis logger = logging.getLogger(__name__) @@ -27,16 +27,20 @@ class ModelStorageGRPCServicer(ModelStorageServicer): """GRPC servicer for the storage module.""" - def __init__(self, config: dict, storage_dir: pathlib.Path): + def __init__(self, config: dict, storage_dir: pathlib.Path, ftp_dir: pathlib.Path): """Initialize the model storage GRPC servicer. Args: config (dict): Configuration of the storage module. + storage_dir (path): Path to the directory where the trained models are stored. + ftp_dir (path): Path to the temporary FTP directory where the trained models are served.
""" super().__init__() self._config = config + self.ftp_dir = ftp_dir self.storage_dir = storage_dir + self.model_storage_manager = ModelStorageManager(self._config, self.storage_dir, self.ftp_dir) def RegisterModel(self, request: RegisterModelRequest, context: grpc.ServicerContext) -> RegisterModelResponse: """Registers a new model at the model storage component by downloading it from a given server. @@ -55,39 +59,31 @@ def RegisterModel(self, request: RegisterModelRequest, context: grpc.ServicerCon logger.info(f"Try to download model from {hostname}:{port}, pipeline {pipeline_id} and trigger {trigger_id}.") local_file_name = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.modyn" - local_model_path = self.storage_dir / local_file_name + local_model_path = self.ftp_dir / local_file_name logger.info(f"Remote model path is {remote_model_path}, storing at {local_model_path}.") - last_progress = 0.0 - - def callback(current_progress: float) -> None: - nonlocal last_progress - for emit_perc in EMIT_MESSAGE_PERCENTAGES: - if last_progress <= emit_perc < current_progress: - logger.info(f"Completed {emit_perc * 100}% of the download.") - last_progress = current_progress - - download_file( + success = download_file( hostname, port, "modyn", "modyn", remote_file_path=pathlib.Path(remote_model_path), local_file_path=local_model_path, - callback=callback, + callback=get_pretrained_model_callback(logger), + checksum=request.checksum, ) - logger.info("Download completed.") + if not success: + logger.error("Downloaded file does not match its checksum.") + return RegisterModelResponse(success=False) - response = RegisterModelResponse() + logger.info("Download completed. Invoking model storage manager.") - with MetadataDatabaseConnection(self._config) as database: - model_id = database.add_trained_model(pipeline_id, trigger_id, local_file_name) - response.model_id = model_id - response.success = True + model_id = self.model_storage_manager.store_model(pipeline_id, trigger_id, local_model_path) + os.remove(local_model_path) - return response + return RegisterModelResponse(success=True, model_id=model_id) def FetchModel(self, request: FetchModelRequest, context: grpc.ServicerContext) -> FetchModelResponse: """Fetch a model from the model storage component. 
def FetchModel(self, request: FetchModelRequest, context: grpc.ServicerContext) -> FetchModelResponse: """Fetch a model from the model storage component. @@ -101,21 +97,19 @@ def FetchModel(self, request: FetchModelRequest, context: grpc.ServicerContext) """ logger.info(f"Try to fetch model having id {request.model_id}") - response = FetchModelResponse() - with MetadataDatabaseConnection(self._config) as database: - model: TrainedModel = database.session.get(TrainedModel, request.model_id) - - if model: - response.model_path = model.model_path - response.success = True - - logger.info(f"Trained model {request.model_id} has local path {self.storage_dir / model.model_path}") - else: - response.success = False - - logger.warning(f"Trained model {request.model_id} was not found.") - - return response + model_dict = self.model_storage_manager.load_model(request.model_id, request.load_metadata) + if not model_dict: + logger.error(f"Trained model {request.model_id} could not be fetched.") + return FetchModelResponse(success=False) + model_file_path = self.ftp_dir / f"{current_time_millis()}_{request.model_id}.modyn" + torch.save(model_dict, model_file_path) + + logger.info(f"Trained model {request.model_id} has local path {model_file_path}") + return FetchModelResponse( + success=True, + model_path=str(model_file_path.relative_to(self.ftp_dir)), + checksum=calculate_checksum(model_file_path), + ) def DeleteModel(self, request: DeleteModelRequest, context: grpc.ServicerContext) -> DeleteModelResponse: """Delete model from the model storage component. @@ -127,27 +121,12 @@ def DeleteModel(self, request: DeleteModelRequest, context: grpc.ServicerContext Returns: DeleteModelResponse: the response containing information if the model was found in the database. """ - - logger.info(f"Try to delete model having id {request.model_id}") - - response = DeleteModelResponse() - with MetadataDatabaseConnection(self._config) as database: - model: TrainedModel = database.session.get(TrainedModel, request.model_id) - - if model: - local_model_path = self.storage_dir / model.model_path - os.remove(local_model_path) - - database.session.delete(model) - database.session.commit() - - response.success = True - logger.info( - f"Trained model {request.model_id} with path {self.storage_dir / model.model_path} has been removed" - ) - else: - response.success = False - - logger.warning(f"Trained model {request.model_id} was not found.") - - return response + model_id = request.model_id + logger.info(f"Try to delete model having id {model_id}") + + success = self.model_storage_manager.delete_model(model_id) + if success: + logger.info(f"Deleted model {request.model_id}.") + else: + logger.error(f"Deletion of model {request.model_id} was not successful.") + return DeleteModelResponse(success=success) diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py new file mode 100644 index 000000000..33c52a68e --- /dev/null +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -0,0 +1,277 @@ +import json +import logging +import pathlib +from typing import Optional + +import torch +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import Pipeline, TrainedModel +from modyn.model_storage.internal.utils import ModelStoragePolicy +from modyn.utils import current_time_millis, dynamic_module_import + +logger = logging.getLogger(__name__) + + +class ModelStorageManager: + """ + Manager of the model storage component. It implements all model-storage-related functionality.
+ """ + + def __init__(self, modyn_config: dict, storage_dir: pathlib.Path, ftp_dir: pathlib.Path): + """ + Constructor of the model storage manager. It establishes a connection to the metadata database in order + to store information related to the trained models. + + Args: + modyn_config: the modyn configuration. + storage_dir: path to the folder, in which the trained models are stored. + ftp_dir: FTP directory, which is used as temporary folder for serving trained models. + """ + self._modyn_config = modyn_config + self._storage_dir = storage_dir + self._ftp_dir = ftp_dir + + def store_model(self, pipeline_id: int, trigger_id: int, checkpoint_path: pathlib.Path) -> int: + """ + Store the trained model contained in the checkpoint file to disk. It uses the model storage policy that is + specified for the pipeline. Depending on the trigger id, it is either stored fully (according to full model + strategy) or incrementally by using the incremental model strategy. + + Args: + pipeline_id: the pipeline identifier for the model. + trigger_id: the trigger associated with the model. + checkpoint_path: path to the checkpoint containing the model. + + Returns: + int: the model id which identifies the stored model. + """ + checkpoint = torch.load(checkpoint_path) + policy = self.get_model_storage_policy(pipeline_id) + + # split the model (stored under the "model" key) from metadata. + assert "model" in checkpoint + state_dict = checkpoint["model"] + local_model_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.model" + model_path = self._storage_dir / local_model_filename + + # handle the new model according to the model storage policy. If it is stored incrementally, we receive + # the model id of the parent. + parent_id = self._handle_new_model(pipeline_id, trigger_id, state_dict, model_path, policy) + checkpoint.pop("model") + + # now checkpoint only contains optimizer state and metadata. + local_metadata_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.metadata.zip" + metadata_path = self._storage_dir / local_metadata_filename + torch.save(checkpoint, metadata_path) + + # add the new model to the database. + with MetadataDatabaseConnection(self._modyn_config) as database: + return database.add_trained_model( + pipeline_id, trigger_id, local_model_filename, local_metadata_filename, parent_id + ) + + def _handle_new_model( + self, + pipeline_id: int, + trigger_id: int, + state_dict: dict, + model_path: pathlib.Path, + policy: ModelStoragePolicy, + ) -> Optional[int]: + """ + Handle the new model according to the model storage policy. + + Args: + pipeline_id: the pipeline, to which the model belongs. + trigger_id: the trigger identifier associated with the model. + state_dict: the model's state. + model_path: path, under which the model must be stored. + policy: the model storage policy applied to store the model. + Returns: + int: if the model is stored incrementally, the parent model id is returned. + """ + + # check whether we must apply the incremental storage strategy or the full model strategy. + if policy.incremental_model_strategy and ( + policy.full_model_interval is None or trigger_id % policy.full_model_interval != 0 + ): + parent_model_id: Optional[int] = self._determine_parent_model_id(pipeline_id, trigger_id) + if parent_model_id is not None: + # load model state of the parent model. + parent_model_state = self._reconstruct_model_state(parent_model_id, policy) + + # finally store the model delta. 
+ policy.incremental_model_strategy.store_model(state_dict, parent_model_state, model_path) + + return parent_model_id + logger.warning("Previous model is not available! Storing full model...") + + # store the model in its entirety. + policy.full_model_strategy.store_model(state_dict, model_path) + return None + + def _reconstruct_model_state(self, model_id: int, policy: ModelStoragePolicy) -> dict: + """ + Reconstruct a given model according to the model storage policy. + The function recursively calls itself whenever the model is stored as a delta. + Otherwise, the model is stored according to a full model strategy and its state can be retrieved directly. + + Args: + model_id: the identifier of the model to be reconstructed. + policy: the model storage policy of the pipeline. + Returns: + dict: the reconstructed model state. + """ + + # we recursively overwrite the model state. + with MetadataDatabaseConnection(self._modyn_config) as database: + model: TrainedModel = database.session.get(TrainedModel, model_id) + if not model.parent_model: + # base case: we can load a fully stored model. + model_state = self._get_base_model_state(model.pipeline_id) + return policy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + + # recursive step: we recurse to load the model state of the parent model. + model_state = self._reconstruct_model_state(model.parent_model, policy) + + # we apply the incremental strategy to load our model state. + return policy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + + def _get_base_model_state(self, pipeline_id: int) -> dict: + """ + Get the state of a randomly initialized model associated with the pipeline. + + Args: + pipeline_id: the involved pipeline. + + Returns: + dict: the plain model state derived from the model architecture of the pipeline's models. + """ + with MetadataDatabaseConnection(self._modyn_config) as database: + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) + model_module = dynamic_module_import("modyn.models") + assert hasattr(model_module, model_class_name), f"Model {model_class_name} not available." + + model_handler = getattr(model_module, model_class_name) + return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() + + def _determine_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[int]: + """ + Determines the id of the parent model given the trigger id of a pipeline. Usually, the last fully stored + model is identified as such. The function returns None whenever no parent model can be found. + + Args: + pipeline_id: the pipeline that generated the model. + trigger_id: the trigger associated with the model. + + Returns: + Optional[int]: the parent model id (if it can be found). + """ + with MetadataDatabaseConnection(self._modyn_config) as database: + previous_model: TrainedModel = ( + database.session.query(TrainedModel) + .filter(TrainedModel.pipeline_id == pipeline_id, TrainedModel.trigger_id == trigger_id - 1) + .first() + ) + + # whenever the previous model is not present, a parent model cannot be determined. + if not previous_model: + return None + # return the id of the previous model if it is stored in its entirety. + if previous_model.parent_model is None: + return previous_model.model_id + # otherwise return the parent model of the previous model.
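+ # (This flattens the delta chain: every delta references the last fully stored + # model, so reconstruction needs at most one incremental step.)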
+ return previous_model.parent_model + + def load_model(self, model_id: int, metadata: bool) -> Optional[dict]: + """ + Loads a given model and optionally also appends its metadata. + + Args: + model_id: the identifier of the model. + metadata: whether metadata should be loaded alongside. + + Returns: + Optional[dict]: dictionary containing the model state and metadata if the model exists. + """ + with MetadataDatabaseConnection(self._modyn_config) as database: + model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id) + if model is None: + logger.error(f"Model {model_id} does not exist.") + return None + policy = self.get_model_storage_policy(model.pipeline_id) + + # retrieve the model by loading its state dictionary. + model_state = self._reconstruct_model_state(model_id, policy) + model_dict = {"model": model_state} + + # append the metadata to the dictionary if specified. + if metadata: + metadata_dict = torch.load(self._storage_dir / model.metadata_path) + model_dict.update(metadata_dict) + + return model_dict + + def delete_model(self, model_id: int) -> bool: + """ + Deletes a given model. This only succeeds if all dependent models (children) have been deleted. + + Args: + model_id: the identifier of the model. + + Returns: + bool: True if the deletion was successful. + """ + with MetadataDatabaseConnection(self._modyn_config) as database: + model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id) + + if model is None: + logger.error(f"Trained model {model_id} was not found.") + return False + + children = model.children + if len(children) > 0: + child_ids = [str(child.model_id) for child in children] + logger.info(f"Model {model_id} has dependent child models: {', '.join(child_ids)}") + return False + + (self._storage_dir / model.model_path).unlink() + (self._storage_dir / model.metadata_path).unlink() + + database.session.delete(model) + database.session.commit() + logger.info(f"Successfully deleted model {model_id}.") + return True + + def get_model_storage_policy(self, pipeline_id: int) -> ModelStoragePolicy: + """ + Returns the model storage policy associated with the pipeline. + + Args: + pipeline_id: the id of the pipeline from which the policy is taken. + + Returns: + ModelStoragePolicy: the model storage policy of the pipeline. + """ + + with MetadataDatabaseConnection(self._modyn_config) as database: + pipeline: Pipeline = database.session.query(Pipeline).get(pipeline_id) + + policy = ModelStoragePolicy( + self._ftp_dir, + pipeline.full_model_strategy_name, + pipeline.full_model_strategy_zip, + pipeline.full_model_strategy_zip_algorithm, + pipeline.full_model_strategy_config, + ) + + if pipeline.inc_model_strategy_name is not None: + policy.register_incremental_model_strategy( + pipeline.inc_model_strategy_name, + pipeline.inc_model_strategy_zip, + pipeline.inc_model_strategy_zip_algorithm, + pipeline.inc_model_strategy_config, + pipeline.full_model_interval, + ) + + return policy diff --git a/modyn/model_storage/internal/storage_strategies/__init__.py b/modyn/model_storage/internal/storage_strategies/__init__.py new file mode 100644 index 000000000..36034da4f --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/__init__.py @@ -0,0 +1,14 @@ +""" +Model Storage module. + +The model storage module contains all classes and functions related to the storage and retrieval of models.
+""" + +import os + +from .abstract_difference_operator import AbstractDifferenceOperator # noqa: F401 +from .abstract_model_storage_strategy import AbstractModelStorageStrategy # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py b/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py new file mode 100644 index 000000000..161870fd1 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py @@ -0,0 +1,40 @@ +from abc import ABC, abstractmethod + +import torch + + +class AbstractDifferenceOperator(ABC): + """ + This is the base class for all difference operators. These operators can be used to calculate the difference + between two successive models in the pipeline and later be used in a incremental model storage strategy. + """ + + @staticmethod + @abstractmethod + def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> bytes: + """ + Calculate the difference between two tensors. + + Args: + tensor: the tensor representing some weights of the current model. + tensor_prev: the tensor representing the same weights of the preceding model. + + Returns: + bytes: the byte-level difference. + """ + raise NotImplementedError() + + @staticmethod + @abstractmethod + def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: + """ + Restores a weight tensor. + + Args: + tensor_prev: the tensor representing some weights of the preceding model. + buffer: difference bytes, from which to restore the weights of the current model. + + Returns: + tensor: the weight tensor of the current model. + """ + raise NotImplementedError() diff --git a/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py b/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py new file mode 100644 index 000000000..d4f01cc3a --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py @@ -0,0 +1,32 @@ +import pathlib +from abc import ABC +from zipfile import ZIP_DEFLATED + +from modyn.utils import dynamic_module_import + + +class AbstractModelStorageStrategy(ABC): + """ + Base class for all model storage strategies. + """ + + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str): + """ + Initialize a model storage strategy. + + Args: + zipping_dir: directory, in which the model is zipped. + zip_activated: whether the generated file is zipped. + zip_algorithm_name: name of the zip algorithm. 
+ """ + self.zipping_dir = zipping_dir + self.zip = zip_activated + self.zip_algorithm = ZIP_DEFLATED + self._validate_zip_config(zip_algorithm_name) + + def _validate_zip_config(self, zip_algorithm_name: str) -> None: + if self.zip and zip_algorithm_name: + zip_module = dynamic_module_import("zipfile") + if not hasattr(zip_module, zip_algorithm_name): + raise NotImplementedError(f"The zip algorithm {zip_algorithm_name} is unknown!") + self.zip_algorithm = getattr(zip_module, zip_algorithm_name) diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/__init__.py b/modyn/model_storage/internal/storage_strategies/difference_operators/__init__.py new file mode 100644 index 000000000..62a936bb3 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/__init__.py @@ -0,0 +1,14 @@ +""" +Model Storage module. + +The model storage module contains all classes and functions related to the storage and retrieval of models. +""" + +import os + +from .sub_difference_operator import SubDifferenceOperator # noqa: F401 +from .xor_difference_operator import XorDifferenceOperator # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py b/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py new file mode 100644 index 000000000..343b5d0a0 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py @@ -0,0 +1,15 @@ +import torch +from modyn.model_storage.internal.storage_strategies.abstract_difference_operator import AbstractDifferenceOperator +from modyn.utils import reconstruct_tensor_from_bytes + + +class SubDifferenceOperator(AbstractDifferenceOperator): + @staticmethod + def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> bytes: + diff = tensor - tensor_prev + return diff.numpy().tobytes() + + @staticmethod + def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: + difference_tensor = reconstruct_tensor_from_bytes(tensor_prev, buffer) + return tensor_prev + difference_tensor diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py b/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py new file mode 100644 index 000000000..7c0883924 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py @@ -0,0 +1,18 @@ +import torch +from modyn.model_storage.internal.storage_strategies.abstract_difference_operator import AbstractDifferenceOperator +from modyn.utils import reconstruct_tensor_from_bytes + + +class XorDifferenceOperator(AbstractDifferenceOperator): + @staticmethod + def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> bytes: + bytes_curr = tensor.numpy().tobytes() + bytes_prev = tensor_prev.numpy().tobytes() + + return bytes(a ^ b for (a, b) in zip(bytes_curr, bytes_prev)) + + @staticmethod + def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: + prev_model_data = tensor_prev.numpy().tobytes() + new_model_data = bytes(a ^ b for (a, b) in zip(prev_model_data, buffer)) + return reconstruct_tensor_from_bytes(tensor_prev, new_model_data) diff --git 
diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py new file mode 100644 index 000000000..c185068a9 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py @@ -0,0 +1,15 @@ +""" +Model Storage module. + +The model storage module contains all classes and functions related to the storage and retrieval of models. +""" + +import os + +from .abstract_full_model_strategy import AbstractFullModelStrategy # noqa: F401 +from .binary_full_model import BinaryFullModel # noqa: F401 +from .pytorch_full_model import PyTorchFullModel # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py new file mode 100644 index 000000000..d13c2c740 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py @@ -0,0 +1,55 @@ +import pathlib +import tempfile +from abc import ABC, abstractmethod + +from modyn.model_storage.internal.storage_strategies.abstract_model_storage_strategy import AbstractModelStorageStrategy +from modyn.utils import unzip_file, zip_file + + +class AbstractFullModelStrategy(AbstractModelStorageStrategy, ABC): + """ + This is the base class for all full model strategies. That is, strategies which contain full information about + a model in order to reproduce its model state. + """ + + @abstractmethod + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: + """ + Stores the model state to the given file. + + Args: + model_state: the state dictionary of the model. + file_path: the path to the file in which to store the state. + """ + raise NotImplementedError() + + def store_model(self, model_state: dict, file_path: pathlib.Path) -> None: + if self.zip: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + self._store_model(model_state, temp_file_path) + zip_file(temp_file_path, file_path, self.zip_algorithm, remove_file=False) + else: + self._store_model(model_state, file_path) + + @abstractmethod + def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: + """ + Load the model state from the given file. + + Args: + base_model_state: the base model state which must be overwritten. + file_path: the path to the file that contains the state information. + + Returns: + dict: the state dictionary of the loaded model.
+ """ + raise NotImplementedError() + + def load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: + if self.zip: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + unzip_file(file_path, temp_file_path, compression=self.zip_algorithm, remove_file=False) + return self._load_model(base_model_state, temp_file_path) + return self._load_model(base_model_state, file_path) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py new file mode 100644 index 000000000..f8a669ce2 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py @@ -0,0 +1,26 @@ +import pathlib + +from modyn.model_storage.internal.storage_strategies.full_model_strategies import AbstractFullModelStrategy +from modyn.utils import get_tensor_byte_size, reconstruct_tensor_from_bytes + + +class BinaryFullModel(AbstractFullModelStrategy): + """ + This full model strategy stores the weights as binary sequence. + """ + + # pylint: disable-next=unused-argument + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): + super().__init__(zipping_dir, zip_activated, zip_algorithm_name) + + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: + with open(file_path, "wb") as file: + for tensor in model_state.values(): + file.write(tensor.numpy().tobytes()) + + def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: + with open(file_path, "rb") as file: + for layer, tensor in base_model_state.items(): + num_bytes = get_tensor_byte_size(tensor) + base_model_state[layer] = reconstruct_tensor_from_bytes(tensor, file.read(num_bytes)) + return base_model_state diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py new file mode 100644 index 000000000..26c317ab2 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py @@ -0,0 +1,29 @@ +import logging +import pathlib + +import torch +from modyn.model_storage.internal.storage_strategies.full_model_strategies.abstract_full_model_strategy import ( + AbstractFullModelStrategy, +) + +logger = logging.getLogger(__name__) + + +class PyTorchFullModel(AbstractFullModelStrategy): + """ + This full model strategy naively stores the whole model on disk (default pytorch implementation). 
+ """ + + # pylint: disable-next=unused-argument + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): + super().__init__(zipping_dir, False, zip_algorithm_name) + + if zip_activated: + logger.warning("The zipping option is disabled for this strategy since its already performed natively.") + + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: + torch.save(model_state, file_path) + + def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: + base_model_state.update(torch.load(file_path)) + return base_model_state diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/__init__.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/__init__.py new file mode 100644 index 000000000..6bf7c0fda --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/__init__.py @@ -0,0 +1,14 @@ +""" +Model Storage module. + +The model storage module contains all classes and functions related to the storage and retrieval of models. +""" + +import os + +from .abstract_incremental_model_strategy import AbstractIncrementalModelStrategy # noqa: F401 +from .weights_difference import WeightsDifference # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py new file mode 100644 index 000000000..de7435b35 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py @@ -0,0 +1,56 @@ +import pathlib +import tempfile +from abc import ABC, abstractmethod + +from modyn.model_storage.internal.storage_strategies.abstract_model_storage_strategy import AbstractModelStorageStrategy +from modyn.utils import unzip_file, zip_file + + +class AbstractIncrementalModelStrategy(AbstractModelStorageStrategy, ABC): + """ + This is the base class for all incremental model strategies. These strategies build on the idea of storing a delta + between two successive models in order to reproduce the latter one. + """ + + @abstractmethod + def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: + """ + Stores the delta between two successive models. + + Args: + model_state: the newer model state. + prev_model_state: the state of the preceding model. + file_path: the path to the file in which the delta is stored. + """ + raise NotImplementedError() + + def store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: + if self.zip: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + self._store_model(model_state, prev_model_state, temp_file_path) + zip_file(temp_file_path, file_path, self.zip_algorithm, remove_file=False) + else: + self._store_model(model_state, prev_model_state, file_path) + + @abstractmethod + def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> dict: + """ + Loads a model state by overwriting the state of the preceding model. + + Args: + prev_model_state: the state of the preceding model. + file_path: the path to the file which contains the delta. 
+ + Returns: + dict: the state dictionary of the loaded model. + """ + raise NotImplementedError() + + def load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> dict: + if self.zip: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + unzip_file(file_path, temp_file_path, compression=self.zip_algorithm, remove_file=False) + return self._load_model(prev_model_state, temp_file_path) + return self._load_model(prev_model_state, file_path) diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py new file mode 100644 index 000000000..f86d60277 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py @@ -0,0 +1,158 @@ +import io +import pathlib +from typing import BinaryIO, Union + +import torch +from bitstring import BitArray +from modyn.model_storage.internal.storage_strategies.difference_operators import ( + SubDifferenceOperator, + XorDifferenceOperator, +) +from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import ( + AbstractIncrementalModelStrategy, +) +from modyn.utils import get_tensor_byte_size + +available_difference_operators = {"xor": XorDifferenceOperator, "sub": SubDifferenceOperator} + + +class WeightsDifference(AbstractIncrementalModelStrategy): + """ + This incremental model strategy stores the delta between two successive model states as difference of their + weight tensors. It currently supports two difference operators: xor and sub. + """ + + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): + super().__init__(zipping_dir, zip_activated, zip_algorithm_name) + + self._validate_config(config) + + def _validate_config(self, config: dict) -> None: + self.difference_operator = SubDifferenceOperator + if "operator" in config: + difference_operator_name = config["operator"] + if difference_operator_name not in available_difference_operators: + raise ValueError(f"Operator should be one of {available_difference_operators}.") + self.difference_operator = available_difference_operators[difference_operator_name] + self.split_exponent = config["split_exponent"] if "split_exponent" in config else False + self.rle = config["rle"] if "rle" in config else False + + def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: + bytestream = io.BytesIO() + exponent_bytestream = io.BytesIO() if self.split_exponent else None + + for tensor_model, tensor_prev_model in zip(model_state.values(), prev_model_state.values()): + difference = self.difference_operator.calculate_difference(tensor_model, tensor_prev_model) + + if exponent_bytestream is not None and tensor_model.dtype == torch.float32: + for i in range(0, len(difference), 4): + reordered_diff = self.reorder_buffer(difference[i : i + 4]) + bytestream.write(reordered_diff[0:3]) + exponent_bytestream.write(reordered_diff[3:4]) + else: + bytestream.write(difference) + + with open(file_path, "wb") as file: + if exponent_bytestream is not None: + exponents = exponent_bytestream.getvalue() + if self.rle: + exponents = self.encode_bytes(exponents) + file.write(len(exponents).to_bytes(8, byteorder="big")) + file.write(exponents) + file.write(bytestream.getbuffer().tobytes()) + + def _load_model(self, prev_model_state: 
dict, file_path: pathlib.Path) -> dict: + with open(file_path, "rb") as file: + if not self.split_exponent: + for layer_name, tensor in prev_model_state.items(): + num_bytes = get_tensor_byte_size(tensor) + prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) + return prev_model_state + return self._load_model_split_exponent(prev_model_state, file) + + def _load_model_split_exponent(self, prev_model_state: dict, file: BinaryIO) -> dict: + exponent_bytes_amount = int.from_bytes(file.read(8), byteorder="big") + + with io.BytesIO() as exponent_bytes: + exponent_bytes.write( + self.decode_bytes(file.read(exponent_bytes_amount)) if self.rle else file.read(exponent_bytes_amount) + ) + exponent_bytes.seek(0) + + for layer_name, tensor in prev_model_state.items(): + num_bytes = get_tensor_byte_size(tensor) + + if tensor.dtype == torch.float32: + buffer = bytearray(num_bytes) + for i in range(0, num_bytes, 4): + buffer[i : i + 3] = file.read(3) + buffer[i + 3 : i + 4] = exponent_bytes.read(1) + + prev_model_state[layer_name] = self.difference_operator.restore(tensor, self.reorder_buffer(buffer)) + else: + prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) + return prev_model_state + + @staticmethod + def reorder_buffer(buffer: Union[bytes, bytearray]) -> bytes: + bit_array = BitArray(buffer) + array_size = len(bit_array) + + for i in range(0, array_size, 32): + # exchange sign bit with last exponent bit + sign_bit = bit_array[i + 24] + bit_array[i + 24] = bit_array[i + 16] + bit_array[i + 16] = sign_bit + + return bit_array.bytes + + @staticmethod + def encode_bytes(buffer: bytes) -> bytes: + """ + Perform byte-wise run-length encoding. + + Args: + buffer: the bytes to be encoded. + + Returns: + bytes: the encoded bytes. + """ + if len(buffer) == 0: + return buffer + bytestream = io.BytesIO() + + curr = buffer[0] + count = 0 + + for byte in buffer: + if byte == curr and count < 255: + count += 1 + else: + bytestream.write(count.to_bytes(1, byteorder="big")) + bytestream.write(curr.to_bytes(1, byteorder="big")) + curr = byte + count = 1 + bytestream.write(count.to_bytes(1, byteorder="big")) + bytestream.write(curr.to_bytes(1, byteorder="big")) + + return bytestream.getvalue() + + @staticmethod + def decode_bytes(buffer: bytes) -> bytes: + """ + Decode run-length encoded bytes. + + Args: + buffer: the encoded bytes. + + Returns: + bytes: the decoded bytes. + """ + assert len(buffer) % 2 == 0, "should be of even length" + bytestream = io.BytesIO() + + for i in range(0, len(buffer), 2): + count = int.from_bytes(buffer[i : i + 1], byteorder="big") + + bytestream.write(count * buffer[i + 1 : i + 2]) + return bytestream.getvalue() diff --git a/modyn/model_storage/internal/utils/__init__.py b/modyn/model_storage/internal/utils/__init__.py new file mode 100644 index 000000000..56eeadaac --- /dev/null +++ b/modyn/model_storage/internal/utils/__init__.py @@ -0,0 +1,13 @@ +""" +Model Storage module. + +The model storage module contains all classes and functions related to the storage and retrieval of models. 
+""" + +import os + +from .model_storage_policy import ModelStoragePolicy # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/utils/model_storage_policy.py b/modyn/model_storage/internal/utils/model_storage_policy.py new file mode 100644 index 000000000..9220be4e2 --- /dev/null +++ b/modyn/model_storage/internal/utils/model_storage_policy.py @@ -0,0 +1,77 @@ +import json +import logging +import pathlib +from typing import Optional, Union + +from modyn.model_storage.internal.storage_strategies.full_model_strategies import AbstractFullModelStrategy +from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import ( + AbstractIncrementalModelStrategy, +) +from modyn.utils import dynamic_module_import + +logger = logging.getLogger(__name__) + +FULL_MODEL_STRATEGY_MODULE = "modyn.model_storage.internal.storage_strategies.full_model_strategies" +INCREMENTAL_MODEL_STRATEGY_MODULE = "modyn.model_storage.internal.storage_strategies.incremental_model_strategies" + + +class ModelStoragePolicy: + """ + Class used to represent the model storage policy. It loads the specified strategies. + """ + + def __init__( + self, + zipping_dir: pathlib.Path, + full_model_strategy_name: str, + full_model_strategy_zip: Optional[bool], + full_model_strategy_zip_algorithm: Optional[str], + full_model_strategy_config: Optional[str], + ) -> None: + self.zipping_dir = zipping_dir + self.full_model_strategy: AbstractFullModelStrategy = self._setup_model_storage_strategy( + full_model_strategy_name, + full_model_strategy_zip, + full_model_strategy_zip_algorithm, + full_model_strategy_config, + FULL_MODEL_STRATEGY_MODULE, + ) + + self.incremental_model_strategy: Optional[AbstractIncrementalModelStrategy] = None + self.full_model_interval: Optional[int] = None + + def register_incremental_model_strategy( + self, + name: str, + zip_enabled: Optional[bool], + zip_algorithm: Optional[str], + config: Optional[str], + full_model_interval: Optional[int], + ) -> None: + self.incremental_model_strategy = self._setup_model_storage_strategy( + name, zip_enabled, zip_algorithm, config, INCREMENTAL_MODEL_STRATEGY_MODULE + ) + if full_model_interval is not None: + self._validate_full_model_interval(full_model_interval) + + def _validate_full_model_interval(self, full_model_interval: int) -> None: + if full_model_interval <= 0: + raise ValueError("Full model interval should be positive.") + self.full_model_interval = full_model_interval + + def _setup_model_storage_strategy( + self, + name: str, + zip_enabled: Optional[bool], + zip_algorithm: Optional[str], + config: Optional[str], + module_name: str, + ) -> Union[AbstractFullModelStrategy, AbstractIncrementalModelStrategy]: + model_storage_module = dynamic_module_import(module_name) + if not hasattr(model_storage_module, name): + raise NotImplementedError(f"Strategy {name} not implemented!") + model_storage_strategy_handler = getattr(model_storage_module, name) + strategy_config = json.loads(config) if config else {} + return model_storage_strategy_handler( + self.zipping_dir, zip_enabled or False, zip_algorithm or "ZIP_DEFLATED", strategy_config + ) diff --git a/modyn/model_storage/model_storage.py b/modyn/model_storage/model_storage.py index 5ac2407ec..8a6550e28 100644 --- a/modyn/model_storage/model_storage.py +++ b/modyn/model_storage/model_storage.py @@ -1,34 +1,44 @@ import os import pathlib -from typing import Optional, 
diff --git a/modyn/model_storage/model_storage.py index 5ac2407ec..8a6550e28 100644 --- a/modyn/model_storage/model_storage.py +++ b/modyn/model_storage/model_storage.py @@ -1,34 +1,44 @@ import os import pathlib -from typing import Optional, Tuple +import shutil -from jsonschema import ValidationError from modyn.common.ftp.ftp_server import FTPServer from modyn.model_storage.internal.grpc.grpc_server import GRPCServer -from modyn.utils import validate_yaml +from modyn.utils import is_directory_writable class ModelStorage: def __init__(self, config: dict) -> None: self.config = config + self._init_model_storage_directory() - valid, errors = self._validate_config() - if not valid: - raise ValueError(f"Invalid configuration: {errors}") + def _init_model_storage_directory(self) -> None: + model_storage_directory = pathlib.Path(self.config["model_storage"]["models_directory"]) - self._setup_model_storage_directory() + if not model_storage_directory.exists(): + raise ValueError( + f"The model storage directory {model_storage_directory} does not exist. \ + Please create the directory or mount another existing directory." + ) - def _validate_config(self) -> Tuple[bool, Optional[ValidationError]]: - schema_path = ( - pathlib.Path(os.path.abspath(__file__)).parent.parent / "config" / "schema" / "modyn_config_schema.yaml" - ) - return validate_yaml(self.config, schema_path) + if not is_directory_writable(model_storage_directory): + raise ValueError( + f"The model storage directory {model_storage_directory} is not writable. \ + Please check the directory permissions and try again.\n" + + f"Directory info: {os.stat(model_storage_directory)}" ) - def _setup_model_storage_directory(self) -> None: - self.model_storage_directory = pathlib.Path(os.getcwd()) / "model_storage" - os.makedirs(self.model_storage_directory) + self.models_directory = model_storage_directory / "models" + self.models_directory.mkdir(exist_ok=True) + self.ftp_directory = model_storage_directory / "ftp" + + if self.ftp_directory.exists() and self.ftp_directory.is_dir(): + shutil.rmtree(self.ftp_directory) + self.ftp_directory.mkdir(exist_ok=False) def run(self) -> None: - with GRPCServer(self.config, self.model_storage_directory) as server: - with FTPServer(self.config["model_storage"]["ftp_port"], self.model_storage_directory): + with GRPCServer(self.config, self.models_directory, self.ftp_directory) as server: + with FTPServer(self.config["model_storage"]["ftp_port"], self.ftp_directory): server.wait_for_termination() + + shutil.rmtree(self.ftp_directory) diff --git a/modyn/protos/evaluator.proto b/modyn/protos/evaluator.proto index ff95f1b2a..12cb64d02 100644 --- a/modyn/protos/evaluator.proto +++ b/modyn/protos/evaluator.proto @@ -24,17 +24,14 @@ message MetricConfiguration { } message EvaluateModelRequest { - int32 trained_model_id = 1; + int32 model_id = 1; DatasetInfo dataset_info = 2; string device = 3; - bool amp = 4; - int32 batch_size = 5; - repeated MetricConfiguration metrics = 6; - string model_id = 7; - JsonString model_configuration = 8; - repeated string transform_list = 9; - PythonString bytes_parser = 10; - PythonString label_transformer = 11; + int32 batch_size = 4; + repeated MetricConfiguration metrics = 5; + repeated string transform_list = 6; + PythonString bytes_parser = 7; + PythonString label_transformer = 8; } message EvaluateModelResponse { diff --git a/modyn/protos/model_storage.proto b/modyn/protos/model_storage.proto index 03e177916..bd4184af9 100644 --- a/modyn/protos/model_storage.proto +++ b/modyn/protos/model_storage.proto @@ -14,6 +14,7 @@ message RegisterModelRequest { string hostname = 3; int32 port = 4; string model_path = 5; + bytes checksum = 6; } message RegisterModelResponse
{ message FetchModelRequest { int32 model_id = 1; + bool load_metadata = 2; } message FetchModelResponse { bool success = 1; string model_path = 2; + bytes checksum = 3; } message DeleteModelRequest { diff --git a/modyn/protos/selector.proto b/modyn/protos/selector.proto index 03ab3252b..4bbdc157f 100644 --- a/modyn/protos/selector.proto +++ b/modyn/protos/selector.proto @@ -26,6 +26,20 @@ message Empty {} message JsonString { string value = 1; } +message StrategyConfig { + string name = 1; + optional bool zip = 2; + optional string zip_algorithm = 3; + optional JsonString config = 4; +} + +// TODO(#302): Remove this when reworking pipeline registration +message ModelStoragePolicyInfo { + StrategyConfig full_model_strategy_config = 1; + optional StrategyConfig incremental_model_strategy_config = 2; + optional int32 full_model_interval = 3; +} + message DataInformRequest { int32 pipeline_id = 1; repeated int64 keys = 2; @@ -43,6 +57,10 @@ message TriggerResponse { message RegisterPipelineRequest { int32 num_workers = 1; JsonString selection_strategy = 2; + string model_class_name = 3; + JsonString model_configuration = 4; + bool amp = 5; + ModelStoragePolicyInfo model_storage_policy = 6; } message PipelineResponse { int32 pipeline_id = 1; } diff --git a/modyn/protos/trainer_server.proto b/modyn/protos/trainer_server.proto index cf31c2850..ce8c684a4 100644 --- a/modyn/protos/trainer_server.proto +++ b/modyn/protos/trainer_server.proto @@ -35,28 +35,25 @@ message StartTrainingRequest { int32 pipeline_id = 1; int32 trigger_id = 2; string device = 3; - bool amp = 4; - string model_id = 5; - JsonString model_configuration = 6; - bool use_pretrained_model = 7; - bool load_optimizer_state = 8; - int32 pretrained_model_id = 9; - int32 batch_size = 10; - JsonString torch_optimizers_configuration = 11; - string torch_criterion = 12; - JsonString criterion_parameters = 13; - Data data_info = 14; - CheckpointInfo checkpoint_info = 15; - PythonString bytes_parser = 16; - repeated string transform_list = 17; - JsonString lr_scheduler = 18; - PythonString label_transformer = 19; - JsonString grad_scaler_configuration = 20; - int32 epochs_per_trigger = 21; - int32 num_prefetched_partitions = 22; - int32 parallel_prefetch_requests = 23; - optional int32 seed = 24; - optional PythonString tokenizer = 25; + bool use_pretrained_model = 4; + bool load_optimizer_state = 5; + int32 pretrained_model_id = 6; + int32 batch_size = 7; + JsonString torch_optimizers_configuration = 8; + string torch_criterion = 9; + JsonString criterion_parameters = 10; + Data data_info = 11; + CheckpointInfo checkpoint_info = 12; + PythonString bytes_parser = 13; + repeated string transform_list = 14; + JsonString lr_scheduler = 15; + PythonString label_transformer = 16; + JsonString grad_scaler_configuration = 17; + int32 epochs_per_trigger = 18; + int32 num_prefetched_partitions = 19; + int32 parallel_prefetch_requests = 20; + optional int32 seed = 21; + optional PythonString tokenizer = 22; } message StartTrainingResponse { diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.py b/modyn/selector/internal/grpc/generated/selector_pb2.py index 48da76de2..9dbf92ebe 100644 --- a/modyn/selector/internal/grpc/generated/selector_pb2.py +++ b/modyn/selector/internal/grpc/generated/selector_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 
\x01(\t\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"7\n\x12\x44\x61taInformResponse\x12!\n\x03log\x18\x01 \x01(\x0b\x32\x14.selector.JsonString\"H\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\x12!\n\x03log\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\"`\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xf6\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12J\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x1c.selector.DataInformResponse\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a .selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x9c\x01\n\x0eStrategyConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x03zip\x18\x02 \x01(\x08H\x00\x88\x01\x01\x12\x1a\n\rzip_algorithm\x18\x03 \x01(\tH\x01\x88\x01\x01\x12)\n\x06\x63onfig\x18\x04 \x01(\x0b\x32\x14.selector.JsonStringH\x02\x88\x01\x01\x42\x06\n\x04_zipB\x10\n\x0e_zip_algorithmB\t\n\x07_config\"\x80\x02\n\x16ModelStoragePolicyInfo\x12<\n\x1a\x66ull_model_strategy_config\x18\x01 \x01(\x0b\x32\x18.selector.StrategyConfig\x12H\n!incremental_model_strategy_config\x18\x02 \x01(\x0b\x32\x18.selector.StrategyConfigH\x00\x88\x01\x01\x12 \n\x13\x66ull_model_interval\x18\x03 \x01(\x05H\x01\x88\x01\x01\x42$\n\"_incremental_model_strategy_configB\x16\n\x14_full_model_interval\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"7\n\x12\x44\x61taInformResponse\x12!\n\x03log\x18\x01 \x01(\x0b\x32\x14.selector.JsonString\"H\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\x12!\n\x03log\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\"\xfa\x01\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\x12\x18\n\x10model_class_name\x18\x03 \x01(\t\x12\x31\n\x13model_configuration\x18\x04 \x01(\x0b\x32\x14.selector.JsonString\x12\x0b\n\x03\x61mp\x18\x05 \x01(\x08\x12>\n\x14model_storage_policy\x18\x06 \x01(\x0b\x32 .selector.ModelStoragePolicyInfo\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 
\x01(\x08\x32\xf6\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12J\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x1c.selector.DataInformResponse\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a .selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'selector_pb2', globals()) @@ -25,48 +25,52 @@ _EMPTY._serialized_end=35 _JSONSTRING._serialized_start=37 _JSONSTRING._serialized_end=64 - _DATAINFORMREQUEST._serialized_start=66 - _DATAINFORMREQUEST._serialized_end=156 - _DATAINFORMRESPONSE._serialized_start=158 - _DATAINFORMRESPONSE._serialized_end=213 - _TRIGGERRESPONSE._serialized_start=215 - _TRIGGERRESPONSE._serialized_end=287 - _REGISTERPIPELINEREQUEST._serialized_start=289 - _REGISTERPIPELINEREQUEST._serialized_end=385 - _PIPELINERESPONSE._serialized_start=387 - _PIPELINERESPONSE._serialized_end=426 - _GETSAMPLESREQUEST._serialized_start=428 - _GETSAMPLESREQUEST._serialized_end=529 - _SAMPLESRESPONSE._serialized_start=531 - _SAMPLESRESPONSE._serialized_end=615 - _GETNUMBEROFSAMPLESREQUEST._serialized_start=617 - _GETNUMBEROFSAMPLESREQUEST._serialized_end=685 - _NUMBEROFSAMPLESRESPONSE._serialized_start=687 - _NUMBEROFSAMPLESRESPONSE._serialized_end=733 - _GETSTATUSBARSCALEREQUEST._serialized_start=735 - _GETSTATUSBARSCALEREQUEST._serialized_end=782 - _STATUSBARSCALERESPONSE._serialized_start=784 - _STATUSBARSCALERESPONSE._serialized_end=834 - _GETNUMBEROFPARTITIONSREQUEST._serialized_start=836 - _GETNUMBEROFPARTITIONSREQUEST._serialized_end=907 - _NUMBEROFPARTITIONSRESPONSE._serialized_start=909 - _NUMBEROFPARTITIONSRESPONSE._serialized_end=961 - _GETAVAILABLELABELSREQUEST._serialized_start=963 - _GETAVAILABLELABELSREQUEST._serialized_end=1011 - _AVAILABLELABELSRESPONSE._serialized_start=1013 - _AVAILABLELABELSRESPONSE._serialized_end=1064 - _GETSELECTIONSTRATEGYREQUEST._serialized_start=1066 - _GETSELECTIONSTRATEGYREQUEST._serialized_end=1116 - _SELECTIONSTRATEGYRESPONSE._serialized_start=1119 - _SELECTIONSTRATEGYRESPONSE._serialized_end=1249 - _USESWEIGHTSREQUEST._serialized_start=1251 - _USESWEIGHTSREQUEST._serialized_end=1292 - _USESWEIGHTSRESPONSE._serialized_start=1294 - _USESWEIGHTSRESPONSE._serialized_end=1337 - _SEEDSELECTORREQUEST._serialized_start=1339 - _SEEDSELECTORREQUEST._serialized_end=1374 - _SEEDSELECTORRESPONSE._serialized_start=1376 - _SEEDSELECTORRESPONSE._serialized_end=1415 - _SELECTOR._serialized_start=1418 - 
_SELECTOR._serialized_end=2432 + _STRATEGYCONFIG._serialized_start=67 + _STRATEGYCONFIG._serialized_end=223 + _MODELSTORAGEPOLICYINFO._serialized_start=226 + _MODELSTORAGEPOLICYINFO._serialized_end=482 + _DATAINFORMREQUEST._serialized_start=484 + _DATAINFORMREQUEST._serialized_end=574 + _DATAINFORMRESPONSE._serialized_start=576 + _DATAINFORMRESPONSE._serialized_end=631 + _TRIGGERRESPONSE._serialized_start=633 + _TRIGGERRESPONSE._serialized_end=705 + _REGISTERPIPELINEREQUEST._serialized_start=708 + _REGISTERPIPELINEREQUEST._serialized_end=958 + _PIPELINERESPONSE._serialized_start=960 + _PIPELINERESPONSE._serialized_end=999 + _GETSAMPLESREQUEST._serialized_start=1001 + _GETSAMPLESREQUEST._serialized_end=1102 + _SAMPLESRESPONSE._serialized_start=1104 + _SAMPLESRESPONSE._serialized_end=1188 + _GETNUMBEROFSAMPLESREQUEST._serialized_start=1190 + _GETNUMBEROFSAMPLESREQUEST._serialized_end=1258 + _NUMBEROFSAMPLESRESPONSE._serialized_start=1260 + _NUMBEROFSAMPLESRESPONSE._serialized_end=1306 + _GETSTATUSBARSCALEREQUEST._serialized_start=1308 + _GETSTATUSBARSCALEREQUEST._serialized_end=1355 + _STATUSBARSCALERESPONSE._serialized_start=1357 + _STATUSBARSCALERESPONSE._serialized_end=1407 + _GETNUMBEROFPARTITIONSREQUEST._serialized_start=1409 + _GETNUMBEROFPARTITIONSREQUEST._serialized_end=1480 + _NUMBEROFPARTITIONSRESPONSE._serialized_start=1482 + _NUMBEROFPARTITIONSRESPONSE._serialized_end=1534 + _GETAVAILABLELABELSREQUEST._serialized_start=1536 + _GETAVAILABLELABELSREQUEST._serialized_end=1584 + _AVAILABLELABELSRESPONSE._serialized_start=1586 + _AVAILABLELABELSRESPONSE._serialized_end=1637 + _GETSELECTIONSTRATEGYREQUEST._serialized_start=1639 + _GETSELECTIONSTRATEGYREQUEST._serialized_end=1689 + _SELECTIONSTRATEGYRESPONSE._serialized_start=1692 + _SELECTIONSTRATEGYRESPONSE._serialized_end=1822 + _USESWEIGHTSREQUEST._serialized_start=1824 + _USESWEIGHTSREQUEST._serialized_end=1865 + _USESWEIGHTSRESPONSE._serialized_start=1867 + _USESWEIGHTSRESPONSE._serialized_end=1910 + _SEEDSELECTORREQUEST._serialized_start=1912 + _SEEDSELECTORREQUEST._serialized_end=1947 + _SEEDSELECTORRESPONSE._serialized_start=1949 + _SEEDSELECTORRESPONSE._serialized_end=1988 + _SELECTOR._serialized_start=1991 + _SELECTOR._serialized_end=3005 # @@protoc_insertion_point(module_scope) diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.pyi b/modyn/selector/internal/grpc/generated/selector_pb2.pyi index 5b909fbf4..baf170338 100644 --- a/modyn/selector/internal/grpc/generated/selector_pb2.pyi +++ b/modyn/selector/internal/grpc/generated/selector_pb2.pyi @@ -8,6 +8,7 @@ import google.protobuf.descriptor import google.protobuf.internal.containers import google.protobuf.message import sys +import typing if sys.version_info >= (3, 8): import typing as typing_extensions @@ -41,6 +42,66 @@ class JsonString(google.protobuf.message.Message): global___JsonString = JsonString +@typing_extensions.final +class StrategyConfig(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + ZIP_FIELD_NUMBER: builtins.int + ZIP_ALGORITHM_FIELD_NUMBER: builtins.int + CONFIG_FIELD_NUMBER: builtins.int + name: builtins.str + zip: builtins.bool + zip_algorithm: builtins.str + @property + def config(self) -> global___JsonString: ... + def __init__( + self, + *, + name: builtins.str = ..., + zip: builtins.bool | None = ..., + zip_algorithm: builtins.str | None = ..., + config: global___JsonString | None = ..., + ) -> None: ... 
+ def HasField(self, field_name: typing_extensions.Literal["_config", b"_config", "_zip", b"_zip", "_zip_algorithm", b"_zip_algorithm", "config", b"config", "zip", b"zip", "zip_algorithm", b"zip_algorithm"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_config", b"_config", "_zip", b"_zip", "_zip_algorithm", b"_zip_algorithm", "config", b"config", "name", b"name", "zip", b"zip", "zip_algorithm", b"zip_algorithm"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_config", b"_config"]) -> typing_extensions.Literal["config"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_zip", b"_zip"]) -> typing_extensions.Literal["zip"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_zip_algorithm", b"_zip_algorithm"]) -> typing_extensions.Literal["zip_algorithm"] | None: ... + +global___StrategyConfig = StrategyConfig + +@typing_extensions.final +class ModelStoragePolicyInfo(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + FULL_MODEL_STRATEGY_CONFIG_FIELD_NUMBER: builtins.int + INCREMENTAL_MODEL_STRATEGY_CONFIG_FIELD_NUMBER: builtins.int + FULL_MODEL_INTERVAL_FIELD_NUMBER: builtins.int + @property + def full_model_strategy_config(self) -> global___StrategyConfig: ... + @property + def incremental_model_strategy_config(self) -> global___StrategyConfig: ... + full_model_interval: builtins.int + def __init__( + self, + *, + full_model_strategy_config: global___StrategyConfig | None = ..., + incremental_model_strategy_config: global___StrategyConfig | None = ..., + full_model_interval: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_full_model_interval", b"_full_model_interval", "_incremental_model_strategy_config", b"_incremental_model_strategy_config", "full_model_interval", b"full_model_interval", "full_model_strategy_config", b"full_model_strategy_config", "incremental_model_strategy_config", b"incremental_model_strategy_config"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_full_model_interval", b"_full_model_interval", "_incremental_model_strategy_config", b"_incremental_model_strategy_config", "full_model_interval", b"full_model_interval", "full_model_strategy_config", b"full_model_strategy_config", "incremental_model_strategy_config", b"incremental_model_strategy_config"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_full_model_interval", b"_full_model_interval"]) -> typing_extensions.Literal["full_model_interval"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_incremental_model_strategy_config", b"_incremental_model_strategy_config"]) -> typing_extensions.Literal["incremental_model_strategy_config"] | None: ... 
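Because `zip`, `zip_algorithm`, and `config` are proto3 `optional` fields, the generated messages track explicit presence, which is what the selector code further down relies on when it calls `HasField`. A minimal sketch of those semantics, assuming the generated `selector_pb2` module is importable:

```python
from modyn.selector.internal.grpc.generated.selector_pb2 import StrategyConfig

cfg = StrategyConfig(name="PyTorchFullModel")
assert not cfg.HasField("zip")  # optional scalar: unset until explicitly assigned
assert cfg.zip is False         # reading an unset field yields the proto3 default
cfg.zip = False                 # assigning even the default value marks it present
assert cfg.HasField("zip")
```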
+ +global___ModelStoragePolicyInfo = ModelStoragePolicyInfo + @typing_extensions.final class DataInformRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -111,17 +172,31 @@ class RegisterPipelineRequest(google.protobuf.message.Message): NUM_WORKERS_FIELD_NUMBER: builtins.int SELECTION_STRATEGY_FIELD_NUMBER: builtins.int + MODEL_CLASS_NAME_FIELD_NUMBER: builtins.int + MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int + AMP_FIELD_NUMBER: builtins.int + MODEL_STORAGE_POLICY_FIELD_NUMBER: builtins.int num_workers: builtins.int @property def selection_strategy(self) -> global___JsonString: ... + model_class_name: builtins.str + @property + def model_configuration(self) -> global___JsonString: ... + amp: builtins.bool + @property + def model_storage_policy(self) -> global___ModelStoragePolicyInfo: ... def __init__( self, *, num_workers: builtins.int = ..., selection_strategy: global___JsonString | None = ..., + model_class_name: builtins.str = ..., + model_configuration: global___JsonString | None = ..., + amp: builtins.bool = ..., + model_storage_policy: global___ModelStoragePolicyInfo | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["selection_strategy", b"selection_strategy"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "model_storage_policy", b"model_storage_policy", "selection_strategy", b"selection_strategy"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["amp", b"amp", "model_class_name", b"model_class_name", "model_configuration", b"model_configuration", "model_storage_policy", b"model_storage_policy", "num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... 
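With the extended stubs, a single registration request now carries everything the selector has to persist about the model. A hedged sketch of such a request; the concrete strategy and model values are illustrative rather than taken from this PR:

```python
import json

from modyn.selector.internal.grpc.generated.selector_pb2 import (
    JsonString,
    ModelStoragePolicyInfo,
    RegisterPipelineRequest,
    StrategyConfig,
)

request = RegisterPipelineRequest(
    num_workers=2,
    selection_strategy=JsonString(value=json.dumps({"name": "NewDataStrategy"})),
    model_class_name="ResNet18",
    model_configuration=JsonString(value=json.dumps({"num_classes": 10})),
    amp=False,
    model_storage_policy=ModelStoragePolicyInfo(
        full_model_strategy_config=StrategyConfig(name="PyTorchFullModel"),
        # the incremental strategy and interval stay unset unless the pipeline defines them
    ),
)
```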
global___RegisterPipelineRequest = RegisterPipelineRequest diff --git a/modyn/selector/internal/grpc/selector_grpc_servicer.py b/modyn/selector/internal/grpc/selector_grpc_servicer.py index 0db6cf8f6..1e3f6988c 100644 --- a/modyn/selector/internal/grpc/selector_grpc_servicer.py +++ b/modyn/selector/internal/grpc/selector_grpc_servicer.py @@ -2,12 +2,12 @@ import logging import os import threading -from typing import Iterable +from typing import Iterable, Optional import grpc +from modyn.metadata_database.utils import ModelStorageStrategyConfig # pylint: disable=no-name-in-module -from modyn.selector.internal.grpc.generated.selector_pb2 import JsonString # noqa: E402, E501 from modyn.selector.internal.grpc.generated.selector_pb2 import ( AvailableLabelsResponse, DataInformRequest, @@ -18,6 +18,7 @@ GetSamplesRequest, GetSelectionStrategyRequest, GetStatusBarScaleRequest, + JsonString, NumberOfPartitionsResponse, NumberOfSamplesResponse, PipelineResponse, @@ -47,9 +48,40 @@ def __init__(self, selector_manager: SelectorManager, sample_batch_size: int): self.selector_manager = selector_manager self._sample_batch_size = sample_batch_size + # TODO(#302): Remove this when reworking pipeline registration def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.ServicerContext) -> PipelineResponse: logger.info(f"Registering pipeline with request - {str(request)}") - pipeline_id = self.selector_manager.register_pipeline(request.num_workers, request.selection_strategy.value) + + full_model_strategy = ModelStorageStrategyConfig.from_config( + request.model_storage_policy.full_model_strategy_config + ) + + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None + if ( + request.model_storage_policy.HasField("incremental_model_strategy_config") + and request.model_storage_policy.incremental_model_strategy_config is not None + ): + incremental_model_strategy = ModelStorageStrategyConfig.from_config( + request.model_storage_policy.incremental_model_strategy_config + ) + + full_model_interval: Optional[int] = None + if ( + request.model_storage_policy.HasField("full_model_interval") + and request.model_storage_policy.full_model_interval is not None + ): + full_model_interval = request.model_storage_policy.full_model_interval + + pipeline_id = self.selector_manager.register_pipeline( + request.num_workers, + request.selection_strategy.value, + request.model_class_name, + request.model_configuration.value, + request.amp, + full_model_strategy, + incremental_model_strategy, + full_model_interval, + ) return PipelineResponse(pipeline_id=pipeline_id) def get_sample_keys_and_weights( # pylint: disable-next=unused-argument diff --git a/modyn/selector/internal/selector_manager.py b/modyn/selector/internal/selector_manager.py index c3f2fed9f..7c2df37ad 100644 --- a/modyn/selector/internal/selector_manager.py +++ b/modyn/selector/internal/selector_manager.py @@ -11,6 +11,7 @@ from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models.pipelines import Pipeline +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.selector_strategies.abstract_selection_strategy import AbstractSelectionStrategy from modyn.selector.selector import Selector from modyn.utils.utils import dynamic_module_import, is_directory_writable @@ -95,7 +96,17 @@ def _instantiate_selector(self, pipeline_id: int, num_workers: int, selection_st selector = Selector(selection_strategy, pipeline_id, 
num_workers, self._modyn_config, self._selector_cache_size) self._selectors[pipeline_id] = selector - def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: + def register_pipeline( + self, + num_workers: int, + selection_strategy: str, + model_class_name: str, + model_config: str, + amp: bool, + full_model_strategy: ModelStorageStrategyConfig, + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None, + full_model_interval: Optional[int] = None, + ) -> int: """ Registers a new pipeline at the Selector. Returns: @@ -108,7 +119,16 @@ def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: with self._next_pipeline_lock: with MetadataDatabaseConnection(self._modyn_config) as database: - pipeline_id = database.register_pipeline(num_workers, selection_strategy) + pipeline_id = database.register_pipeline( + num_workers, + model_class_name, + model_config, + amp, + selection_strategy, + full_model_strategy, + incremental_model_strategy, + full_model_interval, + ) self._selector_locks[pipeline_id] = self._prepared_locks[pipeline_id % len(self._prepared_locks)] self._instantiate_selector(pipeline_id, num_workers, selection_strategy) diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py index 45c0041d9..2b61bbe3f 100644 --- a/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py @@ -1,5 +1,5 @@ from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy -from modyn.utils.utils import instantiate_class +from modyn.utils import instantiate_class def instantiate_downsampler(config: dict, maximum_keys_in_memory: int) -> AbstractDownsamplingStrategy: diff --git a/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py b/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py index 7c7a0bc87..8f6e18a90 100644 --- a/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py +++ b/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py @@ -1,5 +1,5 @@ from modyn.selector.internal.selector_strategies.presampling_strategies import AbstractPresamplingStrategy -from modyn.utils.utils import instantiate_class +from modyn.utils import instantiate_class def instantiate_presampler(config: dict, modyn_config: dict, pipeline_id: int) -> AbstractPresamplingStrategy: diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index 8d2136ad8..353d0729f 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -29,10 +29,12 @@ ) from modyn.selector.internal.grpc.generated.selector_pb2 import JsonString as SelectorJsonString from modyn.selector.internal.grpc.generated.selector_pb2 import ( + ModelStoragePolicyInfo, NumberOfSamplesResponse, RegisterPipelineRequest, SeedSelectorRequest, StatusBarScaleResponse, + StrategyConfig, TriggerResponse, ) from modyn.selector.internal.grpc.generated.selector_pb2_grpc import SelectorStub @@ -200,18 +202,52 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: if not self.connected_to_selector: raise ConnectionError("Tried to register pipeline at selector, but no connection was made.") + if "config" in pipeline_config["model"]: + model_config = 
json.dumps(pipeline_config["model"]["config"]) + else: + model_config = "{}" + + model_storage_config = pipeline_config["model_storage"] + incremental_model_strategy: Optional[StrategyConfig] = None + full_model_interval: Optional[int] = None + if "incremental_model_strategy" in model_storage_config: + incremental_strategy = model_storage_config["incremental_model_strategy"] + incremental_model_strategy = self.get_model_strategy(incremental_strategy) + full_model_interval = ( + incremental_strategy["full_model_interval"] if "full_model_interval" in incremental_strategy else None + ) + pipeline_id = self.selector.register_pipeline( RegisterPipelineRequest( num_workers=pipeline_config["training"]["dataloader_workers"], selection_strategy=SelectorJsonString( value=json.dumps(pipeline_config["training"]["selection_strategy"]) ), + model_class_name=pipeline_config["model"]["id"], + model_configuration=SelectorJsonString(value=model_config), + amp=pipeline_config["training"]["amp"] if "amp" in pipeline_config["training"] else False, + model_storage_policy=ModelStoragePolicyInfo( + full_model_strategy_config=self.get_model_strategy(model_storage_config["full_model_strategy"]), + incremental_model_strategy_config=incremental_model_strategy, + full_model_interval=full_model_interval, + ), ) ).pipeline_id logger.info(f"Registered pipeline {pipeline_config['pipeline']['name']} at selector with ID {pipeline_id}") return pipeline_id + @staticmethod + def get_model_strategy(strategy_config: dict) -> StrategyConfig: + return StrategyConfig( + name=strategy_config["name"], + zip=strategy_config["zip"] if "zip" in strategy_config else None, + zip_algorithm=strategy_config["zip_algorithm"] if "zip_algorithm" in strategy_config else None, + config=SelectorJsonString(value=json.dumps(strategy_config["config"])) + if "config" in strategy_config + else None, + ) + # pylint: disable-next=unused-argument def unregister_pipeline_at_selector(self, pipeline_id: int) -> None: # # TODO(#64,#124): Implement. @@ -268,11 +304,6 @@ def start_training( if not self.connected_to_trainer_server: raise ConnectionError("Tried to start training at trainer server, but not there is no gRPC connection.") - if "config" in pipeline_config["model"]: - model_config = json.dumps(pipeline_config["model"]["config"]) - else: - model_config = "{}" - optimizers_config = {} for optimizer in pipeline_config["training"]["optimizers"]: optimizer_config = {} @@ -353,8 +384,6 @@ def start_training( else: checkpoint_info = CheckpointInfo(checkpoint_interval=0, checkpoint_path="") - amp = pipeline_config["training"]["amp"] if "amp" in pipeline_config["training"] else False - if "grad_scaler_config" in pipeline_config["training"]: grad_scaler_config = pipeline_config["training"]["grad_scaler_config"] else: @@ -364,9 +393,6 @@ def start_training( "pipeline_id": pipeline_id, "trigger_id": trigger_id, "device": pipeline_config["training"]["device"], - "amp": amp, - "model_id": pipeline_config["model"]["id"], - "model_configuration": TrainerServerJsonString(value=model_config), "use_pretrained_model": previous_model_id is not None, "pretrained_model_id": previous_model_id or -1, "load_optimizer_state": False, # TODO(#137): Think about this. 
@@ -506,27 +532,18 @@ def seed_selector(self, seed: int) -> None: assert success, "Something went wrong while seeding the selector" - def start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict[int, EvaluationStatusTracker]: + def start_evaluation(self, model_id: int, pipeline_config: dict) -> dict[int, EvaluationStatusTracker]: if not self.connected_to_evaluator: raise ConnectionError("Tried to start evaluation at evaluator, but there is no gRPC connection.") - model_id = pipeline_config["model"]["id"] - if "config" in pipeline_config["model"]: - model_config = json.dumps(pipeline_config["model"]["config"]) - else: - model_config = "{}" - device = pipeline_config["evaluation"]["device"] - amp = pipeline_config["evaluation"]["amp"] if "amp" in pipeline_config["evaluation"] else False evaluations: dict[int, EvaluationStatusTracker] = {} for dataset in pipeline_config["evaluation"]["datasets"]: dataset_id = dataset["dataset_id"] - req = GRPCHandler._prepare_evaluation_request( - dataset, model_id, model_config, trained_model_id, device, amp - ) + req = GRPCHandler._prepare_evaluation_request(dataset, model_id, device) response: EvaluateModelResponse = self.evaluator.evaluate_model(req) if not response.evaluation_started: @@ -539,9 +556,7 @@ def start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict return evaluations @staticmethod - def _prepare_evaluation_request( - dataset_config: dict, model_id: str, model_config: str, trained_model_id: int, device: str, amp: bool - ) -> EvaluateModelRequest: + def _prepare_evaluation_request(dataset_config: dict, model_id: int, device: str) -> EvaluateModelRequest: dataset_id = dataset_config["dataset_id"] if "transformations" in dataset_config: @@ -579,22 +594,17 @@ def _prepare_evaluation_request( ) start_evaluation_kwargs = { - "trained_model_id": trained_model_id, + "model_id": model_id, "dataset_info": DatasetInfo(dataset_id=dataset_id, num_dataloaders=dataloader_workers), "device": device, - "amp": amp, "batch_size": batch_size, "metrics": metrics, - "model_id": model_id, - "model_configuration": EvaluatorJsonString(value=model_config), "transform_list": transform_list, "bytes_parser": EvaluatorPythonString(value=bytes_parser_function), "label_transformer": EvaluatorPythonString(value=label_transformer), } - cleaned_kwargs = {k: v for k, v in start_evaluation_kwargs.items() if v is not None} - - return EvaluateModelRequest(**cleaned_kwargs) + return EvaluateModelRequest(**start_evaluation_kwargs) def wait_for_evaluation_completion(self, training_id: int, evaluations: dict[int, EvaluationStatusTracker]) -> None: if not self.connected_to_evaluator: diff --git a/modyn/supervisor/supervisor.py b/modyn/supervisor/supervisor.py index 30ba489de..1760ffffa 100644 --- a/modyn/supervisor/supervisor.py +++ b/modyn/supervisor/supervisor.py @@ -302,7 +302,7 @@ def wait_for_new_data(self, start_timestamp: int) -> None: self.status_bar.update(demo="Fetching new data") trigger_occured = False largest_keys = set() - for new_data in self.grpc.get_new_data_since(dataset_id, last_timestamp): + for new_data, _ in self.grpc.get_new_data_since(dataset_id, last_timestamp): # Since get_new_data_since is inclusive, we need to filter out the keys # we have already processed in the previous get_new_data_since request new_data = [ @@ -453,17 +453,17 @@ def _run_training(self, trigger_id: int) -> None: # We store the trained model for evaluation in any case. 
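After this simplification the evaluator resolves model class, configuration, and AMP itself via `model_id`, so a request only carries evaluation-specific settings. A minimal sketch, assuming the generated module sits at its usual path and using placeholder dataset and parser values:

```python
from modyn.evaluator.internal.grpc.generated.evaluator_pb2 import (
    DatasetInfo,
    EvaluateModelRequest,
    PythonString,
)

request = EvaluateModelRequest(
    model_id=42,  # placeholder: must reference a TrainedModel row in the metadata database
    dataset_info=DatasetInfo(dataset_id="mnist", num_dataloaders=2),
    device="cpu",
    batch_size=64,
    # metrics omitted for brevity; the repeated field defaults to empty
    transform_list=[],
    bytes_parser=PythonString(value="def bytes_parser_function(data: bytes) -> bytes:\n    return data"),
    label_transformer=PythonString(value=""),
)
```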
self._sw.start("store_trained_model", overwrite=True) - trained_model_id = self.grpc.store_trained_model(self.current_training_id) + model_id = self.grpc.store_trained_model(self.current_training_id) self.pipeline_log["supervisor"]["triggers"][trigger_id]["store_trained_model_time"] = self._sw.stop() # Only if the pipeline actually wants to continue the training on it, we set previous model. if self.pipeline_config["training"]["use_previous_model"]: - self.previous_model_id = trained_model_id + self.previous_model_id = model_id # Start evaluation if "evaluation" in self.pipeline_config: # TODO(#300) Add evaluator to pipeline log - evaluations = self.grpc.start_evaluation(trained_model_id, self.pipeline_config) + evaluations = self.grpc.start_evaluation(model_id, self.pipeline_config) self.grpc.wait_for_evaluation_completion(self.current_training_id, evaluations) writer_names: set[str] = set(self.pipeline_config["evaluation"]["result_writers"]) diff --git a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py index 53e8fe138..fb38e07f3 100644 --- a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py +++ b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py @@ -1,9 +1,9 @@ # pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter import json import multiprocessing as mp +import os import pathlib import platform -import shutil import tempfile from time import sleep from unittest import mock @@ -23,18 +23,54 @@ ) from modyn.evaluator.internal.metrics import Accuracy, F1Score from modyn.evaluator.internal.utils import EvaluationInfo, EvaluationProcessInfo, EvaluatorMessages +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest, FetchModelResponse from modyn.storage.internal.grpc.generated.storage_pb2 import GetDatasetSizeRequest, GetDatasetSizeResponse +DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_evaluator.database" + def get_modyn_config(): return { "evaluator": {"hostname": "localhost", "port": "50000"}, "model_storage": {"hostname": "localhost", "port": "50051", "ftp_port": "5223"}, "storage": {"hostname": "storage", "port": "50052"}, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, } +def setup(): + DATABASE.unlink(True) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.create_tables() + + database.register_pipeline( + 1, + "ResNet18", + json.dumps({}), + True, + "{}", + ModelStorageStrategyConfig(name="PyTorchFullModel"), + incremental_model_strategy=None, + full_model_interval=None, + ) + database.add_trained_model(1, 10, "trained_model.modyn", "trained_model.metadata") + database.add_trained_model(1, 11, "trained_model2.modyn", "trained_model.metadata") + + +def teardown(): + DATABASE.unlink() + + class DummyModelWrapper: def __init__(self, model_configuration=None) -> None: self.model = None @@ -43,8 +79,8 @@ def __init__(self, model_configuration=None) -> None: class DummyModelStorageStub: # pylint: disable-next=invalid-name def FetchModel(self, request: FetchModelRequest) -> FetchModelResponse: - if request.model_id <= 10: - return FetchModelResponse(success=True, model_path="trained_model.modyn") + if 
request.model_id == 1: + return FetchModelResponse(success=True, model_path="trained_model.modyn", checksum=bytes(5)) return FetchModelResponse(success=False) @@ -86,12 +122,11 @@ def get_mock_evaluation_transformer(): ) -def get_evaluate_model_request(valid_model: bool): +def get_evaluate_model_request(): return EvaluateModelRequest( - trained_model_id=5, + model_id=1, dataset_info=DatasetInfo(dataset_id="MNIST", num_dataloaders=1), device="cpu", - amp=False, batch_size=4, metrics=[ MetricConfiguration( @@ -100,19 +135,20 @@ def get_evaluate_model_request(valid_model: bool): evaluation_transformer=PythonString(value=""), ) ], - model_id="ResNet18" if valid_model else "unknown", - model_configuration=JsonString(value=json.dumps({})), transform_list=[], bytes_parser=PythonString(value=get_mock_bytes_parser()), label_transformer=PythonString(value=""), ) -def get_evaluation_info(evaluation_id, valid_model: bool, model_path: pathlib.Path, config: dict): +def get_evaluation_info(evaluation_id, model_path: pathlib.Path, config: dict): storage_address = f"{config['storage']['hostname']}:{config['storage']['port']}" return EvaluationInfo( - request=get_evaluate_model_request(valid_model), + request=get_evaluate_model_request(), evaluation_id=evaluation_id, + model_class_name="ResNet18", + amp=False, + model_config="{}", storage_address=storage_address, metrics=[Accuracy("", {}), F1Score("", {"num_classes": 2})], model_path=model_path, @@ -133,51 +169,68 @@ def test_init(test_connect_to_model_storage, test_connect_to_storage): @patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) @patch.object(EvaluatorGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) -def test_evaluate_model_invalid(test_connect_to_model_storage, test_connect_to_storage): +@patch("modyn.evaluator.internal.grpc.evaluator_grpc_servicer.hasattr", return_value=False) +def test_evaluate_model_invalid_model_id(test_has_attribute, test_connect_to_model_storage, test_connect_to_storage): with tempfile.TemporaryDirectory() as modyn_temp: evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) - response = evaluator.evaluate_model(get_evaluate_model_request(False), None) + response = evaluator.evaluate_model(get_evaluate_model_request(), None) assert not response.evaluation_started assert not evaluator._evaluation_dict assert evaluator._next_evaluation_id == 0 - req = get_evaluate_model_request(True) - req.trained_model_id = 15 + +@patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) +@patch.object(EvaluatorGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) +def test_evaluate_model_invalid(test_connect_to_model_storage, test_connect_to_storage): + with tempfile.TemporaryDirectory() as modyn_temp: + evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) + req = get_evaluate_model_request() + req.model_id = 15 resp = evaluator.evaluate_model(req, None) assert not resp.evaluation_started - req = get_evaluate_model_request(True) + req = get_evaluate_model_request() req.dataset_info.dataset_id = "unknown" resp = evaluator.evaluate_model(req, None) assert not resp.evaluation_started assert evaluator._next_evaluation_id == 0 + req = get_evaluate_model_request() + req.model_id = 2 + resp = evaluator.evaluate_model(req, None) + assert not resp.evaluation_started + -@patch("modyn.evaluator.internal.grpc.evaluator_grpc_servicer.download_file") +@patch( + 
"modyn.evaluator.internal.grpc.evaluator_grpc_servicer.download_trained_model", + return_value=pathlib.Path("downloaded_model.modyn"), +) @patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) @patch.object(EvaluatorGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) -def test_evaluate_model_valid(test_connect_to_model_storage, test_connect_to_storage, download_file_mock: MagicMock): +def test_evaluate_model_valid(test_connect_to_model_storage, test_connect_to_storage, download_model_mock: MagicMock): with tempfile.TemporaryDirectory() as modyn_temp: evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) - with open(pathlib.Path(modyn_temp) / "trained_model.modyn", "wb") as file: - file.write(b"Our trained model!") + mock_start = mock.Mock() with patch("multiprocessing.Process.start", mock_start): - resp: EvaluateModelResponse = evaluator.evaluate_model(get_evaluate_model_request(True), None) + resp: EvaluateModelResponse = evaluator.evaluate_model(get_evaluate_model_request(), None) assert 0 in evaluator._evaluation_process_dict assert evaluator._next_evaluation_id == 1 - download_file_mock.assert_called_once() - kwargs = download_file_mock.call_args.kwargs - remote_file_path = kwargs["remote_file_path"] - local_file_path = kwargs["local_file_path"] - - shutil.copyfile(pathlib.Path(modyn_temp) / remote_file_path, local_file_path) + download_model_mock.assert_called_once() + kwargs = download_model_mock.call_args.kwargs + remote_file_path = kwargs["remote_path"] + base_directory = kwargs["base_directory"] + identifier = kwargs["identifier"] - with open(evaluator._evaluation_dict[resp.evaluation_id].model_path, "rb") as file: - assert file.read().decode("utf-8") == "Our trained model!" 
+ assert str(remote_file_path) == "trained_model.modyn" + assert base_directory == evaluator._base_dir + assert identifier == 0 + assert resp.evaluation_started + assert resp.evaluation_id == identifier + assert str(evaluator._evaluation_dict[resp.evaluation_id].model_path) == "downloaded_model.modyn" @patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) @@ -396,7 +449,7 @@ def test_get_evaluation_result_missing_metric(test_is_alive, test_connect_to_mod evaluation_process_info = get_evaluation_process_info() evaluator._evaluation_process_dict[3] = evaluation_process_info config = get_modyn_config() - evaluator._evaluation_dict[3] = get_evaluation_info(3, True, pathlib.Path("trained.model"), config) + evaluator._evaluation_dict[3] = get_evaluation_info(3, pathlib.Path("trained.model"), config) response = evaluator.get_evaluation_result(EvaluationResultRequest(evaluation_id=3), None) assert response.valid assert len(response.evaluation_data) == 0 @@ -413,7 +466,7 @@ def test_get_evaluation_result( with tempfile.TemporaryDirectory() as temp: config = get_modyn_config() evaluator = EvaluatorGRPCServicer(config, pathlib.Path(temp)) - evaluator._evaluation_dict[1] = get_evaluation_info(1, True, pathlib.Path(temp) / "trained_model.modyn", config) + evaluator._evaluation_dict[1] = get_evaluation_info(1, pathlib.Path(temp) / "trained_model.modyn", config) assert len(evaluator._evaluation_dict[1].metrics) == 2 assert isinstance(evaluator._evaluation_dict[1].metrics[0], Accuracy) diff --git a/modyn/tests/evaluator/internal/test_pytorch_evaluator.py b/modyn/tests/evaluator/internal/test_pytorch_evaluator.py index b0b3b63da..d72f46399 100644 --- a/modyn/tests/evaluator/internal/test_pytorch_evaluator.py +++ b/modyn/tests/evaluator/internal/test_pytorch_evaluator.py @@ -92,10 +92,9 @@ def get_evaluation_info( ): model_dynamic_module_patch.return_value = MockModule() request = EvaluateModelRequest( - trained_model_id=1, + model_id=1, dataset_info=DatasetInfo(dataset_id="MNIST", num_dataloaders=1), device="cpu", - amp=False, batch_size=4, metrics=[ MetricConfiguration( @@ -104,14 +103,12 @@ def get_evaluation_info( evaluation_transformer=PythonString(value=get_mock_accuracy_transformer()), ) ], - model_id="model", - model_configuration=JsonString(value=json.dumps({})), transform_list=[], bytes_parser=PythonString(value=get_mock_bytes_parser()), label_transformer=PythonString(value=get_mock_label_transformer() if label_transformer else ""), ) - return EvaluationInfo(request, evaluation_id, storage_address, metrics, trained_model_path) + return EvaluationInfo(request, evaluation_id, "model", "{}", False, storage_address, metrics, trained_model_path) @patch.object(StorageStub, "__init__", noop_constructor_mock) diff --git a/modyn/tests/metadata_database/models/test_pipelines.py b/modyn/tests/metadata_database/models/test_pipelines.py index cd78125e3..dfa8a5e6c 100644 --- a/modyn/tests/metadata_database/models/test_pipelines.py +++ b/modyn/tests/metadata_database/models/test_pipelines.py @@ -1,4 +1,6 @@ # pylint: disable=redefined-outer-name +import json + import pytest from modyn.metadata_database.models import Pipeline from sqlalchemy import create_engine @@ -19,29 +21,66 @@ def session(): def test_add_pipeline(session): - pipeline = Pipeline(num_workers=10, selection_strategy="{}") + pipeline = Pipeline( + num_workers=10, + model_class_name="ResNet18", + model_config=json.dumps({"num_classes": 10}), + amp=True, + selection_strategy="{}", + 
full_model_strategy_name="PyTorchFullModel", + ) session.add(pipeline) session.commit() - assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() is not None - assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 10 + extracted_pipeline: Pipeline = session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() + + assert extracted_pipeline is not None + assert extracted_pipeline.num_workers == 10 + assert extracted_pipeline.model_class_name == "ResNet18" + assert json.loads(extracted_pipeline.model_config)["num_classes"] == 10 + assert extracted_pipeline.amp + assert extracted_pipeline.full_model_strategy_name == "PyTorchFullModel" + assert not extracted_pipeline.full_model_strategy_zip + assert extracted_pipeline.inc_model_strategy_name is None + assert extracted_pipeline.full_model_strategy_config is None def test_update_pipeline(session): - pipeline = Pipeline(num_workers=10, selection_strategy="{}") + pipeline = Pipeline( + num_workers=10, + model_class_name="ResNet18", + model_config="{}", + amp=True, + selection_strategy="{}", + full_model_strategy_name="PyTorchFullModel", + ) session.add(pipeline) session.commit() pipeline.num_workers = 20 + pipeline.amp = False session.commit() assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() is not None assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 20 assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().selection_strategy == "{}" + assert not session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().amp + + pipeline.model_class_name = "test_model" + session.commit() + + assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().model_class_name == "test_model" def test_delete_pipeline(session): - pipeline = Pipeline(num_workers=10, selection_strategy="{}") + pipeline = Pipeline( + num_workers=10, + model_class_name="ResNet18", + model_config="{}", + amp=False, + selection_strategy="{}", + full_model_strategy_name="PyTorchFullModel", + ) session.add(pipeline) session.commit() diff --git a/modyn/tests/metadata_database/models/test_trained_models.py b/modyn/tests/metadata_database/models/test_trained_models.py index 979485e9c..b34465b98 100644 --- a/modyn/tests/metadata_database/models/test_trained_models.py +++ b/modyn/tests/metadata_database/models/test_trained_models.py @@ -20,7 +20,7 @@ def session(): def test_add_trained_model(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() @@ -29,7 +29,7 @@ def test_add_trained_model(session): def test_get_trained_model(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() @@ -38,11 +38,12 @@ def test_get_trained_model(session): assert fetched_valid.model_id == 1 assert fetched_valid.model_path == "test_path" + assert fetched_valid.metadata_path == "metadata" assert fetched_invalid is None def test_delete_trained_model(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() @@ -50,6 +51,7 @@ def test_delete_trained_model(session): assert 
fetched.model_id == 1 assert fetched.model_path == "test_path" + assert fetched.metadata_path == "metadata" session.query(TrainedModel).filter(TrainedModel.model_id == 1).delete(synchronize_session="fetch") session.commit() @@ -60,7 +62,7 @@ def test_delete_trained_model(session): def test_string_repr(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py b/modyn/tests/metadata_database/test_metadata_database_connection.py index 51accd949..a56cab0cf 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -1,5 +1,8 @@ +import json + from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import TrainedModel, Trigger +from modyn.metadata_database.utils import ModelStorageStrategyConfig def get_minimal_modyn_config() -> dict: @@ -24,16 +27,22 @@ def test_database_connection(): def test_register_pipeline(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1, "{}") + pipeline_id = database.register_pipeline( + 1, "ResNet18", "{}", True, "{}", ModelStorageStrategyConfig(name="PyTorchFullModel") + ) assert pipeline_id == 1 - pipeline_id = database.register_pipeline(1, "{}") + pipeline_id = database.register_pipeline( + 1, "ResNet18", "{}", False, "{}", ModelStorageStrategyConfig(name="PyTorchFullModel") + ) assert pipeline_id == 2 def test_add_trained_model(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1, "{}") + pipeline_id = database.register_pipeline( + 1, "ResNet18", "{}", True, "{}", ModelStorageStrategyConfig(name="PyTorchFullModel") + ) trigger = Trigger(pipeline_id=pipeline_id, trigger_id=5) database.session.add(trigger) @@ -42,10 +51,42 @@ def test_add_trained_model(): assert pipeline_id == 1 and trigger_id == 5 - model_id = database.add_trained_model(pipeline_id, trigger_id, "test_path.modyn") + model_id = database.add_trained_model(pipeline_id, trigger_id, "test_path.modyn", "test_path.metadata") + + model_parent: TrainedModel = database.session.get(TrainedModel, model_id) + + assert model_parent.model_id == 1 + assert model_parent.model_path == "test_path.modyn" + assert model_parent.metadata_path == "test_path.metadata" + assert model_parent.pipeline_id == 1 and model_parent.trigger_id == 5 + assert model_parent.parent_model is None + + model_id = database.add_trained_model( + pipeline_id, 6, "test_path.modyn", "test_path.metadata", parent_model=model_parent.model_id + ) + model_child: TrainedModel = database.session.get(TrainedModel, model_id) + + assert model_child.parent_model == model_parent.model_id + assert len(model_parent.children) == 1 + assert model_parent.children[0] == model_child + + +def test_get_model_configuration(): + with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: + database.create_tables() + pipeline_id = database.register_pipeline( + 1, + "ResNet18", + json.dumps({"num_classes": 10}), + True, + "{}", + ModelStorageStrategyConfig(name="PyTorchFullModel"), + ) + + assert pipeline_id == 1 - model: TrainedModel = 
database.session.get(TrainedModel, model_id) + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) - assert model.model_id == 1 - assert model.model_path == "test_path.modyn" - assert model.pipeline_id == 1 and model.trigger_id == 5 + assert model_class_name == "ResNet18" + assert json.loads(model_config) == {"num_classes": 10} + assert amp diff --git a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py index 7f4b3e1ca..47a7b0276 100644 --- a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py +++ b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py @@ -2,6 +2,7 @@ import pathlib from unittest.mock import patch +from modyn.model_storage.internal import ModelStorageManager from modyn.model_storage.internal.grpc.grpc_server import GRPCServer @@ -10,12 +11,16 @@ def get_modyn_config(): def test_init(): - grpc_server = GRPCServer(get_modyn_config(), pathlib.Path.cwd() / "temp_dir") + grpc_server = GRPCServer(get_modyn_config(), pathlib.Path.cwd() / "storage_dir", pathlib.Path.cwd() / "ftp_dir") assert grpc_server.modyn_config == get_modyn_config() - assert str(grpc_server.storage_dir) == str(pathlib.Path.cwd() / "temp_dir") + assert str(grpc_server.storage_dir) == str(pathlib.Path.cwd() / "storage_dir") + assert str(grpc_server.ftp_directory) == str(pathlib.Path.cwd() / "ftp_dir") @patch("modyn.model_storage.internal.grpc.grpc_server.add_ModelStorageServicer_to_server", return_value=None) -def test_enter(mock_add_model_storage_servicer_to_server): - with GRPCServer(get_modyn_config(), pathlib.Path.cwd() / "temp_dir") as grpc_server: +@patch.object(ModelStorageManager, "__init__", return_value=None) +def test_enter(mock_init_model_storage_manager, mock_add_model_storage_servicer_to_server): + with GRPCServer( + get_modyn_config(), pathlib.Path.cwd() / "storage_dir", pathlib.Path.cwd() / "ftp_dir" + ) as grpc_server: assert grpc_server is not None diff --git a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py index 7a84c0533..09637d71e 100644 --- a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py +++ b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py @@ -1,11 +1,11 @@ -import os +# pylint: disable=unused-argument import pathlib import shutil import tempfile from unittest.mock import MagicMock, patch -from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.models import TrainedModel, Trigger +import torch +from modyn.model_storage.internal import ModelStorageManager # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( @@ -17,52 +17,28 @@ RegisterModelResponse, ) from modyn.model_storage.internal.grpc.model_storage_grpc_servicer import ModelStorageGRPCServicer - -DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_model_storage.database" +from modyn.utils import calculate_checksum def get_modyn_config(): return { "model_storage": {"port": "50051", "ftp_port": "5223"}, "trainer_server": {"hostname": "localhost", "ftp_port": "5222"}, - "metadata_database": { - "drivername": "sqlite", - "username": "", - "password": "", - "host": "", - "port": 0, - "database": f"{DATABASE}", - }, } -def setup(): - if os.path.exists(DATABASE): - 
os.remove(DATABASE) - - with MetadataDatabaseConnection(get_modyn_config()) as database: - database.create_tables() - - pipeline_id = database.register_pipeline(1, "{}") - trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) - - database.session.add(trigger) - database.session.commit() - - pipeline2 = database.register_pipeline(4, "{}") - trigger2 = Trigger(trigger_id=50, pipeline_id=pipeline2) - - database.session.add(trigger2) - database.session.commit() - - -def teardown(): - os.remove(DATABASE) - - -@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.download_file") +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.download_file", return_value=True) @patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.current_time_millis", return_value=100) -def test_register_model(current_time_millis, download_file_mock: MagicMock): # pylint: disable=unused-argument +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "store_model", return_value=15) +@patch("os.remove") +def test_register_model( + os_remove_mock: MagicMock, + store_model_mock: MagicMock, + init_manager_mock, + current_time_millis, + download_file_mock: MagicMock, +): config = get_modyn_config() with tempfile.TemporaryDirectory() as storage_dir: storage_path = pathlib.Path(storage_dir) @@ -70,7 +46,7 @@ def test_register_model(current_time_millis, download_file_mock: MagicMock): # with open(storage_path / "test.txt", "wb") as file: file.write(b"Our test model") - servicer = ModelStorageGRPCServicer(config, storage_path) + servicer = ModelStorageGRPCServicer(config, storage_path, storage_path) assert servicer is not None req = RegisterModelRequest( @@ -79,75 +55,125 @@ def test_register_model(current_time_millis, download_file_mock: MagicMock): # hostname=config["trainer_server"]["hostname"], port=int(config["trainer_server"]["ftp_port"]), model_path="test.txt", + checksum=calculate_checksum(storage_path / "test.txt"), ) resp: RegisterModelResponse = servicer.RegisterModel(req, None) download_file_mock.assert_called_once() kwargs = download_file_mock.call_args.kwargs + remote_file_path = kwargs["remote_file_path"] local_file_path = kwargs["local_file_path"] shutil.copyfile(storage_path / remote_file_path, local_file_path) assert resp.success + assert resp.model_id == 15 - # download file under path {current_time_millis}_{pipeline_id}_{trigger_id}.modyn - with open(storage_path / f"100_{resp.model_id}_10.modyn", "rb") as file: + # download file under path {current_time_millis}_{pipeline_id}_{trigger_id}.modyn + with open(storage_path / "100_1_10.modyn", "rb") as file: assert file.read().decode("utf-8") == "Our test model" + assert calculate_checksum(storage_path / "100_1_10.modyn") == kwargs["checksum"] + os_remove_mock.assert_called_with(storage_path / "100_1_10.modyn") + -def test_fetch_model(): +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.download_file", return_value=False) +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.current_time_millis", return_value=100) +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "store_model") +def test_register_model_invalid( + store_model_mock: MagicMock, init_manager_mock, current_time_millis, download_file_mock: MagicMock +): + config = get_modyn_config() + storage_path = pathlib.Path("storage_dir") + servicer = ModelStorageGRPCServicer(config, storage_path, storage_path) + + assert servicer
is not None + req = RegisterModelRequest( + pipeline_id=1, + trigger_id=10, + hostname=config["trainer_server"]["hostname"], + port=int(config["trainer_server"]["ftp_port"]), + model_path="test.txt", + checksum=bytes([7, 1, 0]), + ) + + resp: RegisterModelResponse = servicer.RegisterModel(req, None) + download_file_mock.assert_called_once() + + assert not resp.success + store_model_mock.assert_not_called() + + +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.current_time_millis", return_value=100) +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "load_model", return_value={"model": {"conv_1": 1}, "metadata": True}) +def test_fetch_model(load_model_mock: MagicMock, init_manager_mock, current_time_millis): config = get_modyn_config() with tempfile.TemporaryDirectory() as storage_dir: storage_path = pathlib.Path(storage_dir) - servicer = ModelStorageGRPCServicer(config, storage_path) + servicer = ModelStorageGRPCServicer(config, storage_path, storage_path) assert servicer is not None - with MetadataDatabaseConnection(config) as database: - model_id = database.add_trained_model(2, 50, "test_model.modyn") - - req = FetchModelRequest(model_id=model_id) + req = FetchModelRequest(model_id=10, load_metadata=True) resp: FetchModelResponse = servicer.FetchModel(req, None) assert resp.success - assert resp.model_path == "test_model.modyn" + load_model_mock.assert_called_once_with(10, True) - req_invalid = FetchModelRequest(model_id=142) - resp_invalid: FetchModelResponse = servicer.FetchModel(req_invalid, None) + # store final model to {current_time_millis()}_{model_id}.modyn + assert resp.model_path == "100_10.modyn" - assert not resp_invalid.success + assert torch.load(storage_path / resp.model_path) == {"model": {"conv_1": 1}, "metadata": True} -def test_delete_model(): +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "load_model", return_value=None) +def test_fetch_model_invalid(load_model_mock: MagicMock, init_manager_mock): config = get_modyn_config() with tempfile.TemporaryDirectory() as storage_dir: storage_path = pathlib.Path(storage_dir) - servicer = ModelStorageGRPCServicer(config, storage_path) + servicer = ModelStorageGRPCServicer(config, storage_path, storage_path) assert servicer is not None - with open(storage_path / "model_to_be_deleted.modyn", "wb") as file: - file.write(b"model that will be deleted") + req = FetchModelRequest(model_id=101, load_metadata=False) + resp: FetchModelResponse = servicer.FetchModel(req, None) + + assert not resp.success - assert os.path.isfile(storage_path / "model_to_be_deleted.modyn") + req = FetchModelRequest(model_id=101, load_metadata=True) + resp: FetchModelResponse = servicer.FetchModel(req, None) - with MetadataDatabaseConnection(config) as database: - model_id = database.add_trained_model(2, 50, "model_to_be_deleted.modyn") + assert not resp.success - req = DeleteModelRequest(model_id=model_id) - resp: DeleteModelResponse = servicer.DeleteModel(req, None) - assert resp.success - assert not os.path.isfile(storage_path / "model_to_be_deleted.modyn") +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "delete_model", return_value=True) +def test_delete_model(delete_model_mock: MagicMock, init_manager_mock): + config = get_modyn_config() + servicer = ModelStorageGRPCServicer(config, pathlib.Path("storage_dir"), pathlib.Path("ftp_dir")) + assert servicer is not None -
req_invalid = DeleteModelRequest(model_id=model_id) - resp_invalid: DeleteModelResponse = servicer.DeleteModel(req_invalid, None) + req = DeleteModelRequest(model_id=20) + resp: DeleteModelResponse = servicer.DeleteModel(req, None) - assert not resp_invalid.success + assert resp.success + delete_model_mock.assert_called_once_with(20) + + +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "delete_model", return_value=False) +def test_delete_model_invalid(delete_model_mock: MagicMock, init_manager_mock): + config = get_modyn_config() + servicer = ModelStorageGRPCServicer(config, pathlib.Path("storage_dir"), pathlib.Path("ftp_dir")) + assert servicer is not None - with MetadataDatabaseConnection(config) as database: - model_id = database.session.get(TrainedModel, model_id) + req = DeleteModelRequest(model_id=50) + resp: DeleteModelResponse = servicer.DeleteModel(req, None) - assert not model_id + assert not resp.success + delete_model_mock.assert_called_once_with(50) diff --git a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py new file mode 100644 index 000000000..c635a6f2c --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py @@ -0,0 +1,26 @@ +import torch +from modyn.model_storage.internal.storage_strategies import AbstractDifferenceOperator +from modyn.model_storage.internal.storage_strategies.difference_operators import SubDifferenceOperator + + +def test_inheritance(): + assert issubclass(SubDifferenceOperator.__class__, AbstractDifferenceOperator.__class__) + + +def test_calculate_difference(): + ones = torch.ones(1, dtype=torch.int32) + + difference_operator = SubDifferenceOperator() + assert difference_operator.calculate_difference(ones, ones) == b"\x00\x00\x00\x00" + + twos = ones * 2 + assert difference_operator.calculate_difference(twos, ones) == b"\x01\x00\x00\x00" + + +def test_calculate_restore(): + difference_operator = SubDifferenceOperator() + + ones = torch.ones(1, dtype=torch.int32) + + assert difference_operator.restore(ones, b"\x00\x00\x00\x00").item() == 1 + assert difference_operator.restore(ones, b"\x01\x00\x00\x00").item() == 2 diff --git a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py new file mode 100644 index 000000000..b36f732c6 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py @@ -0,0 +1,26 @@ +import torch +from modyn.model_storage.internal.storage_strategies import AbstractDifferenceOperator +from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator + + +def test_inheritance(): + assert issubclass(XorDifferenceOperator.__class__, AbstractDifferenceOperator.__class__) + + +def test_calculate_difference(): + ones = torch.ones(1, dtype=torch.int32) + + difference_operator = XorDifferenceOperator() + assert difference_operator.calculate_difference(ones, ones) == b"\x00\x00\x00\x00" + + twos = ones * 2 + assert difference_operator.calculate_difference(twos, ones) == b"\x03\x00\x00\x00" + + +def test_calculate_restore(): + difference_operator = XorDifferenceOperator() + + ones = torch.ones(1, 
dtype=torch.int32) + + assert difference_operator.restore(ones, b"\x00\x00\x00\x00").item() == 1 + assert difference_operator.restore(ones, b"\x03\x00\x00\x00").item() == 2 diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py new file mode 100644 index 000000000..c5ae70589 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py @@ -0,0 +1,44 @@ +import pathlib +import tempfile + +import torch +from modyn.model_storage.internal.storage_strategies.full_model_strategies import BinaryFullModel + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + def forward(self, data): + return data + + +def test_store_model(): + model = MockModel() + full_model_strategy = BinaryFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + full_model_strategy.store_model(model.state_dict(), temp_file_path) + + with open(temp_file_path, "rb") as stored_model_file: + assert stored_model_file.read() == b"\x00\x00\x80\x3f\x00\x00\x80\x3f" + + +def test_load_model(): + model = MockModel() + full_model_strategy = BinaryFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + with open(temp_file_path, "wb") as stored_model_file: + stored_model_file.write(b"\x00\x00\x00\x3f\x00\x00\x00\x3f") + + state_dict = full_model_strategy.load_model(model.state_dict(), temp_file_path) + + assert state_dict["_weight"][0] == 0.5 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py new file mode 100644 index 000000000..530026bc5 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py @@ -0,0 +1,61 @@ +import pathlib +import tempfile + +import torch +from modyn.model_storage.internal.storage_strategies.full_model_strategies import PyTorchFullModel + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + def forward(self, data): + return data + + +def test_store_model(): + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + model = MockModel() + full_model_strategy.store_model(model.state_dict(), temp_file_path) + + loaded_state = torch.load(temp_file_path) + + assert loaded_state["_weight"][0] == 1.0 + + +def test_load_model(): + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + model = MockModel() + torch.save(model.state_dict(),
temp_file_path) + + model._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) + state_dict = full_model_strategy.load_model(model.state_dict(), temp_file_path) + + assert state_dict["_weight"][0] == 1.0 # pylint: disable=unsubscriptable-object + + +def test_store_then_load(): + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + full_model_strategy = PyTorchFullModel( + zipping_dir=temp_file_path.parent, zip_activated=False, zip_algorithm_name="", config={} + ) + + model = MockModel() + full_model_strategy.store_model(model.state_dict(), temp_file_path) + + model._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) + state_dict = full_model_strategy.load_model(model.state_dict(), temp_file_path) + + assert state_dict["_weight"][0] == 1.0 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py new file mode 100644 index 000000000..1d5624794 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py @@ -0,0 +1,165 @@ +import pathlib +import tempfile +from zipfile import ZIP_LZMA + +import pytest +import torch +from modyn.model_storage.internal.storage_strategies.difference_operators import ( + SubDifferenceOperator, + XorDifferenceOperator, +) +from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import WeightsDifference + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.zeros(2, dtype=torch.float32)) + + def forward(self, data): + return data + + +class MockComplexModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._bias = torch.nn.Parameter(torch.ones(2, dtype=torch.float16)) + self._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + def forward(self, data): + return data + + +def get_mock_model_after() -> MockModel: + model_after = MockModel() + model_after._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + return model_after + + +def test_init(): + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) + + assert isinstance(incremental_strategy.difference_operator, SubDifferenceOperator.__class__) + assert not incremental_strategy.split_exponent + + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), + zip_activated=False, + zip_algorithm_name="", + config={"operator": "xor", "split_exponent": True}, + ) + + assert not incremental_strategy.zip + assert isinstance(incremental_strategy.difference_operator, XorDifferenceOperator.__class__) + assert incremental_strategy.split_exponent + + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), + zip_activated=True, + zip_algorithm_name="ZIP_LZMA", + config={"operator": "sub", "split_exponent": False}, + ) + + assert incremental_strategy.zip + assert incremental_strategy.zip_algorithm == ZIP_LZMA + assert isinstance(incremental_strategy.difference_operator, SubDifferenceOperator.__class__) + assert not incremental_strategy.split_exponent + + +def test_store_model(): + model_before = MockModel() + model_after = get_mock_model_after() + + for operator in ["xor", "sub"]: + 
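+ # with the all-zero reference model above, "sub" and "xor" produce byte-identical + # differences, so the single expected byte string below covers both operators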
incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={"operator": operator} + ) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + incremental_strategy.store_model(model_after.state_dict(), model_before.state_dict(), temp_file_path) + + with open(temp_file_path, "rb") as stored_model_file: + assert stored_model_file.read() == b"\x00\x00\x80\x3f\x00\x00\x80\x3f" + + +def test_load_model(): + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + with open(temp_file_path, "wb") as stored_model_file: + stored_model_file.write(b"\x00\x00\x80\x3f\x00\x00\x80\x3f") + + for operator in ["xor", "sub"]: + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={"operator": operator} + ) + + model = MockModel() + model_state = incremental_strategy.load_model(model.state_dict(), temp_file_path) + + assert model_state["_weight"][0] == 1 # pylint: disable=unsubscriptable-object + + +def test_rle(): + assert WeightsDifference.encode_bytes(b"") == b"" + + encoded = WeightsDifference.encode_bytes(b"\x00\x00\x02\x01\x01\x01\x00") + assert encoded == b"\x02\x00\x01\x02\x03\x01\x01\x00" + + encoded = WeightsDifference.encode_bytes(512 * b"\x00" + b"\x01") + assert encoded == b"\xff\x00\xff\x00\x02\x00\x01\x01" + + +def test_inv_rle(): + assert WeightsDifference.decode_bytes(b"") == b"" + + encoded = WeightsDifference.decode_bytes(b"\x02\x00\x01\x02\x03\x01\x01\x00") + assert encoded == b"\x00\x00\x02\x01\x01\x01\x00" + + encoded = WeightsDifference.decode_bytes(b"\xff\x00\xff\x00\x02\x00\x01\x01") + assert encoded == 512 * b"\x00" + b"\x01" + + with pytest.raises(AssertionError): + WeightsDifference.decode_bytes(b"\x02\x00\x01") + + +def test_store_then_load_model(): + model_before = MockComplexModel() + before_state = model_before.state_dict() + model_after = MockComplexModel() + model_after._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) + + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), + zip_activated=False, + zip_algorithm_name="", + config={"operator": "xor", "split_exponent": True, "rle": True}, + ) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + incremental_strategy.store_model(model_after.state_dict(), before_state, temp_file_path) + + with open(temp_file_path, "rb") as stored_model_file: + # we store 2 exponent bytes. + assert stored_model_file.read(8) == b"\x00\x00\x00\x00\x00\x00\x00\x02" + + # twice the xor difference between 2 and 1 in the exponent byte. + assert stored_model_file.read(2) == b"\x02\xff" + + # xor difference of the float16 tensors. + assert stored_model_file.read(4) == b"\x00\x00\x00\x00" + + # xor difference of the remaining float32 bytes. 
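+ # (each float32 leaves 3 such bytes, the sign bit plus the 23 mantissa bits, + # once its 8 exponent bits are split out, so the two weights yield 6 bytes)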
+ assert stored_model_file.read(6) == b"\x00\x00\x00\x00\x00\x00" + + state_dict = incremental_strategy.load_model(before_state, temp_file_path) + + assert state_dict["_bias"][0].item() == 1 # pylint: disable=unsubscriptable-object + assert state_dict["_weight"][0].item() == 2 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/test_model_storage_manager.py b/modyn/tests/model_storage/internal/test_model_storage_manager.py new file mode 100644 index 000000000..27334bcdf --- /dev/null +++ b/modyn/tests/model_storage/internal/test_model_storage_manager.py @@ -0,0 +1,355 @@ +# pylint: disable=unused-argument +import json +import os +import pathlib +import tempfile +from unittest.mock import MagicMock, patch +from zipfile import ZIP_DEFLATED + +import torch +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import TrainedModel +from modyn.metadata_database.utils import ModelStorageStrategyConfig +from modyn.model_storage.internal import ModelStorageManager +from modyn.model_storage.internal.storage_strategies.full_model_strategies import PyTorchFullModel +from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import WeightsDifference +from modyn.models import ResNet18 + +DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_model_storage.database" + + +def get_modyn_config(): + return { + "model_storage": {"port": "50051", "ftp_port": "5223"}, + "trainer_server": {"hostname": "localhost", "ftp_port": "5222"}, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, + } + + +def setup(): + DATABASE.unlink(True) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.create_tables() + + full_model_strategy = ModelStorageStrategyConfig(name="PyTorchFullModel") + inc_model_strategy = ModelStorageStrategyConfig(name="WeightsDifference") + inc_model_strategy.zip = False + inc_model_strategy.config = json.dumps({"operator": "sub"}) + database.register_pipeline( + 1, "ResNet18", json.dumps({"num_classes": 10}), True, "{}", full_model_strategy, inc_model_strategy, 5 + ) + + +def teardown(): + DATABASE.unlink() + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.ones(1, dtype=torch.float32)) + + def forward(self, data): + return data + + +def get_mock_model_after() -> MockModel: + model_after = MockModel() + model_after._weight = torch.nn.Parameter(torch.ones(1, dtype=torch.float32) * 3) + + return model_after + + +def test_init(): + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + + assert manager._modyn_config == get_modyn_config() + assert manager._storage_dir == pathlib.Path("storage") + assert manager._ftp_dir == pathlib.Path("ftp") + + +def test__determine_parent_model_id(): + with MetadataDatabaseConnection(get_modyn_config()) as database: + model_id = database.add_trained_model(10, 2, "model.modyn", "model.metadata") + + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + assert manager._determine_parent_model_id(10, 3) == model_id + assert manager._determine_parent_model_id(10, 2) is None + + +def test__get_base_model_state(): + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + model_state =
manager._get_base_model_state(1) + + assert len(model_state) == 122 + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test__reconstruct_model(base_model_state_mock: MagicMock): + mock_model = MockModel() + model_state = mock_model.state_dict() + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path("ftp"), zip_activated=False, zip_algorithm_name="", config={} + ) + incremental_model_strategy = WeightsDifference( + zipping_dir=pathlib.Path("ftp"), zip_activated=False, zip_algorithm_name="", config={} + ) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, pathlib.Path("ftp")) + + prev_model_file_name = "before.model" + full_model_strategy.store_model(model_state, temp_directory_path / prev_model_file_name) + + difference_model_file_name = "difference.model" + incremental_model_strategy.store_model( + get_mock_model_after().state_dict(), model_state, temp_directory_path / difference_model_file_name + ) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + prev_model_id = database.add_trained_model(15, 3, prev_model_file_name, "model.metadata") + curr_model_id = database.add_trained_model( + 15, 4, difference_model_file_name, "model.metadata", parent_model=prev_model_id + ) + + reconstructed_state = manager._reconstruct_model_state(curr_model_id, manager.get_model_storage_policy(1)) + + assert reconstructed_state["_weight"].item() == 3 # pylint: disable=unsubscriptable-object + + +def test__handle_new_model_full(): + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.add_trained_model(1, 4, "model.modyn", "model.metadata") + + mock_model = MockModel() + model_state = mock_model.state_dict() + + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + parent_id = manager._handle_new_model(1, 5, model_state, temp_file_path, manager.get_model_storage_policy(1)) + assert parent_id is None + + loaded_state = torch.load(temp_file_path) + assert loaded_state["_weight"].item() == 1 + + +@patch.object(ModelStorageManager, "_reconstruct_model_state", return_value=MockModel().state_dict()) +@patch.object(ModelStorageManager, "_determine_parent_model_id", return_value=101) +def test__handle_new_model_incremental(previous_model_mock, reconstruct_model_mock: MagicMock): + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + parent_id = manager._handle_new_model( + 5, 4, get_mock_model_after().state_dict(), temp_file_path, manager.get_model_storage_policy(1) + ) + + assert parent_id == 101 + + with open(temp_file_path, "rb") as model_file: + assert model_file.read() == b"\x00\x00\x00\x40" + + reconstruct_model_mock.assert_called_once() + previous_model_mock.assert_called_once_with(5, 4) + + +def test_get_model_storage_policy(): + with MetadataDatabaseConnection(get_modyn_config()) as database: + simple_pipeline = database.register_pipeline( + 74, + "ResNet18", + json.dumps({"num_classes": 10}), + True, + "{}", + ModelStorageStrategyConfig(name="PyTorchFullModel"), + None, + None, + ) + + full_model_strategy = ModelStorageStrategyConfig(name="PyTorchFullModel") + 
full_model_strategy.zip_algorithm = "ZIP_DEFLATED" + inc_model_strategy = ModelStorageStrategyConfig(name="WeightsDifference") + inc_model_strategy.zip = True + inc_model_strategy.config = json.dumps({"operator": "sub"}) + complex_pipeline = database.register_pipeline( + 75, "ResNet18", json.dumps({"num_classes": 10}), True, "{}", full_model_strategy, inc_model_strategy, 10 + ) + + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + + policy = manager.get_model_storage_policy(simple_pipeline) + assert policy.incremental_model_strategy is None + assert policy.full_model_interval is None + assert not policy.full_model_strategy.zip + + complex_policy = manager.get_model_storage_policy(complex_pipeline) + assert not complex_policy.full_model_strategy.zip + assert complex_policy.full_model_strategy.zip_algorithm == ZIP_DEFLATED + assert complex_policy.incremental_model_strategy + assert complex_policy.incremental_model_strategy.zip + + +@patch("modyn.model_storage.internal.model_storage_manager.current_time_millis", return_value=100) +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_store_model(base_model_mock, current_time_mock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + parent_id = database.add_trained_model(1, 128, "before.model", "before.metadata") + + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model(MockModel().state_dict(), temp_directory_path / "before.model") + + torch.save( + {"model": get_mock_model_after().state_dict(), "metadata": True}, temp_directory_path / "model.modyn" + ) + + model_id = manager.store_model(1, 129, temp_directory_path / "model.modyn") + + with MetadataDatabaseConnection(get_modyn_config()) as database: + model: TrainedModel = database.session.get(TrainedModel, model_id) + + assert model.pipeline_id == 1 + assert model.trigger_id == 129 + assert model.model_path == "100_1_129.model" + assert model.parent_model == parent_id + assert model.metadata_path == "100_1_129.metadata.zip" + + with open(temp_directory_path / model.model_path, "rb") as model_file: + assert model_file.read() == b"\x00\x00\x00\x40" + + assert torch.load(temp_directory_path / model.metadata_path)["metadata"] + + loaded_model = manager.load_model(model_id, True) + + assert loaded_model["model"]["_weight"].item() == 3 + assert loaded_model["metadata"] + + +def test_store_model_resnet(): + full_model_strategy = ModelStorageStrategyConfig(name="BinaryFullModel") + full_model_strategy.zip = True + + with MetadataDatabaseConnection(get_modyn_config()) as database: + pipeline_id = database.register_pipeline( + 1, "ResNet18", json.dumps({"num_classes": 10}), True, "{}", full_model_strategy + ) + + resnet = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + + torch.save({"model": resnet.model.state_dict(), "metadata": True}, temp_directory_path / "model.modyn") + + model_id = manager.store_model(pipeline_id, 1, temp_directory_path / "model.modyn") + loaded_state = manager.load_model(model_id, True) + assert loaded_state["metadata"] + 
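+ # BinaryFullModel stores the raw tensor bytes, so every layer of the reloaded + # state dict must match the original bit for bit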
+ original_state = resnet.model.state_dict() + for layer_name, _ in loaded_state["model"].items(): + original_layer = original_state[layer_name] # pylint: disable=unsubscriptable-object + assert torch.all(torch.eq(loaded_state["model"][layer_name], original_layer)) + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_load_model(base_model_mock: MagicMock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + + model_file_name = "mock.model" + with MetadataDatabaseConnection(get_modyn_config()) as database: + model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata") + + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model( + get_mock_model_after().state_dict(), temp_directory_path / model_file_name + ) + + reconstructed_state = manager.load_model(model_id, False) + + assert reconstructed_state["model"]["_weight"].item() == 3 + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_load_model_metadata(base_model_mock: MagicMock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + + model_file_name = "mock.model" + with MetadataDatabaseConnection(get_modyn_config()) as database: + model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata") + + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model( + get_mock_model_after().state_dict(), temp_directory_path / model_file_name + ) + torch.save({"metadata": True}, temp_directory_path / "mock.metadata") + + reconstructed_state = manager.load_model(model_id, True) + + assert reconstructed_state["model"]["_weight"].item() == 3 + assert reconstructed_state["metadata"] + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_load_model_invalid(base_model_mock: MagicMock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + + assert manager.load_model(133, False) is None + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_delete_model(base_model_mock: MagicMock): + mock_model = MockModel() + model_state = mock_model.state_dict() + model_state_after = get_mock_model_after().state_dict() + + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + parent_id = database.add_trained_model(1, 52, "parent.modyn", "parent.metadata") + child_id = database.add_trained_model(1, 53, "child.modyn", "child.metadata", parent_model=parent_id) + + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model(model_state, temp_directory_path / "parent.modyn") + torch.save({"metadata": True}, temp_directory_path / "parent.metadata") + policy.incremental_model_strategy.store_model( + model_state_after, model_state, temp_directory_path / "child.modyn" + ) + torch.save({"metadata": True}, 
temp_directory_path / "child.metadata") + + success = manager.delete_model(parent_id) + assert not success + + success = manager.delete_model(child_id) + assert success + assert not (temp_directory_path / "child.modyn").exists() + success = manager.delete_model(parent_id) + assert success + assert not (temp_directory_path / "parent.modyn").exists() + + with MetadataDatabaseConnection(get_modyn_config()) as database: + assert not database.session.get(TrainedModel, child_id) + assert not database.session.get(TrainedModel, parent_id) diff --git a/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py b/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py new file mode 100644 index 000000000..0f319079a --- /dev/null +++ b/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py @@ -0,0 +1,59 @@ +import json +import pathlib +from zipfile import ZIP_LZMA + +import pytest +from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator +from modyn.model_storage.internal.utils import ModelStoragePolicy + + +def test_basic_model_storage_policy(): + policy = ModelStoragePolicy(pathlib.Path(), "PyTorchFullModel", None, None, None) + + assert policy.incremental_model_strategy is None + assert policy.full_model_interval is None + assert not policy.full_model_strategy.zip + + +def test_extended_model_storage_policy(): + policy = ModelStoragePolicy( + zipping_dir=pathlib.Path(), + full_model_strategy_name="PyTorchFullModel", + full_model_strategy_zip=True, + full_model_strategy_zip_algorithm=None, + full_model_strategy_config=None, + ) + policy.register_incremental_model_strategy( + name="WeightsDifference", + zip_enabled=True, + zip_algorithm="ZIP_LZMA", + config=json.dumps({"operator": "xor", "split_exponent": True}), + full_model_interval=10, + ) + + assert policy.zipping_dir == pathlib.Path("") + assert not policy.full_model_strategy.zip + + weights_diff_strategy = policy.incremental_model_strategy + assert weights_diff_strategy.zip + assert weights_diff_strategy.zip_algorithm == ZIP_LZMA + assert getattr(weights_diff_strategy, "split_exponent") + assert isinstance(getattr(weights_diff_strategy, "difference_operator"), XorDifferenceOperator.__class__) + + assert policy.full_model_interval == 10 + + +def test_model_storage_policy_invalid(): + policy = ModelStoragePolicy( + zipping_dir=pathlib.Path(), + full_model_strategy_name="PyTorchFullModel", + full_model_strategy_zip=None, + full_model_strategy_zip_algorithm=None, + full_model_strategy_config=None, + ) + + with pytest.raises(ValueError): + policy.register_incremental_model_strategy("WeightsDifference", None, None, None, 0) + + with pytest.raises(NotImplementedError): + policy.register_incremental_model_strategy("UnknownStrategy", None, None, None, None) diff --git a/modyn/tests/model_storage/test_model_storage.py b/modyn/tests/model_storage/test_model_storage.py index 1bad9fedd..ad1df3134 100644 --- a/modyn/tests/model_storage/test_model_storage.py +++ b/modyn/tests/model_storage/test_model_storage.py @@ -1,18 +1,12 @@ -import os -import pathlib +import tempfile from unittest.mock import patch -import pytest from modyn.model_storage import ModelStorage from modyn.model_storage.internal.grpc.grpc_server import GRPCServer -modyn_config = ( - pathlib.Path(os.path.abspath(__file__)).parent.parent.parent / "config" / "examples" / "modyn_config.yaml" -) - -def get_invalid_modyn_config() -> dict: - return {"invalid": "not_valid"} +def get_modyn_config(): + return 
{"model_storage": {"port": "5001", "ftp_port": "5002"}} # pylint: disable=unused-argument @@ -21,6 +15,9 @@ def noop_setup_directory(self): class MockFTPServer: + def __init__(self, ftp_port, ftp_directory): # pylint: disable=unused-argument + pass + def __enter__(self): pass @@ -41,18 +38,19 @@ def __exit__(self, *args, **kwargs): # pylint: disable=unused-argument pass -@patch.object(ModelStorage, "_setup_model_storage_directory", noop_setup_directory) +@patch.object(ModelStorage, "_init_model_storage_directory", noop_setup_directory) def test_model_storage_init(): - model_storage = ModelStorage(modyn_config) - assert model_storage.config == modyn_config - - -@patch.object(ModelStorage, "_setup_model_storage_directory", noop_setup_directory) -def test_validate_config(): - model_storage = ModelStorage(modyn_config) - assert model_storage._validate_config()[0] - - -def test_invalid_config(): - with pytest.raises(ValueError): - ModelStorage(get_invalid_modyn_config()) + model_storage = ModelStorage(get_modyn_config()) + assert model_storage.config == get_modyn_config() + + +@patch("modyn.model_storage.model_storage.GRPCServer", MockGRPCServer) +@patch("modyn.model_storage.model_storage.FTPServer", MockFTPServer) +def test_cleanup_at_exit(): + with tempfile.TemporaryDirectory() as temp_dir: + config = get_modyn_config() + config["model_storage"]["models_directory"] = temp_dir + model_storage = ModelStorage(config) + assert model_storage.ftp_directory.exists() + model_storage.run() + assert not model_storage.ftp_directory.exists() diff --git a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py index d55cfe8c2..0b7b0f6d5 100644 --- a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py +++ b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py @@ -11,12 +11,14 @@ GetSamplesRequest, GetSelectionStrategyRequest, JsonString, + ModelStoragePolicyInfo, NumberOfPartitionsResponse, NumberOfSamplesResponse, PipelineResponse, RegisterPipelineRequest, SamplesResponse, SelectionStrategyResponse, + StrategyConfig, TriggerResponse, ) from modyn.selector.internal.grpc.selector_grpc_servicer import SelectorGRPCServicer @@ -49,13 +51,32 @@ def test_register_pipeline(test_register_pipeline: MagicMock): config["selector"]["trigger_sample_directory"] = tmp_dir mgr = SelectorManager(config) servicer = SelectorGRPCServicer(mgr, 8096) - request = RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value="strat")) + policy = ModelStoragePolicyInfo(full_model_strategy_config=StrategyConfig(name="PyTorchFullModel")) + request = RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value="strat"), + model_class_name="ResNet18", + model_configuration=JsonString(value="{}"), + amp=True, + model_storage_policy=policy, + ) test_register_pipeline.return_value = 42 response: PipelineResponse = servicer.register_pipeline(request, None) assert response.pipeline_id == 42 - test_register_pipeline.assert_called_once_with(2, "strat") + test_register_pipeline.assert_called_once() + + arguments = test_register_pipeline.call_args[0] + assert arguments[0] == 2 + assert arguments[1] == "strat" + assert arguments[2] == "ResNet18" + assert arguments[3] == "{}" + assert arguments[4] + assert arguments[5].name == "PyTorchFullModel" + assert not arguments[5].zip + assert arguments[6] is None + assert arguments[7] is None @patch.object(SelectorManager, "init_metadata_db", noop_init_metadata_db) diff 
--git a/modyn/tests/selector/internal/test_selector_manager.py b/modyn/tests/selector/internal/test_selector_manager.py index 7936dd00a..9e52ad2c9 100644 --- a/modyn/tests/selector/internal/test_selector_manager.py +++ b/modyn/tests/selector/internal/test_selector_manager.py @@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch import pytest +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.selector_manager import SelectorManager from modyn.selector.internal.selector_strategies.abstract_selection_strategy import AbstractSelectionStrategy from modyn.selector.selector import Selector @@ -45,8 +46,17 @@ def __init__(self, modyn_config: dict): # pylint: disable=super-init-not-called self.current_pipeline_id = 0 self.session = MockSession() + # pylint: disable=unused-argument def register_pipeline( - self, number_of_workers: int, selection_strategy: str # pylint: disable=unused-argument + self, + number_of_workers: int, + model_id: int, + model_config: dict, + amp: bool, + selection_strategy: str, + full_model_strategy: ModelStorageStrategyConfig, + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None, + full_model_interval: Optional[int] = None, ) -> Optional[int]: pid = self.current_pipeline_id self.current_pipeline_id += 1 @@ -97,13 +107,20 @@ def test_register_pipeline(test__instantiate_strategy: MagicMock): assert len(selec._selectors) == 0 - assert selec.register_pipeline(42, "{}") == 0 + assert ( + selec.register_pipeline( + 42, "{}", "ResNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) + == 0 + ) assert len(selec._selectors) == 1 assert isinstance(selec._selectors[0]._strategy, MockStrategy) with pytest.raises(ValueError): - selec.register_pipeline(0, "strat") + selec.register_pipeline( + 0, "strat", "ResNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) @patch("modyn.selector.internal.selector_manager.MetadataDatabaseConnection", MockDatabaseConnection) @@ -119,7 +136,9 @@ def test_get_sample_keys_and_weights( selec = SelectorManager(config) test__instantiate_strategy.return_value = MockStrategy() - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "ResNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) with pytest.raises(ValueError): # Non existing pipeline @@ -150,7 +169,9 @@ def test_inform_data(selector_inform_data: MagicMock, test__instantiate_strategy with pytest.raises(ValueError): selec.inform_data(0, [10], [0], [0]) - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "ResNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_inform_data.return_value = None selec.inform_data(pipe_id, [10], [0], [0]) @@ -172,7 +193,9 @@ def test_inform_data_and_trigger(selector_inform_data_and_trigger: MagicMock, te with pytest.raises(ValueError): selec.inform_data_and_trigger(0, [10], [0], [0]) - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "ResNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_inform_data_and_trigger.return_value = None selec.inform_data_and_trigger(pipe_id, [10], [0], [0]) @@ -191,7 +214,9 @@ def test_get_available_labels(selector_get_available_labels: MagicMock, test__in selector = SelectorManager(config) test__instantiate_strategy.return_value = MockStrategy() - pipe_id = selector.register_pipeline(2, "{}") + pipe_id =
selector.register_pipeline( + 2, "{}", "ResNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_get_available_labels.return_value = None selector.get_available_labels(pipe_id) @@ -235,7 +260,9 @@ def test_get_number_of_samples(selector_get_number_of_samples: MagicMock, test__ with pytest.raises(ValueError): selec.get_number_of_samples(0, 0) - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "ResNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_get_number_of_samples.return_value = 12 assert selec.get_number_of_samples(pipe_id, 21) == 12 diff --git a/modyn/tests/storage/test_storage.py b/modyn/tests/storage/test_storage.py index f1d576916..22bd0b74d 100644 --- a/modyn/tests/storage/test_storage.py +++ b/modyn/tests/storage/test_storage.py @@ -54,6 +54,8 @@ def get_minimal_modyn_config() -> dict: }, "selector": {"hostname": "host", "port": "1337"}, "trainer_server": {"hostname": "host", "port": "1337"}, + "evaluator": {"hostname": "host", "port": "1337"}, + "model_storage": {"hostname": "host", "port": "1337", "ftp_port": "1337", "models_directory": "test.dir"}, } diff --git a/modyn/tests/supervisor/internal/test_grpc_handler.py b/modyn/tests/supervisor/internal/test_grpc_handler.py index e8aecf226..d0cc9137a 100644 --- a/modyn/tests/supervisor/internal/test_grpc_handler.py +++ b/modyn/tests/supervisor/internal/test_grpc_handler.py @@ -338,11 +338,41 @@ def test_register_pipeline_at_selector(test_grpc_connection_established): mock.return_value = PipelineResponse(pipeline_id=42) result = handler.register_pipeline_at_selector( - {"pipeline": {"name": "test"}, "training": {"dataloader_workers": 2, "selection_strategy": {}}} + { + "pipeline": {"name": "test"}, + "training": {"dataloader_workers": 2, "selection_strategy": {}, "amp": True}, + "model": {"id": "ResNet18"}, + "model_storage": { + "full_model_strategy": {"name": "PyTorchFullModel", "zip": True, "zip_algorithm": "ZIP_DEFLATED"}, + "incremental_model_strategy": { + "name": "WeightsDifference", + "config": {"operator": "sub"}, + "full_model_interval": 10, + }, + }, + } ) assert result == 42 - mock.assert_called_once_with(RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value="{}"))) + mock.assert_called_once() + + request: RegisterPipelineRequest = mock.call_args.args[0] + assert request.num_workers == 2 + assert request.selection_strategy.value == "{}" + assert request.model_class_name == "ResNet18" + assert request.model_configuration.value == "{}" + assert request.amp + assert request.model_storage_policy.full_model_strategy_config.name == "PyTorchFullModel" + assert request.model_storage_policy.full_model_strategy_config.zip + assert request.model_storage_policy.full_model_strategy_config.zip_algorithm == "ZIP_DEFLATED" + assert not request.model_storage_policy.full_model_strategy_config.HasField("config") + assert request.model_storage_policy.incremental_model_strategy_config.name == "WeightsDifference" + assert not request.model_storage_policy.incremental_model_strategy_config.HasField("zip") + assert not request.model_storage_policy.incremental_model_strategy_config.HasField("zip_algorithm") + assert ( + json.loads(request.model_storage_policy.incremental_model_strategy_config.config.value)["operator"] == "sub" + ) + assert request.model_storage_policy.full_model_interval == 10 def test_unregister_pipeline_at_selector(): @@ -527,7 +557,7 @@ def test_start_evaluation(test_connection_established):
handler = GRPCHandler(get_simple_config(), mgr, pbar) assert handler.evaluator is not None - trained_model_id = 10 + model_id = 10 pipeline_config = get_minimal_pipeline_config() with patch.object( @@ -535,7 +565,7 @@ "evaluate_model", return_value=EvaluateModelResponse(evaluation_started=True, evaluation_id=12, dataset_size=1000), ) as avail_method: - evaluations = handler.start_evaluation(trained_model_id, pipeline_config) + evaluations = handler.start_evaluation(model_id, pipeline_config) assert len(evaluations) == 1 assert evaluations[12].dataset_id == "MNIST_eval" @@ -543,6 +573,19 @@ avail_method.assert_called_once() + +def test__prepare_evaluation_request(): + pipeline_config = get_minimal_pipeline_config() + request = GRPCHandler._prepare_evaluation_request(pipeline_config["evaluation"]["datasets"][0], 23, "cpu") + + assert request.model_id == 23 + assert request.device == "cpu" + assert request.batch_size == 64 + assert request.dataset_info.dataset_id == "MNIST_eval" + assert request.dataset_info.num_dataloaders == 2 + assert request.metrics[0].name == "Accuracy" + assert request.metrics[0].config.value == "{}" + + @patch("modyn.supervisor.internal.grpc_handler.grpc_connection_established", return_value=True) def test_wait_for_evaluation_completion(test_connection_established): mgr = enlighten.get_manager() diff --git a/modyn/tests/supervisor/test_supervisor.py b/modyn/tests/supervisor/test_supervisor.py index 8284ab665..cb0ba0348 100644 --- a/modyn/tests/supervisor/test_supervisor.py +++ b/modyn/tests/supervisor/test_supervisor.py @@ -18,6 +18,7 @@ def get_minimal_pipeline_config() -> dict: return { "pipeline": {"name": "Test"}, "model": {"id": "ResNet18"}, + "model_storage": {"full_model_strategy": {"name": "PyTorchFullModel"}}, "training": { "gpus": 1, "device": "cpu", @@ -342,7 +343,7 @@ def test_shutdown_trainer(): pass -@patch.object(GRPCHandler, "get_new_data_since", return_value=[[(10, 42, 0), (11, 43, 1)]]) +@patch.object(GRPCHandler, "get_new_data_since", return_value=[([(10, 42, 0), (11, 43, 1)], {})]) @patch.object(Supervisor, "_handle_new_data", return_value=False, side_effect=KeyboardInterrupt) def test_wait_for_new_data(test__handle_new_data: MagicMock, test_get_new_data_since: MagicMock): # This is a simple test and does not test the inclusivity filtering! @@ -353,7 +354,7 @@ def test_wait_for_new_data(test__handle_new_data: MagicMock, test_get_new_data_s test__handle_new_data.assert_called_once_with([(10, 42, 0), (11, 43, 1)]) -@patch.object(GRPCHandler, "get_new_data_since", return_value=[[(10, 42, 0)], [(11, 43, 1)]]) +@patch.object(GRPCHandler, "get_new_data_since", return_value=[([(10, 42, 0)], {}), ([(11, 43, 1)], {})]) @patch.object(Supervisor, "_handle_new_data", return_value=False, side_effect=[None, KeyboardInterrupt]) def test_wait_for_new_data_batched(test__handle_new_data: MagicMock, test_get_new_data_since: MagicMock): # This is a simple test and does not test the inclusivity filtering!
@@ -375,9 +376,9 @@ def test_wait_for_new_data_filtering(): mocked__handle_new_data_return_vals = [True, True, KeyboardInterrupt] mocked_get_new_data_since = [ - [[(10, 42, 0), (11, 43, 0), (12, 43, 1)]], - [[(11, 43, 0), (12, 43, 1), (13, 43, 2), (14, 45, 3)]], - [[]], + [([(10, 42, 0), (11, 43, 0), (12, 43, 1)], {})], + [([(11, 43, 0), (12, 43, 1), (13, 43, 2), (14, 45, 3)], {})], + [([], {})], ValueError, ] @@ -406,9 +407,9 @@ def test_wait_for_new_data_filtering_batched(): mocked__handle_new_data_return_vals = [True, True, True, True, True, KeyboardInterrupt] mocked_get_new_data_since = [ - [[(10, 42, 0), (11, 43, 0)], [(12, 43, 1)]], - [[(11, 43, 0)], [(12, 43, 1), (13, 43, 2)], [(14, 45, 3)]], - [[]], + [([(10, 42, 0), (11, 43, 0)], {}), ([(12, 43, 1)], {})], + [([(11, 43, 0)], {}), ([(12, 43, 1), (13, 43, 2)], {}), ([(14, 45, 3)], {})], + [([], {})], ValueError, ] diff --git a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py index c694ec0e3..1642bad08 100644 --- a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py +++ b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py @@ -4,7 +4,6 @@ import os import pathlib import platform -import shutil import tempfile from io import BytesIO from time import sleep @@ -12,6 +11,8 @@ from unittest.mock import MagicMock, patch import torch +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( FetchModelRequest, FetchModelResponse, @@ -34,6 +35,9 @@ from modyn.trainer_server.internal.utils.trainer_messages import TrainerMessages from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.trainer_server.internal.utils.training_process_info import TrainingProcessInfo +from modyn.utils import calculate_checksum + +DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_trainer_server.database" trainer_available_request = TrainerAvailableRequest() get_status_request = TrainingStatusRequest(training_id=1) @@ -47,22 +51,40 @@ "ftp_port": "3001", "offline_dataset_directory": "/tmp/offline_dataset", }, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, "storage": {"hostname": "storage", "port": "5002"}, "selector": {"hostname": "selector", "port": "5003"}, "model_storage": {"hostname": "model_storage", "port": "5004"}, } -modyn_download_file_config = { - "trainer_server": { - "hostname": "trainer_server", - "port": "5001", - "ftp_port": "3001", - "offline_dataset_directory": "/tmp/offline_dataset", - }, - "storage": {"hostname": "storage", "port": "5002"}, - "selector": {"hostname": "selector", "port": "5003"}, - "model_storage": {"hostname": "localhost", "port": "5004", "ftp_port": "3002"}, -} + +def setup(): + DATABASE.unlink(True) + + with MetadataDatabaseConnection(modyn_config) as database: + database.create_tables() + + database.register_pipeline( + 1, + "model", + json.dumps({}), + True, + "{}", + ModelStorageStrategyConfig(name="PyTorchFullModel"), + incremental_model_strategy=None, + full_model_interval=None, + ) + + +def teardown(): + DATABASE.unlink() class DummyModelStorageStub: @@ -104,13 +126,11 @@ def get_training_process_info(): return training_process_info -def 
get_start_training_request(checkpoint_path="", valid_model=True): +def get_start_training_request(checkpoint_path=""): return StartTrainingRequest( pipeline_id=1, trigger_id=1, device="cpu", - amp=False, - model_id="model" if valid_model else "unknown", batch_size=32, torch_optimizers_configuration=JsonString( value=json.dumps( @@ -118,7 +138,6 @@ def get_start_training_request(checkpoint_path="", valid_model=True): ) ), torch_criterion="CrossEntropyLoss", - model_configuration=JsonString(value=json.dumps({})), criterion_parameters=JsonString(value=json.dumps({})), data_info=Data(dataset_id="Dataset", num_dataloaders=1), checkpoint_info=CheckpointInfo(checkpoint_interval=10, checkpoint_path=checkpoint_path), @@ -144,6 +163,9 @@ def get_training_info( training_info = TrainingInfo( request, training_id, + "model", + json.dumps({}), + True, storage_address, selector_address, offline_dataset_path, @@ -187,7 +209,7 @@ def test_trainer_not_available(test_is_alive, test_connect_to_model_storage): def test_start_training_invalid(test_hasattr, test_connect_to_model_storage): with tempfile.TemporaryDirectory() as modyn_temp: trainer_server = TrainerServerGRPCServicer(modyn_config, modyn_temp) - response = trainer_server.start_training(get_start_training_request(valid_model=False), None) + response = trainer_server.start_training(get_start_training_request(), None) assert not response.training_started assert not trainer_server._training_dict assert trainer_server._next_training_id == 0 @@ -198,54 +220,62 @@ def test_start_training_invalid(test_hasattr, test_connect_to_model_storage): def test_start_training_invalid_id(test_hasattr, test_connect_to_model_storage): with tempfile.TemporaryDirectory() as modyn_temp: trainer_server = TrainerServerGRPCServicer(modyn_config, modyn_temp) - req = get_start_training_request(valid_model=True) + req = get_start_training_request() req.use_pretrained_model = True req.pretrained_model_id = 15 resp = trainer_server.start_training(req, None) assert not resp.training_started -@patch("modyn.trainer_server.internal.grpc.trainer_server_grpc_servicer.download_file") +@patch( + "modyn.trainer_server.internal.grpc.trainer_server_grpc_servicer.download_trained_model", + return_value=pathlib.Path("downloaded_model.modyn"), +) @patch.object(TrainerServerGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) @patch("modyn.trainer_server.internal.grpc.trainer_server_grpc_servicer.hasattr", return_value=True) @patch( "modyn.trainer_server.internal.utils.training_info.getattr", return_value=DummyModelWrapper, ) -def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storage, download_file_mock: MagicMock): +def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storage, download_model_mock: MagicMock): with tempfile.TemporaryDirectory() as modyn_temp: - trainer_server = TrainerServerGRPCServicer(modyn_download_file_config, modyn_temp) - with open(pathlib.Path(modyn_temp) / "testpath.modyn", "wb") as file: - file.write(b"Our pretrained model!") + trainer_server = TrainerServerGRPCServicer(modyn_config, modyn_temp) mock_start = mock.Mock() mock_start.side_effect = noop trainer_server._training_dict[1] = None with patch("multiprocessing.Process.start", mock_start): - trainer_server.start_training(get_start_training_request(valid_model=True), None) + trainer_server.start_training(get_start_training_request(), None) assert 0 in trainer_server._training_process_dict assert trainer_server._next_training_id == 1 # start new 
training - trainer_server.start_training(get_start_training_request(valid_model=True), None) + trainer_server.start_training(get_start_training_request(), None) assert 1 in trainer_server._training_process_dict assert trainer_server._next_training_id == 2 + assert trainer_server._training_dict[1].model_class_name == "model" + assert trainer_server._training_dict[1].model_configuration_dict == {} + assert trainer_server._training_dict[1].amp - request = get_start_training_request(valid_model=True) + request = get_start_training_request() request.use_pretrained_model = True request.pretrained_model_id = 10 resp = trainer_server.start_training(request, None) - download_file_mock.assert_called_once() - kwargs = download_file_mock.call_args.kwargs - remote_file_path = kwargs["remote_file_path"] - local_file_path = kwargs["local_file_path"] - - shutil.copyfile(pathlib.Path(modyn_temp) / remote_file_path, local_file_path) + download_model_mock.assert_called_once() + kwargs = download_model_mock.call_args.kwargs + remote_file_path = kwargs["remote_path"] + base_directory = kwargs["base_directory"] + identifier = kwargs["identifier"] assert resp.training_id == 2 - with open(trainer_server._training_dict[resp.training_id].pretrained_model_path, "rb") as file: - assert file.read().decode("utf-8") == "Our pretrained model!" + assert str(remote_file_path) == "testpath.modyn" + assert base_directory == trainer_server._modyn_base_dir + assert resp.training_started + assert resp.training_id == identifier + assert ( + str(trainer_server._training_dict[resp.training_id].pretrained_model_path) == "downloaded_model.modyn" + ) @patch.object(TrainerServerGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) @@ -535,6 +565,7 @@ def test_store_final_model_found(test_is_alive, test_connect_to_model_storage): checkpoint_file = base_path / "model_final.modyn" torch.save(dict_to_save, checkpoint_file) + checksum = calculate_checksum(checkpoint_file) trainer_server._training_dict[1] = training_info trainer_server._training_process_dict[1] = get_training_process_info() @@ -551,6 +582,7 @@ def test_store_final_model_found(test_is_alive, test_connect_to_model_storage): assert req.hostname == "trainer_server" assert req.port == 3001 assert req.model_path == "model_final.modyn" + assert req.checksum == checksum @patch.object(TrainerServerGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index d181f0ea1..f5eb098e9 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -205,12 +205,9 @@ def get_training_info( pipeline_id=1, trigger_id=1, device="cpu", - amp=False, data_info=Data(dataset_id="MNIST", num_dataloaders=2), torch_optimizers_configuration=JsonString(value=json.dumps(torch_optimizers_configuration)), - model_configuration=JsonString(value=json.dumps({})), criterion_parameters=JsonString(value=json.dumps({})), - model_id="model", batch_size=32, torch_criterion="CrossEntropyLoss", checkpoint_info=CheckpointInfo(checkpoint_interval=10, checkpoint_path=tmpdirname), @@ -227,6 +224,9 @@ def get_training_info( training_info = TrainingInfo( request, training_id, + "model", + json.dumps({}), + False, storage_address, selector_address, offline_dataset_path, diff --git a/modyn/tests/utils/test_utils.py 
b/modyn/tests/utils/test_utils.py index 8ac6ded13..e88d7e051 100644 --- a/modyn/tests/utils/test_utils.py +++ b/modyn/tests/utils/test_utils.py @@ -1,5 +1,7 @@ # pylint: disable=unused-argument,redefined-outer-name +import io import pathlib +import tempfile from unittest.mock import patch import grpc @@ -11,21 +13,26 @@ from modyn.supervisor.internal.grpc_handler import GRPCHandler from modyn.trainer_server.internal.trainer.remote_downsamplers import RemoteLossDownsampling from modyn.utils import ( + calculate_checksum, convert_timestr_to_seconds, current_time_millis, deserialize_function, dynamic_module_import, flatten, get_partition_for_worker, + get_tensor_byte_size, grpc_connection_established, + instantiate_class, model_available, package_available_and_can_be_imported, + reconstruct_tensor_from_bytes, seed_everything, trigger_available, + unzip_file, validate_timestr, validate_yaml, + zip_file, ) -from modyn.utils.utils import instantiate_class @patch.object(GRPCHandler, "init_storage", lambda self: None) @@ -203,3 +210,86 @@ def test_instantiate_class_not_existing(): # missing parameters with pytest.raises(TypeError): instantiate_class("modyn.common.trigger_sample", "TriggerSampleStorage") + + +def test_calculate_checksum(): + with tempfile.TemporaryDirectory() as tempdir: + tempdir_path = pathlib.Path(tempdir) + + with open(tempdir_path / "testfile1.txt", "w", encoding="utf-8") as file: + file.write("This is a test") + + with open(tempdir_path / "testfile2.txt", "w", encoding="utf-8") as file: + file.write("This is a test") + + assert calculate_checksum(tempdir_path / "testfile1.txt") == calculate_checksum(tempdir_path / "testfile2.txt") + assert calculate_checksum(tempdir_path / "testfile1.txt", chunk_num_blocks=20) == calculate_checksum( + tempdir_path / "testfile2.txt", chunk_num_blocks=10 + ) + assert calculate_checksum(tempdir_path / "testfile1.txt", hash_func_name="blake2s") != calculate_checksum( + tempdir_path / "testfile2.txt", chunk_num_blocks=10 + ) + + +def test_zip_and_unzip_file(): + with tempfile.TemporaryDirectory() as tempdir: + tempdir_path = pathlib.Path(tempdir) + + text_file_path = tempdir_path / "testfile.txt" + zip_file_path = tempdir_path / "testfile.zip" + + with open(text_file_path, "w", encoding="utf-8") as file: + file.write("This is a testfile!") + + zip_file(text_file_path, zip_file_path, remove_file=True) + + assert not text_file_path.exists() + assert zip_file_path.exists() and zip_file_path.is_file() + + unzip_file(zip_file_path, text_file_path, remove_file=True) + + assert not zip_file_path.exists() + assert text_file_path.exists() and text_file_path.is_file() + + with open(text_file_path, "r", encoding="utf-8") as file: + assert file.read() == "This is a testfile!" 
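The two helpers compose naturally: unzipping restores the file byte-for-byte, so a checksum taken before zipping matches one taken after the round trip. A minimal sketch of such a property test, using only the utilities imported above (the test name is illustrative, not part of this diff):

def test_checksum_survives_zip_roundtrip():
    with tempfile.TemporaryDirectory() as tempdir:
        tempdir_path = pathlib.Path(tempdir)

        text_file_path = tempdir_path / "roundtrip.txt"
        zip_file_path = tempdir_path / "roundtrip.zip"

        with open(text_file_path, "w", encoding="utf-8") as file:
            file.write("Checksums should survive a zip round trip!")

        # checksum of the original file
        checksum_before = calculate_checksum(text_file_path)

        # zip the file (removing the original), then restore it from the archive
        zip_file(text_file_path, zip_file_path, remove_file=True)
        unzip_file(zip_file_path, text_file_path, remove_file=True)

        # the restored file is byte-identical, so the checksums agree
        assert calculate_checksum(text_file_path) == checksum_before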
+ + +def test_read_tensor_from_bytes(): + buf = io.BytesIO() + buf.write(b"\x01\x00\x00\x00") + buf.write(b"\x02\x00\x00\x00") + buf.write(b"\x03\x00\x00\x00") + buf.write(b"\x04\x00\x00\x00") + buf.seek(0) + res = reconstruct_tensor_from_bytes(torch.ones((2, 2), dtype=torch.int32), buf.getvalue()) + + assert res[0, 0] == 1 and res[0, 1] == 2 and res[1, 0] == 3 and res[1, 1] == 4 + + buf.seek(0, io.SEEK_END) + buf.write(b"\xff\x00\x00\x00") + buf.write(b"\x0f\x00\x00\x00") + buf.seek(0) + + res = reconstruct_tensor_from_bytes(torch.ones((1, 6), dtype=torch.int32), buf.getvalue()) + assert ( + res[0, 0] == 1 and res[0, 1] == 2 and res[0, 2] == 3 and res[0, 3] == 4 and res[0, 4] == 255 and res[0, 5] == 15 + ) + + buf_floats = io.BytesIO() + buf_floats.write(b"\x00\x00\x00\x3f") + buf_floats.write(b"\x00\x00\x00\x3e") + + res = reconstruct_tensor_from_bytes(torch.ones((2, 1), dtype=torch.float32), buf_floats.getvalue()) + assert res[0, 0] == 1.0 / 2 and res[1, 0] == 1.0 / 8 + + +def test_get_tensor_byte_size(): + tensor = torch.ones((3, 3, 3), dtype=torch.int32) + assert get_tensor_byte_size(tensor) == 3 * 3 * 3 * 4 + + tensor = torch.ones((5, 5), dtype=torch.float64) * 5 + assert get_tensor_byte_size(tensor) == 5 * 5 * 8 + + tensor = torch.ones(10, dtype=torch.float32) + assert get_tensor_byte_size(tensor) == 40 diff --git a/modyn/trainer_server/internal/dataset/online_dataset.py b/modyn/trainer_server/internal/dataset/online_dataset.py index f0879319d..4132efdd3 100644 --- a/modyn/trainer_server/internal/dataset/online_dataset.py +++ b/modyn/trainer_server/internal/dataset/online_dataset.py @@ -14,7 +14,7 @@ ) from modyn.storage.internal.grpc.generated.storage_pb2_grpc import StorageStub from modyn.trainer_server.internal.dataset.key_sources import AbstractKeySource, SelectorKeySource -from modyn.utils.utils import ( +from modyn.utils import ( BYTES_PARSER_FUNC_NAME, MAX_MESSAGE_SIZE, deserialize_function, diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index e62f14fdd..72135930c 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,42 +14,41 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\x80\x07\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r 
\x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12!\n\x19num_prefetched_partitions\x18\x16 \x01(\x05\x12\"\n\x1aparallel_prefetch_requests\x18\x17 \x01(\x05\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x19 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 .trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xaf\x06\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x1c\n\x14use_pretrained_model\x18\x04 
\x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x05 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\x06 \x01(\x05\x12\x12\n\nbatch_size\x18\x07 \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x08 \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\t \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\n \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0b \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0c \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\r \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x0e \x03(\t\x12)\n\x0clr_scheduler\x18\x0f \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x11 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x12 \x01(\x05\x12!\n\x19num_prefetched_partitions\x18\x13 \x01(\x05\x12\"\n\x1aparallel_prefetch_requests\x18\x14 \x01(\x05\x12\x11\n\x04seed\x18\x15 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x16 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 .trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', _globals) +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _globals['_JSONSTRING']._serialized_start=33 - 
_globals['_JSONSTRING']._serialized_end=60 - _globals['_PYTHONSTRING']._serialized_start=62 - _globals['_PYTHONSTRING']._serialized_end=91 - _globals['_DATA']._serialized_start=93 - _globals['_DATA']._serialized_end=144 - _globals['_TRAINERAVAILABLEREQUEST']._serialized_start=146 - _globals['_TRAINERAVAILABLEREQUEST']._serialized_end=171 - _globals['_TRAINERAVAILABLERESPONSE']._serialized_start=173 - _globals['_TRAINERAVAILABLERESPONSE']._serialized_end=218 - _globals['_CHECKPOINTINFO']._serialized_start=220 - _globals['_CHECKPOINTINFO']._serialized_end=290 - _globals['_STARTTRAININGREQUEST']._serialized_start=293 - _globals['_STARTTRAININGREQUEST']._serialized_end=1189 - _globals['_STARTTRAININGRESPONSE']._serialized_start=1191 - _globals['_STARTTRAININGRESPONSE']._serialized_end=1261 - _globals['_TRAININGSTATUSREQUEST']._serialized_start=1263 - _globals['_TRAININGSTATUSREQUEST']._serialized_end=1307 - _globals['_TRAININGSTATUSRESPONSE']._serialized_start=1310 - _globals['_TRAININGSTATUSRESPONSE']._serialized_end=1732 - _globals['_STOREFINALMODELREQUEST']._serialized_start=1734 - _globals['_STOREFINALMODELREQUEST']._serialized_end=1779 - _globals['_STOREFINALMODELRESPONSE']._serialized_start=1781 - _globals['_STOREFINALMODELRESPONSE']._serialized_end=1845 - _globals['_GETLATESTMODELREQUEST']._serialized_start=1847 - _globals['_GETLATESTMODELREQUEST']._serialized_end=1891 - _globals['_GETLATESTMODELRESPONSE']._serialized_start=1893 - _globals['_GETLATESTMODELRESPONSE']._serialized_end=1958 - _globals['_TRAINERSERVER']._serialized_start=1961 - _globals['_TRAINERSERVER']._serialized_end=2418 + _JSONSTRING._serialized_start=33 + _JSONSTRING._serialized_end=60 + _PYTHONSTRING._serialized_start=62 + _PYTHONSTRING._serialized_end=91 + _DATA._serialized_start=93 + _DATA._serialized_end=144 + _TRAINERAVAILABLEREQUEST._serialized_start=146 + _TRAINERAVAILABLEREQUEST._serialized_end=171 + _TRAINERAVAILABLERESPONSE._serialized_start=173 + _TRAINERAVAILABLERESPONSE._serialized_end=218 + _CHECKPOINTINFO._serialized_start=220 + _CHECKPOINTINFO._serialized_end=290 + _STARTTRAININGREQUEST._serialized_start=293 + _STARTTRAININGREQUEST._serialized_end=1108 + _STARTTRAININGRESPONSE._serialized_start=1110 + _STARTTRAININGRESPONSE._serialized_end=1180 + _TRAININGSTATUSREQUEST._serialized_start=1182 + _TRAININGSTATUSREQUEST._serialized_end=1226 + _TRAININGSTATUSRESPONSE._serialized_start=1229 + _TRAININGSTATUSRESPONSE._serialized_end=1651 + _STOREFINALMODELREQUEST._serialized_start=1653 + _STOREFINALMODELREQUEST._serialized_end=1698 + _STOREFINALMODELRESPONSE._serialized_start=1700 + _STOREFINALMODELRESPONSE._serialized_end=1764 + _GETLATESTMODELREQUEST._serialized_start=1766 + _GETLATESTMODELREQUEST._serialized_end=1810 + _GETLATESTMODELRESPONSE._serialized_start=1812 + _GETLATESTMODELRESPONSE._serialized_end=1877 + _TRAINERSERVER._serialized_start=1880 + _TRAINERSERVER._serialized_end=2337 # @@protoc_insertion_point(module_scope) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index 9723ebdb8..b185cc7b1 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -115,9 +115,6 @@ class StartTrainingRequest(google.protobuf.message.Message): PIPELINE_ID_FIELD_NUMBER: builtins.int TRIGGER_ID_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int - AMP_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: 
builtins.int - MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int USE_PRETRAINED_MODEL_FIELD_NUMBER: builtins.int LOAD_OPTIMIZER_STATE_FIELD_NUMBER: builtins.int PRETRAINED_MODEL_ID_FIELD_NUMBER: builtins.int @@ -140,10 +137,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int trigger_id: builtins.int device: builtins.str - amp: builtins.bool - model_id: builtins.str - @property - def model_configuration(self) -> global___JsonString: ... use_pretrained_model: builtins.bool load_optimizer_state: builtins.bool pretrained_model_id: builtins.int @@ -179,9 +172,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int = ..., trigger_id: builtins.int = ..., device: builtins.str = ..., - amp: builtins.bool = ..., - model_id: builtins.str = ..., - model_configuration: global___JsonString | None = ..., use_pretrained_model: builtins.bool = ..., load_optimizer_state: builtins.bool = ..., pretrained_model_id: builtins.int = ..., @@ -202,8 +192,8 @@ class StartTrainingRequest(google.protobuf.message.Message): seed: builtins.int | None = ..., tokenizer: global___PythonString | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "model_id", b"model_id", "num_prefetched_partitions", b"num_prefetched_partitions", "parallel_prefetch_requests", b"parallel_prefetch_requests", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "num_prefetched_partitions", b"num_prefetched_partitions", "parallel_prefetch_requests", b"parallel_prefetch_requests", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]) -> typing_extensions.Literal["seed"] | None: ... @typing.overload diff --git a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py index a23802109..5d8ceb058 100644 --- a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py +++ b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py @@ -10,7 +10,8 @@ import torch # pylint: disable=no-name-in-module -from modyn.common.ftp import download_file, get_pretrained_model_callback +from modyn.common.ftp import download_trained_model +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( FetchModelRequest, FetchModelResponse, @@ -36,6 +37,7 @@ from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.trainer_server.internal.utils.training_process_info import TrainingProcessInfo from modyn.utils import current_time_millis, dynamic_module_import, grpc_connection_established +from modyn.utils.utils import calculate_checksum logger = logging.getLogger(__name__) @@ -95,13 +97,16 @@ def start_training( ) -> StartTrainingResponse: logger.info("Received start training request.") - if not hasattr(dynamic_module_import("modyn.models"), request.model_id): - logger.error(f"Model {request.model_id} not available!") + with MetadataDatabaseConnection(self._config) as database: + model_class_name, model_config, amp = database.get_model_configuration(request.pipeline_id) + + if not hasattr(dynamic_module_import("modyn.models"), model_class_name): + logger.error(f"Model {model_class_name} not available!") return StartTrainingResponse(training_started=False) pretrained_model_path: Optional[pathlib.Path] = None if request.use_pretrained_model: - fetch_request = FetchModelRequest(model_id=request.pretrained_model_id) + fetch_request = FetchModelRequest(model_id=request.pretrained_model_id, load_metadata=True) fetch_resp: FetchModelResponse = self.model_storage_stub.FetchModel(fetch_request) if not fetch_resp.success: @@ -115,19 +120,18 @@ def start_training( training_id = self._next_training_id self._next_training_id += 1 - pretrained_model_path = self._modyn_base_dir / pathlib.Path(f"pretrained_model_{training_id}.modyn") - - download_file( - hostname=self._config["model_storage"]["hostname"], - 
port=int(self._config["model_storage"]["ftp_port"]), - user="modyn", - password="modyn", - remote_file_path=pathlib.Path(fetch_resp.model_path), - local_file_path=pretrained_model_path, - callback=get_pretrained_model_callback(logger), + pretrained_model_path = download_trained_model( + logger=logger, + model_storage_config=self._config["model_storage"], + remote_path=pathlib.Path(fetch_resp.model_path), + checksum=fetch_resp.checksum, + identifier=training_id, + base_directory=self._modyn_base_dir, ) - logger.info(f"Completed pretrained model download. Local path: {pretrained_model_path}") + if not pretrained_model_path: + return StartTrainingResponse(training_started=False) + else: with self._lock: training_id = self._next_training_id @@ -138,6 +142,9 @@ def start_training( training_info = TrainingInfo( request, training_id, + model_class_name, + model_config, + amp, self._storage_address, self._selector_address, self._offline_dataset_directory, @@ -343,9 +350,9 @@ def store_final_model( logger.error(f"Training with id {training_id} is still running.") return StoreFinalModelResponse(valid_state=False) - final_model_path = self._training_dict[training_id].final_checkpoint_path / "model_final.modyn" - if final_model_path.exists(): - prefix_path = str(final_model_path.relative_to(self._modyn_base_dir)) + final_checkpoint_path = self._get_final_model_path(training_id) + if final_checkpoint_path: + prefix_model_path = final_checkpoint_path.relative_to(self._modyn_base_dir) pipeline_id = self._training_dict[training_id].pipeline_id trigger_id = self._training_dict[training_id].trigger_id @@ -355,24 +362,30 @@ def store_final_model( trigger_id=trigger_id, hostname=self._config["trainer_server"]["hostname"], port=int(self._config["trainer_server"]["ftp_port"]), - model_path=prefix_path, + model_path=str(prefix_model_path), + checksum=calculate_checksum(final_checkpoint_path), ) register_response: RegisterModelResponse = self.model_storage_stub.RegisterModel(register_request) if not register_response.success: - logger.error(f"Could not store final model from training id {training_id}.") + logger.error(f"Could not store final model from training id {training_id} at model storage.") return StoreFinalModelResponse(valid_state=False) - - os.remove(final_model_path) - - logger.info(f"Deleted final model at {final_model_path}") + final_checkpoint_path.unlink() + logger.info(f"Deleted final model on path {final_checkpoint_path}") return StoreFinalModelResponse(valid_state=True, model_id=register_response.model_id) logger.error(f"Could not find final checkpoint of training with ID {training_id}.") return StoreFinalModelResponse(valid_state=False) + def _get_final_model_path(self, training_id: int) -> Optional[pathlib.Path]: + final_checkpoint_path = self._training_dict[training_id].final_checkpoint_path / "model_final.modyn" + if not final_checkpoint_path.exists(): + return None + + return final_checkpoint_path + def get_latest_model( self, request: GetLatestModelRequest, diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index 5d4f69b80..5f8319fed 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -48,10 +48,10 @@ deserialize_function, dynamic_module_import, grpc_connection_established, + instantiate_class, package_available_and_can_be_imported, seed_everything, ) -from modyn.utils.utils import instantiate_class AvailableQueues = 
Enum("AvailableQueues", ["TRAINING", "DOWNSAMPLING"]) diff --git a/modyn/trainer_server/internal/utils/training_info.py b/modyn/trainer_server/internal/utils/training_info.py index 41465f691..c08ea27e2 100644 --- a/modyn/trainer_server/internal/utils/training_info.py +++ b/modyn/trainer_server/internal/utils/training_info.py @@ -17,6 +17,9 @@ def __init__( self, request: StartTrainingRequest, training_id: int, + model_class_name: str, + model_config: str, + amp: bool, storage_address: str, selector_address: str, offline_dataset_path: str, @@ -35,7 +38,7 @@ def __init__( self.epochs_per_trigger = request.epochs_per_trigger self.torch_optimizers_configuration = json.loads(request.torch_optimizers_configuration.value) - self.model_configuration_dict = json.loads(request.model_configuration.value) + self.model_configuration_dict = json.loads(model_config) self.criterion_dict = json.loads(request.criterion_parameters.value) self.grad_scaler_configuration = json.loads(request.grad_scaler_configuration.value) @@ -43,9 +46,9 @@ def __init__( self.bytes_parser = request.bytes_parser.value self.label_transformer = request.label_transformer.value - self.model_id = request.model_id + self.model_class_name = model_class_name model_module = dynamic_module_import("modyn.models") - self.model_handler = getattr(model_module, self.model_id) + self.model_handler = getattr(model_module, self.model_class_name) self.use_pretrained_model = request.use_pretrained_model self.load_optimizer_state = request.load_optimizer_state @@ -57,7 +60,7 @@ def __init__( self.batch_size = request.batch_size self.torch_criterion = request.torch_criterion - self.amp = request.amp + self.amp = amp self.lr_scheduler = json.loads(request.lr_scheduler.value) diff --git a/modyn/utils/__init__.py b/modyn/utils/__init__.py index 268ae1ac5..efe785d2e 100644 --- a/modyn/utils/__init__.py +++ b/modyn/utils/__init__.py @@ -12,20 +12,26 @@ LABEL_TRANSFORMER_FUNC_NAME, MAX_MESSAGE_SIZE, DownsamplingMode, + calculate_checksum, convert_timestr_to_seconds, current_time_millis, deserialize_function, dynamic_module_import, flatten, get_partition_for_worker, + get_tensor_byte_size, grpc_connection_established, + instantiate_class, is_directory_writable, model_available, package_available_and_can_be_imported, + reconstruct_tensor_from_bytes, seed_everything, trigger_available, + unzip_file, validate_timestr, validate_yaml, + zip_file, ) files = os.listdir(os.path.dirname(__file__)) diff --git a/modyn/utils/utils.py b/modyn/utils/utils.py index f1fb54193..0eff6e594 100644 --- a/modyn/utils/utils.py +++ b/modyn/utils/utils.py @@ -1,8 +1,10 @@ import errno +import hashlib import importlib import importlib.util import inspect import logging +import math import os import pathlib import random @@ -13,6 +15,7 @@ from inspect import isfunction from types import ModuleType from typing import Any, Callable, Optional +from zipfile import ZIP_DEFLATED, ZipFile import grpc import numpy as np @@ -239,3 +242,102 @@ def get_partition_for_worker(worker_id: int, total_workers: int, total_num_eleme start_index = 0 return start_index, worker_subset_size + + +def calculate_checksum(file_path: pathlib.Path, hash_func_name: str = "blake2b", chunk_num_blocks: int = 128) -> bytes: + """ + Returns the checksum of a file. + + Args: + file_path: the path to the file. + hash_func_name: the name of the hash function. + chunk_num_blocks: size of the update step. + + Returns: + bytes: the checksum that is calculated over the file. 
+ """ + assert file_path.exists() and file_path.is_file() + + hash_func = hashlib.new(hash_func_name) + with open(file_path, "rb") as file: + while chunk := file.read(chunk_num_blocks * hash_func.block_size): + hash_func.update(chunk) + return hash_func.digest() + + +def zip_file( + file_path: pathlib.Path, zipped_file_path: pathlib.Path, compression: int = ZIP_DEFLATED, remove_file: bool = False +) -> None: + """ + Zips a file. + + Args: + file_path: the path to the file that should be zipped. + zipped_file_path: the path to the zipped file. + compression: the compression algorithm to be used. + remove_file: if the file should be removed after zipping. + """ + assert file_path.exists(), "Cannot work with non-existing file" + + with ZipFile(zipped_file_path, "w", compression=compression) as zipfile: + zipfile.write(file_path) + + if remove_file: + os.remove(file_path) + + +def unzip_file( + zipped_file_path: pathlib.Path, file_path: pathlib.Path, compression: int = ZIP_DEFLATED, remove_file: bool = False +) -> None: + """ + Unzips a file. + + Args: + zipped_file_path: path to the zipped file. + file_path: path pointing to the location where the unzipped file should be stored. + compression: the compression algorithm to be used. + remove_file: true if we should remove the zipped file afterwards. + """ + with ZipFile(zipped_file_path, "r", compression=compression) as zipfile: + assert len(zipfile.namelist()) == 1 + + with open(file_path, "wb") as file: + file.write(zipfile.read(zipfile.namelist()[0])) + + if remove_file: + os.remove(zipped_file_path) + + +def reconstruct_tensor_from_bytes(tensor: torch.Tensor, buffer: bytes) -> torch.Tensor: + """ + Reconstruct a tensor from bytes. + + Args: + tensor: the template for the reconstructed tensor. + buffer: the serialized tensor information. + + Returns: + Tensor: the reconstructed tensor. + """ + reconstructed_tensor = torch.frombuffer(buffer, dtype=tensor.dtype) + return torch.reshape(reconstructed_tensor, tensor.shape) + + +def get_tensor_byte_size(tensor: torch.Tensor) -> int: + """ + Get the amount of bytes needed to represent a tensor in binary format. + + Args: + tensor: the tensor, for which the number of bytes is calculated. + + Returns: + int: the number of bytes needed to represent the tensor. + """ + shape = tensor.shape + if torch.is_floating_point(tensor): + type_size = torch.finfo(tensor.dtype).bits / 8 + else: + type_size = torch.iinfo(tensor.dtype).bits / 8 + num_bytes = int(math.prod(shape) * type_size) + + return num_bytes