From d591f39f6c597423c8c61f199a0efafc52018c7f Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Thu, 20 Jul 2023 14:57:25 +0200 Subject: [PATCH 01/12] Add model information to pipeline database model --- docker-compose.yml | 1 + .../integrationtest_metadata_processor.py | 2 +- .../integrationtest_model_storage.py | 9 ++- .../selector/integrationtest_selector.py | 63 ++++++++++++--- .../metadata_database_connection.py | 19 ++++- modyn/metadata_database/models/pipelines.py | 4 +- modyn/protos/selector.proto | 2 + modyn/protos/trainer_server.proto | 2 - .../internal/grpc/generated/selector_pb2.py | 78 +++++++++---------- .../internal/grpc/generated/selector_pb2.pyi | 11 ++- .../internal/grpc/selector_grpc_servicer.py | 4 +- modyn/selector/internal/selector_manager.py | 4 +- modyn/supervisor/internal/grpc_handler.py | 14 ++-- .../models/test_pipelines.py | 27 ++++--- .../test_metadata_database_connection.py | 21 ++++- .../grpc/test_model_storage_grpc_servicer.py | 5 +- .../grpc/test_selector_grpc_servicer.py | 9 ++- .../internal/test_selector_manager.py | 17 ++-- .../supervisor/internal/test_grpc_handler.py | 15 +++- .../grpc/test_trainer_server_grpc_servicer.py | 58 +++++++++++--- .../internal/trainer/test_pytorch_trainer.py | 4 +- .../grpc/generated/trainer_server_pb2.py | 36 ++++----- .../grpc/generated/trainer_server_pb2.pyi | 11 +-- .../grpc/trainer_server_grpc_servicer.py | 11 ++- .../internal/utils/training_info.py | 6 +- 25 files changed, 295 insertions(+), 138 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 052870785..77a640106 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -82,6 +82,7 @@ services: - storage - selector - model_storage + - metadata-db build: context: . 
dockerfile: docker/Trainer_Server/Dockerfile diff --git a/integrationtests/metadata_processor/integrationtest_metadata_processor.py b/integrationtests/metadata_processor/integrationtest_metadata_processor.py index 096105d2f..5e4b9137a 100644 --- a/integrationtests/metadata_processor/integrationtest_metadata_processor.py +++ b/integrationtests/metadata_processor/integrationtest_metadata_processor.py @@ -49,7 +49,7 @@ def get_grpc_channel(config: dict, component: str) -> grpc.Channel: def send_metadata_and_check_database(processor_client: MetadataProcessorClient, config: dict) -> int: with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2) + pipeline_id = database.register_pipeline(2, "ResNet18", "{}") req = TrainingMetadataRequest( pipeline_id=pipeline_id, diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index 70299dbb0..e14dd3647 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -1,4 +1,5 @@ # end-to-end testing of the model storage component +import json import pathlib import shutil @@ -68,7 +69,7 @@ def delete_dummy_file_from_trainer(config: dict): def insert_trigger_into_database(config: dict) -> (int, int): with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2) + pipeline_id = database.register_pipeline(2, "ResNet18", json.dumps({"num_classes": 10})) trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) database.session.add(trigger) @@ -89,6 +90,12 @@ def test_model_storage(config: dict): # register pipeline and trigger pipeline_id, trigger_id = insert_trigger_into_database(config) + with MetadataDatabaseConnection(config) as database: + model_id, model_config = database.get_model_configuration(pipeline_id) + + assert model_id == "ResNet18" + assert json.loads(model_config) == 
{"num_classes": 10} + model_storage_channel = connect_to_model_storage(config) model_storage = ModelStorageStub(model_storage_channel) diff --git a/integrationtests/selector/integrationtest_selector.py b/integrationtests/selector/integrationtest_selector.py index bf2efb152..23af71993 100644 --- a/integrationtests/selector/integrationtest_selector.py +++ b/integrationtests/selector/integrationtest_selector.py @@ -44,7 +44,12 @@ def test_label_balanced_presampling_huge() -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id trigger_id = selector.inform_data_and_trigger( @@ -124,7 +129,12 @@ def test_label_balanced_force_same_size(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id # now we just have 2 classes with 4 samples each @@ -208,7 +218,12 @@ def test_label_balanced_force_all_samples(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id # same classes as before @@ -298,7 +313,12 @@ def test_newdata() -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + 
RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id selector.inform_data( @@ -437,7 +457,12 @@ def test_abstract_downsampler(reset_after_trigger) -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id selector.inform_data( @@ -586,7 +611,12 @@ def test_empty_triggers() -> None: } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id selector.inform_data( @@ -754,7 +784,12 @@ def test_many_samples_evenly_distributed(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id selector.inform_data( @@ -824,7 +859,12 @@ def test_many_samples_unevenly_distributed(): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id selector.inform_data( @@ -895,7 +935,12 @@ 
def test_get_available_labels(reset_after_trigger: bool): } pipeline_id = selector.register_pipeline( - RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config))) + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value=json.dumps(strategy_config)), + model_id="ResNet10", + model_configuration=JsonString(value="{}"), + ) ).pipeline_id selector.inform_data( diff --git a/modyn/metadata_database/metadata_database_connection.py b/modyn/metadata_database/metadata_database_connection.py index ac337bc1f..ab8ec6dd0 100644 --- a/modyn/metadata_database/metadata_database_connection.py +++ b/modyn/metadata_database/metadata_database_connection.py @@ -67,16 +67,17 @@ def create_tables(self) -> None: """ MetadataBase.metadata.create_all(self.engine) - def register_pipeline(self, num_workers: int) -> int: + def register_pipeline(self, num_workers: int, model_id: str, model_config: str) -> int: """Register a new pipeline in the database. Args: num_workers (int): Number of workers in the pipeline. - + model_id (str): the model name that is used by the pipeline. + model_config (str): the serialized model configuration options. Returns: int: Id of the newly created pipeline. """ - pipeline = Pipeline(num_workers=num_workers) + pipeline = Pipeline(num_workers=num_workers, model_id=model_id, model_config=model_config) self.session.add(pipeline) self.session.commit() pipeline_id = pipeline.pipeline_id @@ -111,3 +112,15 @@ def add_trained_model(self, pipeline_id: int, trigger_id: int, model_path: str) self.session.commit() model_id = trained_model.model_id return model_id + + def get_model_configuration(self, pipeline_id: int) -> (str, str): + """Get the model id and its configuration options for a given pipeline. + + Args: + pipeline_id: id of the pipeline from which we want to extract the model. + + Returns: + (str, str): the model id and its configuration options. 
+ """ + pipeline: Pipeline = self.session.query(Pipeline).get(pipeline_id) + return pipeline.model_id, pipeline.model_config diff --git a/modyn/metadata_database/models/pipelines.py b/modyn/metadata_database/models/pipelines.py index 4094b3f95..7d78666f5 100644 --- a/modyn/metadata_database/models/pipelines.py +++ b/modyn/metadata_database/models/pipelines.py @@ -1,7 +1,7 @@ """Pipeline model.""" from modyn.metadata_database.metadata_base import MetadataBase -from sqlalchemy import Column, Integer +from sqlalchemy import Column, Integer, String class Pipeline(MetadataBase): @@ -12,6 +12,8 @@ class Pipeline(MetadataBase): __table_args__ = {"extend_existing": True} pipeline_id = Column("pipeline_id", Integer, primary_key=True) num_workers = Column("num_workers", Integer, nullable=False) + model_id = Column("model_id", String(length=50), nullable=False) + model_config = Column("model_config", String(length=500), nullable=False) def __repr__(self) -> str: """Return string representation.""" diff --git a/modyn/protos/selector.proto b/modyn/protos/selector.proto index b72a81eb8..5c41c0b54 100644 --- a/modyn/protos/selector.proto +++ b/modyn/protos/selector.proto @@ -36,6 +36,8 @@ message TriggerResponse { int32 trigger_id = 1; } message RegisterPipelineRequest { int32 num_workers = 1; JsonString selection_strategy = 2; + string model_id = 3; + JsonString model_configuration = 4; } message PipelineResponse { int32 pipeline_id = 1; } diff --git a/modyn/protos/trainer_server.proto b/modyn/protos/trainer_server.proto index 800bf3d9a..6698b7617 100644 --- a/modyn/protos/trainer_server.proto +++ b/modyn/protos/trainer_server.proto @@ -36,8 +36,6 @@ message StartTrainingRequest { int32 trigger_id = 2; string device = 3; bool amp = 4; - string model_id = 5; - JsonString model_configuration = 6; bool use_pretrained_model = 7; bool load_optimizer_state = 8; int32 pretrained_model_id = 9; diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.py 
b/modyn/selector/internal/grpc/generated/selector_pb2.py index 8091fd68e..2bec54a51 100644 --- a/modyn/selector/internal/grpc/generated/selector_pb2.py +++ b/modyn/selector/internal/grpc/generated/selector_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"%\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\"`\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 
\x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xe9\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12=\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x0f.selector.Empty\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a .selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 
\x01(\t\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"%\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\"\xa5\x01\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\x12\x10\n\x08model_id\x18\x03 \x01(\t\x12\x31\n\x13model_configuration\x18\x04 \x01(\x0b\x32\x14.selector.JsonString\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 
\x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xe9\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12=\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x0f.selector.Empty\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a .selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'selector_pb2', globals()) @@ -29,42 +29,42 @@ _DATAINFORMREQUEST._serialized_end=156 _TRIGGERRESPONSE._serialized_start=158 _TRIGGERRESPONSE._serialized_end=195 - _REGISTERPIPELINEREQUEST._serialized_start=197 - _REGISTERPIPELINEREQUEST._serialized_end=293 - _PIPELINERESPONSE._serialized_start=295 - _PIPELINERESPONSE._serialized_end=334 - _GETSAMPLESREQUEST._serialized_start=336 - 
_GETSAMPLESREQUEST._serialized_end=437 - _SAMPLESRESPONSE._serialized_start=439 - _SAMPLESRESPONSE._serialized_end=523 - _GETNUMBEROFSAMPLESREQUEST._serialized_start=525 - _GETNUMBEROFSAMPLESREQUEST._serialized_end=593 - _NUMBEROFSAMPLESRESPONSE._serialized_start=595 - _NUMBEROFSAMPLESRESPONSE._serialized_end=641 - _GETSTATUSBARSCALEREQUEST._serialized_start=643 - _GETSTATUSBARSCALEREQUEST._serialized_end=690 - _STATUSBARSCALERESPONSE._serialized_start=692 - _STATUSBARSCALERESPONSE._serialized_end=742 - _GETNUMBEROFPARTITIONSREQUEST._serialized_start=744 - _GETNUMBEROFPARTITIONSREQUEST._serialized_end=815 - _NUMBEROFPARTITIONSRESPONSE._serialized_start=817 - _NUMBEROFPARTITIONSRESPONSE._serialized_end=869 - _GETAVAILABLELABELSREQUEST._serialized_start=871 - _GETAVAILABLELABELSREQUEST._serialized_end=919 - _AVAILABLELABELSRESPONSE._serialized_start=921 - _AVAILABLELABELSRESPONSE._serialized_end=972 - _GETSELECTIONSTRATEGYREQUEST._serialized_start=974 - _GETSELECTIONSTRATEGYREQUEST._serialized_end=1024 - _SELECTIONSTRATEGYRESPONSE._serialized_start=1027 - _SELECTIONSTRATEGYRESPONSE._serialized_end=1157 - _USESWEIGHTSREQUEST._serialized_start=1159 - _USESWEIGHTSREQUEST._serialized_end=1200 - _USESWEIGHTSRESPONSE._serialized_start=1202 - _USESWEIGHTSRESPONSE._serialized_end=1245 - _SEEDSELECTORREQUEST._serialized_start=1247 - _SEEDSELECTORREQUEST._serialized_end=1282 - _SEEDSELECTORRESPONSE._serialized_start=1284 - _SEEDSELECTORRESPONSE._serialized_end=1323 - _SELECTOR._serialized_start=1326 - _SELECTOR._serialized_end=2327 + _REGISTERPIPELINEREQUEST._serialized_start=198 + _REGISTERPIPELINEREQUEST._serialized_end=363 + _PIPELINERESPONSE._serialized_start=365 + _PIPELINERESPONSE._serialized_end=404 + _GETSAMPLESREQUEST._serialized_start=406 + _GETSAMPLESREQUEST._serialized_end=507 + _SAMPLESRESPONSE._serialized_start=509 + _SAMPLESRESPONSE._serialized_end=593 + _GETNUMBEROFSAMPLESREQUEST._serialized_start=595 + _GETNUMBEROFSAMPLESREQUEST._serialized_end=663 + 
_NUMBEROFSAMPLESRESPONSE._serialized_start=665 + _NUMBEROFSAMPLESRESPONSE._serialized_end=711 + _GETSTATUSBARSCALEREQUEST._serialized_start=713 + _GETSTATUSBARSCALEREQUEST._serialized_end=760 + _STATUSBARSCALERESPONSE._serialized_start=762 + _STATUSBARSCALERESPONSE._serialized_end=812 + _GETNUMBEROFPARTITIONSREQUEST._serialized_start=814 + _GETNUMBEROFPARTITIONSREQUEST._serialized_end=885 + _NUMBEROFPARTITIONSRESPONSE._serialized_start=887 + _NUMBEROFPARTITIONSRESPONSE._serialized_end=939 + _GETAVAILABLELABELSREQUEST._serialized_start=941 + _GETAVAILABLELABELSREQUEST._serialized_end=989 + _AVAILABLELABELSRESPONSE._serialized_start=991 + _AVAILABLELABELSRESPONSE._serialized_end=1042 + _GETSELECTIONSTRATEGYREQUEST._serialized_start=1044 + _GETSELECTIONSTRATEGYREQUEST._serialized_end=1094 + _SELECTIONSTRATEGYRESPONSE._serialized_start=1097 + _SELECTIONSTRATEGYRESPONSE._serialized_end=1227 + _USESWEIGHTSREQUEST._serialized_start=1229 + _USESWEIGHTSREQUEST._serialized_end=1270 + _USESWEIGHTSRESPONSE._serialized_start=1272 + _USESWEIGHTSRESPONSE._serialized_end=1315 + _SEEDSELECTORREQUEST._serialized_start=1317 + _SEEDSELECTORREQUEST._serialized_end=1352 + _SEEDSELECTORRESPONSE._serialized_start=1354 + _SEEDSELECTORRESPONSE._serialized_end=1393 + _SELECTOR._serialized_start=1396 + _SELECTOR._serialized_end=2397 # @@protoc_insertion_point(module_scope) diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.pyi b/modyn/selector/internal/grpc/generated/selector_pb2.pyi index 41439c968..3639fd9cc 100644 --- a/modyn/selector/internal/grpc/generated/selector_pb2.pyi +++ b/modyn/selector/internal/grpc/generated/selector_pb2.pyi @@ -89,17 +89,24 @@ class RegisterPipelineRequest(google.protobuf.message.Message): NUM_WORKERS_FIELD_NUMBER: builtins.int SELECTION_STRATEGY_FIELD_NUMBER: builtins.int + MODEL_ID_FIELD_NUMBER: builtins.int + MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int num_workers: builtins.int @property def selection_strategy(self) -> global___JsonString: 
... + model_id: builtins.str + @property + def model_configuration(self) -> global___JsonString: ... def __init__( self, *, num_workers: builtins.int = ..., selection_strategy: global___JsonString | None = ..., + model_id: builtins.str = ..., + model_configuration: global___JsonString | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["selection_strategy", b"selection_strategy"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "selection_strategy", b"selection_strategy"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "model_id", b"model_id", "num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... global___RegisterPipelineRequest = RegisterPipelineRequest diff --git a/modyn/selector/internal/grpc/selector_grpc_servicer.py b/modyn/selector/internal/grpc/selector_grpc_servicer.py index bee70fe58..64dd109c9 100644 --- a/modyn/selector/internal/grpc/selector_grpc_servicer.py +++ b/modyn/selector/internal/grpc/selector_grpc_servicer.py @@ -49,7 +49,9 @@ def __init__(self, selector_manager: SelectorManager, sample_batch_size: int): def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.ServicerContext) -> PipelineResponse: logger.info(f"Registering pipeline with request - {str(request)}") - pipeline_id = self.selector_manager.register_pipeline(request.num_workers, request.selection_strategy.value) + pipeline_id = self.selector_manager.register_pipeline( + request.num_workers, request.selection_strategy.value, request.model_id, request.model_configuration.value + ) return PipelineResponse(pipeline_id=pipeline_id) def get_sample_keys_and_weights( # pylint: 
disable-next=unused-argument diff --git a/modyn/selector/internal/selector_manager.py b/modyn/selector/internal/selector_manager.py index a325d9b15..13762135c 100644 --- a/modyn/selector/internal/selector_manager.py +++ b/modyn/selector/internal/selector_manager.py @@ -57,7 +57,7 @@ def _init_trigger_sample_directory(self) -> None: + f"Directory info: {os.stat(trigger_sample_directory)}" ) - def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: + def register_pipeline(self, num_workers: int, selection_strategy: str, model_id: str, model_config: str) -> int: """ Registers a new pipeline at the Selector. Returns: @@ -70,7 +70,7 @@ def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: with self._next_pipeline_lock: with MetadataDatabaseConnection(self._modyn_config) as database: - pipeline_id = database.register_pipeline(num_workers) + pipeline_id = database.register_pipeline(num_workers, model_id, model_config) selection_strategy = self._instantiate_strategy(json.loads(selection_strategy), pipeline_id) selector = Selector(selection_strategy, pipeline_id, num_workers, self._selector_cache_size) diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index 0919ef8b1..df49731d8 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -190,12 +190,19 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: if not self.connected_to_selector: raise ConnectionError("Tried to register pipeline at selector, but no connection was made.") + if "config" in pipeline_config["model"]: + model_config = json.dumps(pipeline_config["model"]["config"]) + else: + model_config = "{}" + pipeline_id = self.selector.register_pipeline( RegisterPipelineRequest( num_workers=pipeline_config["training"]["dataloader_workers"], selection_strategy=SelectorJsonString( value=json.dumps(pipeline_config["training"]["selection_strategy"]) ), + 
model_id=pipeline_config["model"]["id"], + model_configuration=SelectorJsonString(value=model_config), ) ).pipeline_id @@ -255,11 +262,6 @@ def start_training( if not self.connected_to_trainer_server: raise ConnectionError("Tried to start training at trainer server, but not there is no gRPC connection.") - if "config" in pipeline_config["model"]: - model_config = json.dumps(pipeline_config["model"]["config"]) - else: - model_config = "{}" - optimizers_config = {} for optimizer in pipeline_config["training"]["optimizers"]: optimizer_config = {} @@ -328,8 +330,6 @@ def start_training( "trigger_id": trigger_id, "device": pipeline_config["training"]["device"], "amp": amp, - "model_id": pipeline_config["model"]["id"], - "model_configuration": TrainerServerJsonString(value=model_config), "use_pretrained_model": previous_model_id is not None, "pretrained_model_id": previous_model_id or -1, "load_optimizer_state": False, # TODO(#137): Think about this. diff --git a/modyn/tests/metadata_database/models/test_pipelines.py b/modyn/tests/metadata_database/models/test_pipelines.py index ba618fec2..d270aa682 100644 --- a/modyn/tests/metadata_database/models/test_pipelines.py +++ b/modyn/tests/metadata_database/models/test_pipelines.py @@ -1,4 +1,6 @@ # pylint: disable=redefined-outer-name +import json + import pytest from modyn.metadata_database.models import Pipeline from sqlalchemy import create_engine @@ -19,20 +21,20 @@ def session(): def test_add_pipeline(session): - pipeline = Pipeline( - num_workers=10, - ) + pipeline = Pipeline(num_workers=10, model_id="ResNet18", model_config=json.dumps({"num_classes": 10})) session.add(pipeline) session.commit() - assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() is not None - assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 10 + extracted_pipeline: Pipeline = session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() + + assert extracted_pipeline is not None + assert 
extracted_pipeline.num_workers == 10 + assert extracted_pipeline.model_id == "ResNet18" + assert json.loads(extracted_pipeline.model_config)["num_classes"] == 10 def test_update_pipeline(session): - pipeline = Pipeline( - num_workers=10, - ) + pipeline = Pipeline(num_workers=10, model_id="ResNet18", model_config="{}") session.add(pipeline) session.commit() @@ -42,11 +44,14 @@ def test_update_pipeline(session): assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() is not None assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 20 + pipeline.model_id = "test_model" + session.commit() + + assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().model_id == "test_model" + def test_delete_pipeline(session): - pipeline = Pipeline( - num_workers=10, - ) + pipeline = Pipeline(num_workers=10, model_id="ResNet18", model_config="{}") session.add(pipeline) session.commit() diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py b/modyn/tests/metadata_database/test_metadata_database_connection.py index eb96e579d..1cbae327a 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -1,3 +1,5 @@ +import json + from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import TrainedModel, Trigger @@ -24,16 +26,16 @@ def test_database_connection(): def test_register_pipeline(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "ResNet18", "{}") assert pipeline_id == 1 - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "ResNet18", "{}") assert pipeline_id == 2 def test_add_trained_model(): with MetadataDatabaseConnection(get_minimal_modyn_config()) 
as database: database.create_tables() - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "ResNet18", "{}") trigger = Trigger(pipeline_id=pipeline_id, trigger_id=5) database.session.add(trigger) @@ -49,3 +51,16 @@ def test_add_trained_model(): assert model.model_id == 1 assert model.model_path == "test_path.modyn" assert model.pipeline_id == 1 and model.trigger_id == 5 + + +def test_get_model_configuration(): + with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: + database.create_tables() + pipeline_id = database.register_pipeline(1, "ResNet18", json.dumps({"num_classes": 10})) + + assert pipeline_id == 1 + + model_id, model_config = database.get_model_configuration(pipeline_id) + + assert model_id == "ResNet18" + assert json.loads(model_config) == {"num_classes": 10} diff --git a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py index 8e13a90f5..5fb1a1593 100644 --- a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py +++ b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py @@ -1,3 +1,4 @@ +import json import os import pathlib import shutil @@ -43,13 +44,13 @@ def setup(): with MetadataDatabaseConnection(get_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "ResNet18", json.dumps({"num_classes": 10})) trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) database.session.add(trigger) database.session.commit() - pipeline2 = database.register_pipeline(4) + pipeline2 = database.register_pipeline(4, "ResNet18", json.dumps({"num_classes": 10})) trigger2 = Trigger(trigger_id=50, pipeline_id=pipeline2) database.session.add(trigger2) diff --git a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py 
b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py index fecc6a14f..8af446797 100644 --- a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py +++ b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py @@ -49,13 +49,18 @@ def test_register_pipeline(test_register_pipeline: MagicMock): config["selector"]["trigger_sample_directory"] = tmp_dir mgr = SelectorManager(config) servicer = SelectorGRPCServicer(mgr, 8096) - request = RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value="strat")) + request = RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value="strat"), + model_id="ResNet18", + model_configuration=JsonString(value="{}"), + ) test_register_pipeline.return_value = 42 response: PipelineResponse = servicer.register_pipeline(request, None) assert response.pipeline_id == 42 - test_register_pipeline.assert_called_once_with(2, "strat") + test_register_pipeline.assert_called_once_with(2, "strat", "ResNet18", "{}") @patch.object(SelectorManager, "init_metadata_db", noop_init_metadata_db) diff --git a/modyn/tests/selector/internal/test_selector_manager.py b/modyn/tests/selector/internal/test_selector_manager.py index 045acbdcf..7bb72a62d 100644 --- a/modyn/tests/selector/internal/test_selector_manager.py +++ b/modyn/tests/selector/internal/test_selector_manager.py @@ -44,7 +44,8 @@ class MockDatabaseConnection: def __init__(self, modyn_config: dict): # pylint: disable=super-init-not-called,unused-argument self.current_pipeline_id = 0 - def register_pipeline(self, number_of_workers: int) -> Optional[int]: # pylint: disable=unused-argument + # pylint: disable=unused-argument + def register_pipeline(self, number_of_workers: int, model_id: int, model_config: dict) -> Optional[int]: pid = self.current_pipeline_id self.current_pipeline_id += 1 return pid @@ -89,13 +90,13 @@ def test_register_pipeline(test__instantiate_strategy: MagicMock): assert len(selec._selectors) == 0 - assert 
selec.register_pipeline(42, "{}") == 0 + assert selec.register_pipeline(42, "{}", "RestNet18", "{}") == 0 assert len(selec._selectors) == 1 assert isinstance(selec._selectors[0]._strategy, MockStrategy) with pytest.raises(ValueError): - selec.register_pipeline(0, "strat") + selec.register_pipeline(0, "strat", "RestNet18", "{}") @patch("modyn.selector.internal.selector_manager.MetadataDatabaseConnection", MockDatabaseConnection) @@ -111,7 +112,7 @@ def test_get_sample_keys_and_weights( selec = SelectorManager(config) test__instantiate_strategy.return_value = MockStrategy() - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") with pytest.raises(ValueError): # Non existing pipeline @@ -142,7 +143,7 @@ def test_inform_data(selector_inform_data: MagicMock, test__instantiate_strategy with pytest.raises(ValueError): selec.inform_data(0, [10], [0], [0]) - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") selector_inform_data.return_value = None selec.inform_data(pipe_id, [10], [0], [0]) @@ -164,7 +165,7 @@ def test_inform_data_and_trigger(selector_inform_data_and_trigger: MagicMock, te with pytest.raises(ValueError): selec.inform_data_and_trigger(0, [10], [0], [0]) - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") selector_inform_data_and_trigger.return_value = None selec.inform_data_and_trigger(pipe_id, [10], [0], [0]) @@ -183,7 +184,7 @@ def test_get_available_labels(selector_get_available_labels: MagicMock, test__in selector = SelectorManager(config) test__instantiate_strategy.return_value = MockStrategy() - pipe_id = selector.register_pipeline(2, "{}") + pipe_id = selector.register_pipeline(2, "{}", "RestNet18", "{}") selector_get_available_labels.return_value = None selector.get_available_labels(pipe_id) @@ -227,7 +228,7 @@ def test_get_number_of_samples(selector_get_number_of_samples: MagicMock, 
test__ with pytest.raises(ValueError): selec.get_number_of_samples(0, 0) - pipe_id = selec.register_pipeline(2, "{}") + pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") selector_get_number_of_samples.return_value = 12 assert selec.get_number_of_samples(pipe_id, 21) == 12 diff --git a/modyn/tests/supervisor/internal/test_grpc_handler.py b/modyn/tests/supervisor/internal/test_grpc_handler.py index 2468629cc..a2ae55118 100644 --- a/modyn/tests/supervisor/internal/test_grpc_handler.py +++ b/modyn/tests/supervisor/internal/test_grpc_handler.py @@ -335,11 +335,22 @@ def test_register_pipeline_at_selector(test_grpc_connection_established): mock.return_value = PipelineResponse(pipeline_id=42) result = handler.register_pipeline_at_selector( - {"pipeline": {"name": "test"}, "training": {"dataloader_workers": 2, "selection_strategy": {}}} + { + "pipeline": {"name": "test"}, + "training": {"dataloader_workers": 2, "selection_strategy": {}}, + "model": {"id": "ResNet18"}, + } ) assert result == 42 - mock.assert_called_once_with(RegisterPipelineRequest(num_workers=2, selection_strategy=JsonString(value="{}"))) + mock.assert_called_once_with( + RegisterPipelineRequest( + num_workers=2, + selection_strategy=JsonString(value="{}"), + model_id="ResNet18", + model_configuration=JsonString(value="{}"), + ) + ) def test_unregister_pipeline_at_selector(): diff --git a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py index 4a53f0ed3..a4c31ba43 100644 --- a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py +++ b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py @@ -12,6 +12,7 @@ from unittest.mock import MagicMock, patch import torch +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( 
FetchModelRequest, FetchModelResponse, @@ -35,6 +36,8 @@ from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.trainer_server.internal.utils.training_process_info import TrainingProcessInfo +DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_trainer_server.database" + trainer_available_request = TrainerAvailableRequest() get_status_request = TrainingStatusRequest(training_id=1) store_final_model_request = StoreFinalModelRequest(training_id=1) @@ -47,6 +50,14 @@ "ftp_port": "3001", "offline_dataset_directory": "/tmp/offline_dataset", }, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, "storage": {"hostname": "storage", "port": "5002"}, "selector": {"hostname": "selector", "port": "5003"}, "model_storage": {"hostname": "model_storage", "port": "5004"}, @@ -59,12 +70,34 @@ "ftp_port": "3001", "offline_dataset_directory": "/tmp/offline_dataset", }, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, "storage": {"hostname": "storage", "port": "5002"}, "selector": {"hostname": "selector", "port": "5003"}, "model_storage": {"hostname": "localhost", "port": "5004", "ftp_port": "3002"}, } +def setup(): + if os.path.exists(DATABASE): + os.remove(DATABASE) + + with MetadataDatabaseConnection(modyn_config) as database: + database.create_tables() + + database.register_pipeline(1, "model", json.dumps({})) + + +def teardown(): + os.remove(DATABASE) + + class DummyModelStorageStub: # pylint: disable-next=invalid-name def FetchModel(self, request: FetchModelRequest) -> FetchModelResponse: @@ -104,13 +137,12 @@ def get_training_process_info(): return training_process_info -def get_start_training_request(checkpoint_path="", valid_model=True): +def get_start_training_request(checkpoint_path=""): return StartTrainingRequest( pipeline_id=1, 
trigger_id=1, device="cpu", amp=False, - model_id="model" if valid_model else "unknown", batch_size=32, torch_optimizers_configuration=JsonString( value=json.dumps( @@ -118,7 +150,6 @@ def get_start_training_request(checkpoint_path="", valid_model=True): ) ), torch_criterion="CrossEntropyLoss", - model_configuration=JsonString(value=json.dumps({})), criterion_parameters=JsonString(value=json.dumps({})), data_info=Data(dataset_id="Dataset", num_dataloaders=1), checkpoint_info=CheckpointInfo(checkpoint_interval=10, checkpoint_path=checkpoint_path), @@ -142,7 +173,14 @@ def get_training_info( request = get_start_training_request(temp) offline_dataset_path = "/tmp/offline_dataset" training_info = TrainingInfo( - request, training_id, storage_address, selector_address, offline_dataset_path, pathlib.Path(final_temp) + request, + training_id, + "model", + json.dumps({}), + storage_address, + selector_address, + offline_dataset_path, + pathlib.Path(final_temp), ) return training_info @@ -181,7 +219,7 @@ def test_trainer_not_available(test_is_alive, test_connect_to_model_storage): def test_start_training_invalid(test_hasattr, test_connect_to_model_storage): with tempfile.TemporaryDirectory() as modyn_temp: trainer_server = TrainerServerGRPCServicer(modyn_config, modyn_temp) - response = trainer_server.start_training(get_start_training_request(valid_model=False), None) + response = trainer_server.start_training(get_start_training_request(), None) assert not response.training_started assert not trainer_server._training_dict assert trainer_server._next_training_id == 0 @@ -192,7 +230,7 @@ def test_start_training_invalid(test_hasattr, test_connect_to_model_storage): def test_start_training_invalid_id(test_hasattr, test_connect_to_model_storage): with tempfile.TemporaryDirectory() as modyn_temp: trainer_server = TrainerServerGRPCServicer(modyn_config, modyn_temp) - req = get_start_training_request(valid_model=True) + req = get_start_training_request() req.use_pretrained_model = 
True req.pretrained_model_id = 15 resp = trainer_server.start_training(req, None) @@ -215,16 +253,18 @@ def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storag mock_start.side_effect = noop trainer_server._training_dict[1] = None with patch("multiprocessing.Process.start", mock_start): - trainer_server.start_training(get_start_training_request(valid_model=True), None) + trainer_server.start_training(get_start_training_request(), None) assert 0 in trainer_server._training_process_dict assert trainer_server._next_training_id == 1 # start new training - trainer_server.start_training(get_start_training_request(valid_model=True), None) + trainer_server.start_training(get_start_training_request(), None) assert 1 in trainer_server._training_process_dict assert trainer_server._next_training_id == 2 + assert trainer_server._training_dict[1].model_id == "model" + assert trainer_server._training_dict[1].model_configuration_dict == {} - request = get_start_training_request(valid_model=True) + request = get_start_training_request() request.use_pretrained_model = True request.pretrained_model_id = 10 diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index 7f57d4b2e..d22e77515 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -204,9 +204,7 @@ def get_training_info( amp=False, data_info=Data(dataset_id="MNIST", num_dataloaders=2), torch_optimizers_configuration=JsonString(value=json.dumps(torch_optimizers_configuration)), - model_configuration=JsonString(value=json.dumps({})), criterion_parameters=JsonString(value=json.dumps({})), - model_id="model", batch_size=32, torch_criterion="CrossEntropyLoss", checkpoint_info=CheckpointInfo(checkpoint_interval=10, checkpoint_path=tmpdirname), @@ -223,6 +221,8 @@ def get_training_info( training_info = 
TrainingInfo( request, training_id, + "model", + json.dumps({}), storage_address, selector_address, offline_dataset_path, diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index 1417e7a29..9df036ad7 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xfc\x05\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 
\x03(\t\x12)\n\x0clr_scheduler\x18\x12 \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x42\x07\n\x05_seed\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xb8\x05\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x42\x07\n\x05_seed\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', globals()) @@ -34,21 +34,21 @@ _CHECKPOINTINFO._serialized_start=220 _CHECKPOINTINFO._serialized_end=290 _STARTTRAININGREQUEST._serialized_start=293 - _STARTTRAININGREQUEST._serialized_end=1057 - _STARTTRAININGRESPONSE._serialized_start=1059 - _STARTTRAININGRESPONSE._serialized_end=1129 - _TRAININGSTATUSREQUEST._serialized_start=1131 - _TRAININGSTATUSREQUEST._serialized_end=1175 - _TRAININGSTATUSRESPONSE._serialized_start=1178 - _TRAININGSTATUSRESPONSE._serialized_end=1566 - _STOREFINALMODELREQUEST._serialized_start=1568 - _STOREFINALMODELREQUEST._serialized_end=1613 - _STOREFINALMODELRESPONSE._serialized_start=1615 - _STOREFINALMODELRESPONSE._serialized_end=1679 - _GETLATESTMODELREQUEST._serialized_start=1681 - _GETLATESTMODELREQUEST._serialized_end=1725 - _GETLATESTMODELRESPONSE._serialized_start=1727 - _GETLATESTMODELRESPONSE._serialized_end=1792 - _TRAINERSERVER._serialized_start=1795 - _TRAINERSERVER._serialized_end=2252 + _STARTTRAININGREQUEST._serialized_end=989 + _STARTTRAININGRESPONSE._serialized_start=991 + _STARTTRAININGRESPONSE._serialized_end=1061 + _TRAININGSTATUSREQUEST._serialized_start=1063 + _TRAININGSTATUSREQUEST._serialized_end=1107 + _TRAININGSTATUSRESPONSE._serialized_start=1110 + _TRAININGSTATUSRESPONSE._serialized_end=1498 + _STOREFINALMODELREQUEST._serialized_start=1500 + 
_STOREFINALMODELREQUEST._serialized_end=1545 + _STOREFINALMODELRESPONSE._serialized_start=1547 + _STOREFINALMODELRESPONSE._serialized_end=1611 + _GETLATESTMODELREQUEST._serialized_start=1613 + _GETLATESTMODELREQUEST._serialized_end=1657 + _GETLATESTMODELRESPONSE._serialized_start=1659 + _GETLATESTMODELRESPONSE._serialized_end=1724 + _TRAINERSERVER._serialized_start=1727 + _TRAINERSERVER._serialized_end=2184 # @@protoc_insertion_point(module_scope) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index fae4b64c3..2175b8cb0 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -116,8 +116,6 @@ class StartTrainingRequest(google.protobuf.message.Message): TRIGGER_ID_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int AMP_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: builtins.int - MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int USE_PRETRAINED_MODEL_FIELD_NUMBER: builtins.int LOAD_OPTIMIZER_STATE_FIELD_NUMBER: builtins.int PRETRAINED_MODEL_ID_FIELD_NUMBER: builtins.int @@ -138,9 +136,6 @@ class StartTrainingRequest(google.protobuf.message.Message): trigger_id: builtins.int device: builtins.str amp: builtins.bool - model_id: builtins.str - @property - def model_configuration(self) -> global___JsonString: ... 
use_pretrained_model: builtins.bool load_optimizer_state: builtins.bool pretrained_model_id: builtins.int @@ -173,8 +168,6 @@ class StartTrainingRequest(google.protobuf.message.Message): trigger_id: builtins.int = ..., device: builtins.str = ..., amp: builtins.bool = ..., - model_id: builtins.str = ..., - model_configuration: global___JsonString | None = ..., use_pretrained_model: builtins.bool = ..., load_optimizer_state: builtins.bool = ..., pretrained_model_id: builtins.int = ..., @@ -192,8 +185,8 @@ class StartTrainingRequest(google.protobuf.message.Message): epochs_per_trigger: builtins.int = ..., seed: builtins.int | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "seed", b"seed", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... 
- def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "model_id", b"model_id", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "seed", b"seed", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... def WhichOneof(self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]) -> typing_extensions.Literal["seed"] | None: ... global___StartTrainingRequest = StartTrainingRequest diff --git a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py index 660bdddaa..7fd8f6feb 100644 --- a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py +++ b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py @@ -11,6 +11,7 @@ # pylint: disable=no-name-in-module from modyn.common.ftp import download_file, get_pretrained_model_callback +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( FetchModelRequest, FetchModelResponse, @@ -86,6 +87,7 @@ def trainer_available( return TrainerAvailableResponse(available=True) + # pylint: disable=too-many-locals def start_training( self, request: StartTrainingRequest, @@ -93,8 +95,11 @@ def start_training( ) -> StartTrainingResponse: logger.info("Received start training request.") - 
if not hasattr(dynamic_module_import("modyn.models"), request.model_id): - logger.error(f"Model {request.model_id} not available!") + with MetadataDatabaseConnection(self._config) as database: + model_id, model_config = database.get_model_configuration(request.pipeline_id) + + if not hasattr(dynamic_module_import("modyn.models"), model_id): + logger.error(f"Model {model_id} not available!") return StartTrainingResponse(training_started=False) pretrained_model_path: Optional[pathlib.Path] = None @@ -135,6 +140,8 @@ def start_training( training_info = TrainingInfo( request, training_id, + model_id, + model_config, self._storage_address, self._selector_address, self._offline_dataset_directory, diff --git a/modyn/trainer_server/internal/utils/training_info.py b/modyn/trainer_server/internal/utils/training_info.py index f3ccd087a..893990e35 100644 --- a/modyn/trainer_server/internal/utils/training_info.py +++ b/modyn/trainer_server/internal/utils/training_info.py @@ -17,6 +17,8 @@ def __init__( self, request: StartTrainingRequest, training_id: int, + model_id: str, + model_config: str, storage_address: str, selector_address: str, offline_dataset_path: str, @@ -32,7 +34,7 @@ def __init__( self.epochs_per_trigger = request.epochs_per_trigger self.torch_optimizers_configuration = json.loads(request.torch_optimizers_configuration.value) - self.model_configuration_dict = json.loads(request.model_configuration.value) + self.model_configuration_dict = json.loads(model_config) self.criterion_dict = json.loads(request.criterion_parameters.value) self.grad_scaler_configuration = json.loads(request.grad_scaler_configuration.value) @@ -40,7 +42,7 @@ def __init__( self.bytes_parser = request.bytes_parser.value self.label_transformer = request.label_transformer.value - self.model_id = request.model_id + self.model_id = model_id model_module = dynamic_module_import("modyn.models") self.model_handler = getattr(model_module, self.model_id) From d7dcc3df6123edbf8e8c7e53e02d53d29e4a6678 
Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Wed, 16 Aug 2023 20:24:38 +0200 Subject: [PATCH 02/12] Implement Model Management Policies * create model storage manager to handle the new strategies * create 2 full model strategies (naive implementation and compressed version thereof) and one incremental strategy (weights difference strategy) with 2 difference operators (sub and xor) * enabled option to verify checksum of downloaded files * add model storage strategy as part of pipeline to the metadata database * rework of model storage component * separate metadata from model information in model storage * add method to (un-)zip files * add various unit tests --- benchmark/mnist/mnist.yaml | 12 +- .../integrationtest_metadata_processor.py | 5 +- .../integrationtest_model_storage.py | 52 ++- .../selector/integrationtest_selector.py | 24 ++ modyn/common/ftp/__init__.py | 8 +- modyn/common/ftp/ftp_utils.py | 53 ++- modyn/config/examples/example-pipeline.yaml | 12 +- modyn/config/schema/pipeline-schema.yaml | 60 ++- .../internal/grpc/evaluator_grpc_servicer.py | 52 +-- .../internal/grpc/generated/evaluator_pb2.py | 32 +- .../internal/grpc/generated/evaluator_pb2.pyi | 14 +- .../internal/utils/evaluation_info.py | 9 +- .../metadata_database_connection.py | 60 ++- modyn/metadata_database/models/pipelines.py | 16 +- .../models/trained_models.py | 6 +- modyn/metadata_database/utils/__init__.py | 12 + .../utils/model_storage_strategy_config.py | 13 + modyn/model_storage/internal/__init__.py | 2 + .../grpc/generated/model_storage_pb2.py | 30 +- .../grpc/generated/model_storage_pb2.pyi | 15 +- .../internal/grpc/grpc_server.py | 9 +- .../grpc/model_storage_grpc_servicer.py | 100 ++--- .../internal/model_storage_manager.py | 190 +++++++++ .../internal/storage_strategies/__init__.py | 14 + .../abstract_difference_operator.py | 41 ++ .../abstract_model_storage_strategy.py | 42 ++ .../difference_operators/__init__.py | 14 + .../sub_difference_operator.py | 17 + 
.../xor_difference_operator.py | 30 ++ .../full_model_strategies/__init__.py | 15 + .../abstract_full_model_strategy.py | 53 +++ .../compressed_full_model.py | 23 ++ .../pytorch_full_model.py | 21 + .../incremental_model_strategies/__init__.py | 14 + .../abstract_incremental_model_strategy.py | 54 +++ .../weights_difference.py | 47 +++ .../model_storage/internal/utils/__init__.py | 20 + .../internal/utils/data_types.py | 51 +++ .../internal/utils/model_storage_strategy.py | 68 ++++ modyn/model_storage/model_storage.py | 31 +- modyn/protos/evaluator.proto | 13 +- modyn/protos/model_storage.proto | 3 + modyn/protos/selector.proto | 15 + modyn/protos/trainer_server.proto | 33 +- .../internal/grpc/generated/selector_pb2.py | 90 +++-- .../internal/grpc/generated/selector_pb2.pyi | 72 +++- .../internal/grpc/selector_grpc_servicer.py | 45 ++- modyn/selector/internal/selector_manager.py | 24 +- modyn/supervisor/internal/grpc_handler.py | 51 +-- .../grpc/test_evaluator_grpc_servicer.py | 111 ++++-- .../internal/test_pytorch_evaluator.py | 5 +- .../models/test_pipelines.py | 23 +- .../test_metadata_database_connection.py | 38 +- .../grpc/test_model_storage_grpc_server.py | 13 +- .../grpc/test_model_storage_grpc_servicer.py | 165 ++++---- .../test_sub_difference_operator.py | 36 ++ .../test_xor_difference_operator.py | 36 ++ .../test_compressed_full_model.py | 41 ++ .../test_pytorch_full_model.py | 76 ++++ .../test_weights_difference.py | 87 ++++ .../internal/test_model_storage_manager.py | 371 ++++++++++++++++++ .../internal/utils/test_data_types.py | 24 ++ .../utils/test_model_storage_strategy.py | 56 +++ .../tests/model_storage/test_model_storage.py | 40 +- .../grpc/test_selector_grpc_servicer.py | 20 +- .../internal/test_selector_manager.py | 43 +- .../supervisor/internal/test_grpc_handler.py | 47 ++- .../grpc/test_trainer_server_grpc_servicer.py | 69 ++-- .../internal/trainer/test_pytorch_trainer.py | 2 +- modyn/tests/utils/test_utils.py | 47 +++ 
.../grpc/generated/trainer_server_pb2.py | 36 +- .../grpc/generated/trainer_server_pb2.pyi | 5 +- .../grpc/trainer_server_grpc_servicer.py | 53 +-- .../internal/utils/training_info.py | 3 +- modyn/utils/__init__.py | 3 + modyn/utils/utils.py | 66 ++++ 76 files changed, 2681 insertions(+), 522 deletions(-) create mode 100644 modyn/metadata_database/utils/__init__.py create mode 100644 modyn/metadata_database/utils/model_storage_strategy_config.py create mode 100644 modyn/model_storage/internal/model_storage_manager.py create mode 100644 modyn/model_storage/internal/storage_strategies/__init__.py create mode 100644 modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py create mode 100644 modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py create mode 100644 modyn/model_storage/internal/storage_strategies/difference_operators/__init__.py create mode 100644 modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py create mode 100644 modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py create mode 100644 modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py create mode 100644 modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py create mode 100644 modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py create mode 100644 modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py create mode 100644 modyn/model_storage/internal/storage_strategies/incremental_model_strategies/__init__.py create mode 100644 modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py create mode 100644 modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py create mode 100644 
modyn/model_storage/internal/utils/__init__.py create mode 100644 modyn/model_storage/internal/utils/data_types.py create mode 100644 modyn/model_storage/internal/utils/model_storage_strategy.py create mode 100644 modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py create mode 100644 modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py create mode 100644 modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py create mode 100644 modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py create mode 100644 modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py create mode 100644 modyn/tests/model_storage/internal/test_model_storage_manager.py create mode 100644 modyn/tests/model_storage/internal/utils/test_data_types.py create mode 100644 modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml index 4b45d946d..59f48ae99 100644 --- a/benchmark/mnist/mnist.yaml +++ b/benchmark/mnist/mnist.yaml @@ -6,6 +6,17 @@ model: id: ResNet18 config: num_classes: 10 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" + zip: True + zip_algorithm: ZIP_DEFLATED + incremental_model_strategy: + name: "WeightsDifference" + config: + operator: xor + split_exponent: True + full_model_interval: 10 training: gpus: 1 device: "cuda:0" @@ -43,7 +54,6 @@ data: import io def bytes_parser_function(data: bytes) -> Image: return Image.open(io.BytesIO(data)).convert("RGB") - trigger: id: DataAmountTrigger trigger_config: diff --git a/integrationtests/metadata_processor/integrationtest_metadata_processor.py b/integrationtests/metadata_processor/integrationtest_metadata_processor.py index 5e4b9137a..6b4fba996 100644 --- 
a/integrationtests/metadata_processor/integrationtest_metadata_processor.py +++ b/integrationtests/metadata_processor/integrationtest_metadata_processor.py @@ -6,6 +6,7 @@ import yaml from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import SampleTrainingMetadata, TriggerTrainingMetadata +from modyn.metadata_database.utils import ModelStorageStrategyConfig # pylint: disable-next=no-name-in-module from modyn.metadata_processor.internal.grpc.generated.metadata_processor_pb2 import ( # noqa: E402, E501 @@ -49,7 +50,9 @@ def get_grpc_channel(config: dict, component: str) -> grpc.Channel: def send_metadata_and_check_database(processor_client: MetadataProcessorClient, config: dict) -> int: with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2, "ResNet18", "{}") + pipeline_id = database.register_pipeline( + 2, "ResNet18", "{}", False, ModelStorageStrategyConfig("PyTorchFullModel") + ) req = TrainingMetadataRequest( pipeline_id=pipeline_id, diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index e14dd3647..16023bfe0 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -1,13 +1,16 @@ # end-to-end testing of the model storage component +import io import json import pathlib import shutil import grpc +import torch from integrationtests.utils import get_modyn_config from modyn.common.ftp import delete_file, download_file, upload_file from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import Trigger +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( DeleteModelRequest, DeleteModelResponse, @@ -17,19 
+20,20 @@ RegisterModelResponse, ) from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageStub -from modyn.utils import grpc_connection_established +from modyn.models import ResNet18 +from modyn.utils import calculate_checksum, grpc_connection_established TEST_MODELS_PATH = pathlib.Path("/app") / "model_storage" / "test_models" -TEST_FILE_NAME_LOCAL = "test_model_local.txt" -TEST_FILE_NAME_LOCAL_RESP = "test_model_local_response.txt" -TEST_FILE_NAME_REMOTE = "test_model_remote.txt" +TEST_FILE_NAME_LOCAL = "test_model_local.modyn" +TEST_FILE_NAME_LOCAL_RESP = "test_model_local_response.modyn" +TEST_FILE_NAME_REMOTE = "test_model_remote.modyn" +SAMPLE_MODEL = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) def create_dummy_file(): pathlib.Path(TEST_MODELS_PATH).mkdir(parents=True, exist_ok=True) - with open(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL, "w") as f: - f.write("Test model storage component") + torch.save({"model": SAMPLE_MODEL.model.state_dict(), "metadata": True}, TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL) def cleanup_models_dir() -> None: @@ -68,8 +72,13 @@ def delete_dummy_file_from_trainer(config: dict): def insert_trigger_into_database(config: dict) -> (int, int): + model_storage_strategy = ModelStorageStrategyConfig("CompressedFullModel") + model_storage_strategy.zip = True + with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2, "ResNet18", json.dumps({"num_classes": 10})) + pipeline_id = database.register_pipeline( + 2, "ResNet18", json.dumps({"num_classes": 10}), False, model_storage_strategy + ) trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) database.session.add(trigger) @@ -86,15 +95,32 @@ def delete_data_from_database(config: dict, pipeline_id: int, trigger_id: int): database.session.commit() +def check_loaded_model() -> None: + with open(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, "rb") as state_file: + checkpoint = 
torch.load(io.BytesIO(state_file.read())) + + assert "model" in checkpoint, "Model state is not stored in file" + resnet = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + resnet.model.load_state_dict(checkpoint["model"]) + + assert checkpoint["metadata"] + + loaded_state = resnet.model.state_dict() + original_state = SAMPLE_MODEL.model.state_dict() + for layer_name, _ in resnet.model.state_dict().items(): + assert torch.all(torch.eq(loaded_state[layer_name], original_state[layer_name])) + + def test_model_storage(config: dict): # register pipeline and trigger pipeline_id, trigger_id = insert_trigger_into_database(config) with MetadataDatabaseConnection(config) as database: - model_id, model_config = database.get_model_configuration(pipeline_id) + model_id, model_config, amp = database.get_model_configuration(pipeline_id) assert model_id == "ResNet18" assert json.loads(model_config) == {"num_classes": 10} + assert not amp model_storage_channel = connect_to_model_storage(config) model_storage = ModelStorageStub(model_storage_channel) @@ -106,6 +132,7 @@ def test_model_storage(config: dict): hostname=config["trainer_server"]["hostname"], port=int(config["trainer_server"]["ftp_port"]), model_path=str(TEST_FILE_NAME_REMOTE), + checksum=calculate_checksum(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL), ) response_register: RegisterModelResponse = model_storage.RegisterModel(request_register) @@ -113,9 +140,8 @@ def test_model_storage(config: dict): model_id = response_register.model_id # try to fetch the registered model - request_fetch = FetchModelRequest(model_id=model_id) + request_fetch = FetchModelRequest(model_id=model_id, load_metadata=True) response_fetch: FetchModelResponse = model_storage.FetchModel(request_fetch) - model_path = pathlib.Path(response_fetch.model_path) assert response_fetch.success, "Could not find model with this id" @@ -125,13 +151,13 @@ def test_model_storage(config: dict): int(config["model_storage"]["ftp_port"]), 
"modyn", "modyn", - remote_file_path=model_path, + remote_file_path=pathlib.Path(response_fetch.model_path), local_file_path=TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, + checksum=response_fetch.checksum, ) # compare if content matches initial dummy file - with open(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, "r") as resp_file: - assert resp_file.read() == "Test model storage component", "File contents do not match" + check_loaded_model() # delete model on model storage component request_delete = DeleteModelRequest(model_id=model_id) diff --git a/integrationtests/selector/integrationtest_selector.py b/integrationtests/selector/integrationtest_selector.py index 23af71993..68b6cbc6d 100644 --- a/integrationtests/selector/integrationtest_selector.py +++ b/integrationtests/selector/integrationtest_selector.py @@ -8,8 +8,10 @@ GetNumberOfPartitionsRequest, GetSamplesRequest, JsonString, + ModelStorageStrategyInfo, RegisterPipelineRequest, SamplesResponse, + StrategyConfig, ) from modyn.selector.internal.grpc.generated.selector_pb2_grpc import SelectorStub from modyn.utils import grpc_connection_established @@ -29,6 +31,10 @@ def connect_to_selector_servicer() -> grpc.Channel: return selector_channel +def get_model_storage_strategy() -> ModelStorageStrategyInfo: + return ModelStorageStrategyInfo(full_model_strategy_config=StrategyConfig(name="PyTorchFullModel")) + + def test_label_balanced_presampling_huge() -> None: selector_channel = connect_to_selector_servicer() selector = SelectorStub(selector_channel) @@ -49,6 +55,8 @@ def test_label_balanced_presampling_huge() -> None: selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -134,6 +142,8 @@ def test_label_balanced_force_same_size(): selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", 
model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -223,6 +233,8 @@ def test_label_balanced_force_all_samples(): selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -318,6 +330,8 @@ def test_newdata() -> None: selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -462,6 +476,8 @@ def test_abstract_downsampler(reset_after_trigger) -> None: selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -616,6 +632,8 @@ def test_empty_triggers() -> None: selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -789,6 +807,8 @@ def test_many_samples_evenly_distributed(): selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -864,6 +884,8 @@ def test_many_samples_unevenly_distributed(): selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id @@ -940,6 +962,8 @@ def test_get_available_labels(reset_after_trigger: bool): selection_strategy=JsonString(value=json.dumps(strategy_config)), model_id="ResNet10", 
model_configuration=JsonString(value="{}"), + amp=False, + model_storage_strategy=get_model_storage_strategy(), ) ).pipeline_id diff --git a/modyn/common/ftp/__init__.py b/modyn/common/ftp/__init__.py index d9f091f01..de047c5ea 100644 --- a/modyn/common/ftp/__init__.py +++ b/modyn/common/ftp/__init__.py @@ -1,7 +1,13 @@ import os from .ftp_server import FTPServer # noqa: F401 -from .ftp_utils import delete_file, download_file, get_pretrained_model_callback, upload_file # noqa: F401 +from .ftp_utils import ( # noqa: F401 + delete_file, + download_file, + download_trained_model, + get_pretrained_model_callback, + upload_file, +) files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") diff --git a/modyn/common/ftp/ftp_utils.py b/modyn/common/ftp/ftp_utils.py index 18afc8487..5e675752f 100644 --- a/modyn/common/ftp/ftp_utils.py +++ b/modyn/common/ftp/ftp_utils.py @@ -1,10 +1,11 @@ # Utils file containing functions in order to simplify FTP server interactions. +import logging import pathlib from ftplib import FTP from logging import Logger from typing import Any, Callable, Optional -from modyn.utils import EMIT_MESSAGE_PERCENTAGES +from modyn.utils import EMIT_MESSAGE_PERCENTAGES, calculate_checksum def download_file( @@ -15,7 +16,8 @@ def download_file( remote_file_path: pathlib.Path, local_file_path: pathlib.Path, callback: Optional[Callable[[float], None]] = None, -) -> None: + checksum: Optional[bytes] = None, +) -> bool: """Downloads a file from a given host to the local filesystem. If the file already exists, it gets overwritten. Args: @@ -26,9 +28,9 @@ def download_file( remote_file_path: path to the remote file. local_file_path: local path to the file. callback(float): function called every block of data with the current progress in [0, 1]. - + checksum: the expected hash of the file. Returns: - + bool: whether the file was successfully downloaded. 
""" ftp = FTP() ftp.connect(hostname, port, timeout=3) @@ -54,6 +56,11 @@ def write_callback(data: Any) -> None: ftp.close() + if checksum: + local_hash = calculate_checksum(local_file_path) + return local_hash == checksum + return True + def upload_file( hostname: str, port: int, user: str, password: str, local_file_path: pathlib.Path, remote_file_path: pathlib.Path @@ -121,3 +128,41 @@ def download_callback(current_progress: float) -> None: last_progress = current_progress return download_callback + + +def download_trained_model( + logger: logging.Logger, + model_storage_config: dict, + remote_path: pathlib.Path, + checksum: bytes, + identifier: int, + base_directory: pathlib.Path, +) -> Optional[pathlib.Path]: + model_path = base_directory / f"trained_model_{identifier}.modyn" + + success = download_file( + hostname=model_storage_config["hostname"], + port=int(model_storage_config["ftp_port"]), + user="modyn", + password="modyn", + remote_file_path=remote_path, + local_file_path=model_path, + callback=get_pretrained_model_callback(logger), + checksum=checksum, + ) + + if not success: + logger.error("Checksums did not match, evaluation cannot be started.") + return None + + delete_file( + hostname=model_storage_config["hostname"], + port=int(model_storage_config["ftp_port"]), + user="modyn", + password="modyn", + remote_file_path=pathlib.Path(remote_path), + ) + + logger.info(f"Successfully downloaded trained model to {model_path}.") + + return model_path diff --git a/modyn/config/examples/example-pipeline.yaml b/modyn/config/examples/example-pipeline.yaml index 6054a8190..279845d59 100644 --- a/modyn/config/examples/example-pipeline.yaml +++ b/modyn/config/examples/example-pipeline.yaml @@ -6,6 +6,17 @@ model: id: ResNet18 config: num_classes: 10 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" + zip: True + zip_algorithm: ZIP_DEFLATED + incremental_model_strategy: + name: "WeightsDifference" + config: + operator: xor + split_exponent: True + 
full_model_interval: 10 training: gpus: 1 device: "cpu" @@ -63,7 +74,6 @@ trigger: data_points_for_trigger: 100 evaluation: device: "cpu" - amp: False datasets: - dataset_id: mnist transformations: ["transforms.ToTensor()", diff --git a/modyn/config/schema/pipeline-schema.yaml b/modyn/config/schema/pipeline-schema.yaml index 701a40323..cba383848 100644 --- a/modyn/config/schema/pipeline-schema.yaml +++ b/modyn/config/schema/pipeline-schema.yaml @@ -37,6 +37,62 @@ properties: Configuration dictionary that will be passed to the model on initialization. required: - id + model_storage: + type: object + properties: + full_model_strategy: + type: object + description: | + Which full model strategy is used. + properties: + name: + type: string + description: | + Name of the full model strategy. We currently support PyTorchFullModel and CompressedFullModel. + config: + type: object + description: | + Configuration dictionary that will be passed to the strategy. + zip: + type: boolean + description: | + Whether to zip the file in the end. Defaults to False. + zip_algorithm: + type: string + description: | + Which zip algorithm to use. Default is DEFLATED. + required: + - name + incremental_model_strategy: + type: object + description: | + Which incremental model strategy is used. + properties: + name: + type: string + description: | + Name of the incremental model strategy. We currently support + WeightsDifference. + config: + type: object + description: | + Configuration dictionary that will be passed to the strategy. + zip: + type: boolean + description: | + Whether to zip the file in the end. Defaults to False. + zip_algorithm: + type: string + description: | + Which zip algorithm to use. Default is DEFLATED. + required: + - name + full_model_interval: + type: number + description: | + In which interval we are using the full model strategy.
+ required: + - full_model_strategy training: type: object properties: @@ -353,10 +409,6 @@ properties: description: | The device the model should be put on. In the future (#131), we might want this to be either "cpu" or "gpu" and let the evaluator figure out the exact device, but for now, this really is the identifier of the device. - amp: - type: boolean - description: | - If True, automatic mixed precision will be used. datasets: type: array description: | diff --git a/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py b/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py index fabe505af..5868339f0 100644 --- a/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py +++ b/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py @@ -8,7 +8,7 @@ from typing import Any, Optional import grpc -from modyn.common.ftp import download_file, get_pretrained_model_callback +from modyn.common.ftp import download_trained_model # pylint: disable-next=no-name-in-module from modyn.evaluator.internal.grpc.generated.evaluator_pb2 import ( @@ -26,6 +26,8 @@ from modyn.evaluator.internal.metrics import AbstractEvaluationMetric from modyn.evaluator.internal.pytorch_evaluator import evaluate from modyn.evaluator.internal.utils import EvaluationInfo, EvaluationProcessInfo, EvaluatorMessages +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import TrainedModel # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest, FetchModelResponse @@ -87,14 +89,23 @@ def connect_to_storage(storage_address: str) -> StorageStub: raise ConnectionError(f"Could not establish gRPC connection to storage at address {storage_address}.") return StorageStub(storage_channel) + # pylint: disable=too-many-locals def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerContext) -> EvaluateModelResponse: logger.info("Received 
evaluate model request.") - if not hasattr(dynamic_module_import("modyn.models"), request.model_id): - logger.error(f"Model {request.model_id} not available!") + with MetadataDatabaseConnection(self._config) as database: + trained_model: Optional[TrainedModel] = database.session.get(TrainedModel, request.trained_model_id) + + if not trained_model: + logger.error(f"Trained model {request.trained_model_id} does not exist!") + return EvaluateModelResponse(evaluation_started=False) + model_id, model_config, amp = database.get_model_configuration(trained_model.pipeline_id) + + if not hasattr(dynamic_module_import("modyn.models"), model_id): + logger.error(f"Model {model_id} not available!") return EvaluateModelResponse(evaluation_started=False) - fetch_request = FetchModelRequest(model_id=request.trained_model_id) + fetch_request = FetchModelRequest(model_id=request.trained_model_id, load_metadata=False) fetch_resp: FetchModelResponse = self._model_storage_stub.FetchModel(fetch_request) if not fetch_resp.success: @@ -118,10 +129,22 @@ def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerCo evaluation_id = self._next_evaluation_id self._next_evaluation_id += 1 - local_model_path = self._download_trained_model(fetch_resp, evaluation_id) + trained_model_path = download_trained_model( + logger=logger, + model_storage_config=self._config["model_storage"], + remote_path=pathlib.Path(fetch_resp.model_path), + checksum=fetch_resp.checksum, + identifier=evaluation_id, + base_directory=self._base_dir, + ) + + if not trained_model_path: + return EvaluateModelResponse(evaluation_started=False) metrics = self._setup_metrics(request.metrics) - evaluation_info = EvaluationInfo(request, evaluation_id, self._storage_address, metrics, local_model_path) + evaluation_info = EvaluationInfo( + request, evaluation_id, model_id, model_config, amp, self._storage_address, metrics, trained_model_path + ) self._evaluation_dict[evaluation_id] = evaluation_info 
self._run_evaluation(evaluation_id) @@ -130,23 +153,6 @@ def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerCo evaluation_started=True, evaluation_id=evaluation_id, dataset_size=dataset_size_response.num_keys ) - def _download_trained_model(self, fetch_resp: FetchModelResponse, evaluation_id: int) -> pathlib.Path: - local_model_path = self._base_dir / f"trained_model_{evaluation_id}.modyn" - - download_file( - hostname=self._config["model_storage"]["hostname"], - port=int(self._config["model_storage"]["ftp_port"]), - user="modyn", - password="modyn", - remote_file_path=pathlib.Path(fetch_resp.model_path), - local_file_path=local_model_path, - callback=get_pretrained_model_callback(logger), - ) - - logger.info(f"Successfully downloaded trained model to {local_model_path}.") - - return local_model_path - @staticmethod def _setup_metrics(metric_configurations: list[MetricConfiguration]) -> list[AbstractEvaluationMetric]: metrics = [] diff --git a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py index 35f6f4eef..0f0d19ca6 100644 --- a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py +++ b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65valuator.proto\x12\x0fmodyn.evaluator\":\n\x0b\x44\x61tasetInfo\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x8f\x01\n\x13MetricConfiguration\x12\x0c\n\x04name\x18\x01 \x01(\t\x12+\n\x06\x63onfig\x18\x02 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12=\n\x16\x65valuation_transformer\x18\x03 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"\x9f\x03\n\x14\x45valuateModelRequest\x12\x18\n\x10trained_model_id\x18\x01 \x01(\x05\x12\x32\n\x0c\x64\x61taset_info\x18\x02 
\x01(\x0b\x32\x1c.modyn.evaluator.DatasetInfo\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x12\n\nbatch_size\x18\x05 \x01(\x05\x12\x35\n\x07metrics\x18\x06 \x03(\x0b\x32$.modyn.evaluator.MetricConfiguration\x12\x10\n\x08model_id\x18\x07 \x01(\t\x12\x38\n\x13model_configuration\x18\x08 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12\x16\n\x0etransform_list\x18\t \x03(\t\x12\x33\n\x0c\x62ytes_parser\x18\n \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\x12\x38\n\x11label_transformer\x18\x0b \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"`\n\x15\x45valuateModelResponse\x12\x1a\n\x12\x65valuation_started\x18\x01 \x01(\x08\x12\x15\n\revaluation_id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64\x61taset_size\x18\x03 \x01(\x03\"0\n\x17\x45valuationStatusRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"\xe5\x01\n\x18\x45valuationStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x17\n\x0fstate_available\x18\x03 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x04 \x01(\x08\x12\x16\n\texception\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x06 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x07 \x01(\x03H\x02\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seen\"0\n\x0e\x45valuationData\x12\x0e\n\x06metric\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x02\"0\n\x17\x45valuationResultRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"c\n\x18\x45valuationResultResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x38\n\x0f\x65valuation_data\x18\x02 
\x03(\x0b\x32\x1f.modyn.evaluator.EvaluationData2\xce\x02\n\tEvaluator\x12\x61\n\x0e\x65valuate_model\x12%.modyn.evaluator.EvaluateModelRequest\x1a&.modyn.evaluator.EvaluateModelResponse\"\x00\x12n\n\x15get_evaluation_status\x12(.modyn.evaluator.EvaluationStatusRequest\x1a).modyn.evaluator.EvaluationStatusResponse\"\x00\x12n\n\x15get_evaluation_result\x12(.modyn.evaluator.EvaluationResultRequest\x1a).modyn.evaluator.EvaluationResultResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65valuator.proto\x12\x0fmodyn.evaluator\":\n\x0b\x44\x61tasetInfo\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x8f\x01\n\x13MetricConfiguration\x12\x0c\n\x04name\x18\x01 \x01(\t\x12+\n\x06\x63onfig\x18\x02 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12=\n\x16\x65valuation_transformer\x18\x03 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"\xc6\x02\n\x14\x45valuateModelRequest\x12\x18\n\x10trained_model_id\x18\x01 \x01(\x05\x12\x32\n\x0c\x64\x61taset_info\x18\x02 \x01(\x0b\x32\x1c.modyn.evaluator.DatasetInfo\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\x05\x12\x35\n\x07metrics\x18\x05 \x03(\x0b\x32$.modyn.evaluator.MetricConfiguration\x12\x16\n\x0etransform_list\x18\x06 \x03(\t\x12\x33\n\x0c\x62ytes_parser\x18\x07 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\x12\x38\n\x11label_transformer\x18\x08 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"`\n\x15\x45valuateModelResponse\x12\x1a\n\x12\x65valuation_started\x18\x01 \x01(\x08\x12\x15\n\revaluation_id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64\x61taset_size\x18\x03 \x01(\x03\"0\n\x17\x45valuationStatusRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"\xe5\x01\n\x18\x45valuationStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x17\n\x0fstate_available\x18\x03 
\x01(\x08\x12\x0f\n\x07\x62locked\x18\x04 \x01(\x08\x12\x16\n\texception\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x06 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x07 \x01(\x03H\x02\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seen\"0\n\x0e\x45valuationData\x12\x0e\n\x06metric\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x02\"0\n\x17\x45valuationResultRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"c\n\x18\x45valuationResultResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x38\n\x0f\x65valuation_data\x18\x02 \x03(\x0b\x32\x1f.modyn.evaluator.EvaluationData2\xce\x02\n\tEvaluator\x12\x61\n\x0e\x65valuate_model\x12%.modyn.evaluator.EvaluateModelRequest\x1a&.modyn.evaluator.EvaluateModelResponse\"\x00\x12n\n\x15get_evaluation_status\x12(.modyn.evaluator.EvaluationStatusRequest\x1a).modyn.evaluator.EvaluationStatusResponse\"\x00\x12n\n\x15get_evaluation_result\x12(.modyn.evaluator.EvaluationResultRequest\x1a).modyn.evaluator.EvaluationResultResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'evaluator_pb2', globals()) @@ -30,19 +30,19 @@ _METRICCONFIGURATION._serialized_start=157 _METRICCONFIGURATION._serialized_end=300 _EVALUATEMODELREQUEST._serialized_start=303 - _EVALUATEMODELREQUEST._serialized_end=718 - _EVALUATEMODELRESPONSE._serialized_start=720 - _EVALUATEMODELRESPONSE._serialized_end=816 - _EVALUATIONSTATUSREQUEST._serialized_start=818 - _EVALUATIONSTATUSREQUEST._serialized_end=866 - _EVALUATIONSTATUSRESPONSE._serialized_start=869 - _EVALUATIONSTATUSRESPONSE._serialized_end=1098 - _EVALUATIONDATA._serialized_start=1100 - _EVALUATIONDATA._serialized_end=1148 - _EVALUATIONRESULTREQUEST._serialized_start=1150 - _EVALUATIONRESULTREQUEST._serialized_end=1198 - _EVALUATIONRESULTRESPONSE._serialized_start=1200 - _EVALUATIONRESULTRESPONSE._serialized_end=1299 - _EVALUATOR._serialized_start=1302 - 
_EVALUATOR._serialized_end=1636 + _EVALUATEMODELREQUEST._serialized_end=629 + _EVALUATEMODELRESPONSE._serialized_start=631 + _EVALUATEMODELRESPONSE._serialized_end=727 + _EVALUATIONSTATUSREQUEST._serialized_start=729 + _EVALUATIONSTATUSREQUEST._serialized_end=777 + _EVALUATIONSTATUSRESPONSE._serialized_start=780 + _EVALUATIONSTATUSRESPONSE._serialized_end=1009 + _EVALUATIONDATA._serialized_start=1011 + _EVALUATIONDATA._serialized_end=1059 + _EVALUATIONRESULTREQUEST._serialized_start=1061 + _EVALUATIONRESULTREQUEST._serialized_end=1109 + _EVALUATIONRESULTRESPONSE._serialized_start=1111 + _EVALUATIONRESULTRESPONSE._serialized_end=1210 + _EVALUATOR._serialized_start=1213 + _EVALUATOR._serialized_end=1547 # @@protoc_insertion_point(module_scope) diff --git a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi index efb9d14c2..3b16f97f0 100644 --- a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi +++ b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi @@ -96,11 +96,8 @@ class EvaluateModelRequest(google.protobuf.message.Message): TRAINED_MODEL_ID_FIELD_NUMBER: builtins.int DATASET_INFO_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int - AMP_FIELD_NUMBER: builtins.int BATCH_SIZE_FIELD_NUMBER: builtins.int METRICS_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: builtins.int - MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int TRANSFORM_LIST_FIELD_NUMBER: builtins.int BYTES_PARSER_FIELD_NUMBER: builtins.int LABEL_TRANSFORMER_FIELD_NUMBER: builtins.int @@ -108,13 +105,9 @@ class EvaluateModelRequest(google.protobuf.message.Message): @property def dataset_info(self) -> global___DatasetInfo: ... device: builtins.str - amp: builtins.bool batch_size: builtins.int @property def metrics(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___MetricConfiguration]: ... 
- model_id: builtins.str - @property - def model_configuration(self) -> global___JsonString: ... @property def transform_list(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... @property @@ -127,17 +120,14 @@ class EvaluateModelRequest(google.protobuf.message.Message): trained_model_id: builtins.int = ..., dataset_info: global___DatasetInfo | None = ..., device: builtins.str = ..., - amp: builtins.bool = ..., batch_size: builtins.int = ..., metrics: collections.abc.Iterable[global___MetricConfiguration] | None = ..., - model_id: builtins.str = ..., - model_configuration: global___JsonString | None = ..., transform_list: collections.abc.Iterable[builtins.str] | None = ..., bytes_parser: global___PythonString | None = ..., label_transformer: global___PythonString | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "label_transformer", b"label_transformer", "model_configuration", b"model_configuration"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "device", b"device", "label_transformer", b"label_transformer", "metrics", b"metrics", "model_configuration", b"model_configuration", "model_id", b"model_id", "trained_model_id", b"trained_model_id", "transform_list", b"transform_list"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "label_transformer", b"label_transformer"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "device", b"device", "label_transformer", b"label_transformer", "metrics", b"metrics", "trained_model_id", b"trained_model_id", "transform_list", b"transform_list"]) -> None: ... global___EvaluateModelRequest = EvaluateModelRequest diff --git a/modyn/evaluator/internal/utils/evaluation_info.py b/modyn/evaluator/internal/utils/evaluation_info.py index 2ab27459b..aa12e904e 100644 --- a/modyn/evaluator/internal/utils/evaluation_info.py +++ b/modyn/evaluator/internal/utils/evaluation_info.py @@ -17,6 +17,9 @@ def __init__( self, request: EvaluateModelRequest, evaluation_id: int, + model_id: str, + model_config: str, + amp: bool, storage_address: str, metrics: list[AbstractEvaluationMetric], model_path: pathlib.Path, @@ -26,14 +29,14 @@ def __init__( self.num_dataloaders = request.dataset_info.num_dataloaders self.device = request.device - self.amp = request.amp + self.amp = amp self.batch_size = request.batch_size self.metrics = metrics - self.model_id = request.model_id + self.model_id = model_id model_module = dynamic_module_import("modyn.models") self.model_handler = getattr(model_module, self.model_id) - self.model_configuration_dict = json.loads(request.model_configuration.value) + self.model_configuration_dict = json.loads(model_config) self.transform_list = list(request.transform_list) self.bytes_parser = request.bytes_parser.value diff --git a/modyn/metadata_database/metadata_database_connection.py b/modyn/metadata_database/metadata_database_connection.py index ab8ec6dd0..c8683a8a7 100644 --- a/modyn/metadata_database/metadata_database_connection.py +++ b/modyn/metadata_database/metadata_database_connection.py @@ -3,12 +3,14 @@ from __future__ import annotations import logging +from typing import Optional from modyn.database.abstract_database_connection import AbstractDatabaseConnection from 
modyn.metadata_database.metadata_base import MetadataBase from modyn.metadata_database.models import Pipeline from modyn.metadata_database.models.selector_state_metadata import SelectorStateMetadata from modyn.metadata_database.models.trained_models import TrainedModel +from modyn.metadata_database.utils import ModelStorageStrategyConfig from sqlalchemy import func logger = logging.getLogger(__name__) @@ -67,17 +69,45 @@ def create_tables(self) -> None: """ MetadataBase.metadata.create_all(self.engine) - def register_pipeline(self, num_workers: int, model_id: str, model_config: str) -> int: + def register_pipeline( + self, + num_workers: int, + model_id: str, + model_config: str, + amp: bool, + full_model_strategy: ModelStorageStrategyConfig, + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None, + full_model_interval: Optional[int] = None, + ) -> int: """Register a new pipeline in the database. Args: num_workers (int): Number of workers in the pipeline. model_id (str): the model name that is used by the pipeline. model_config (str): the serialized model configuration options. + amp (bool): whether amp is enabled for the model. + full_model_strategy: the strategy used to store full models. + incremental_model_strategy: the strategy used to store models incrementally. + full_model_interval: interval between which the full model strategy is used. Returns: int: Id of the newly created pipeline. 
""" - pipeline = Pipeline(num_workers=num_workers, model_id=model_id, model_config=model_config) + pipeline = Pipeline( + num_workers=num_workers, + model_id=model_id, + model_config=model_config, + amp=amp, + full_model_strategy_name=full_model_strategy.name, + full_model_strategy_zip=full_model_strategy.zip, + full_model_strategy_zip_algorithm=full_model_strategy.zip_algorithm, + full_model_strategy_config=full_model_strategy.config, + ) + if incremental_model_strategy: + pipeline.inc_model_strategy_name = incremental_model_strategy.name + pipeline.inc_model_strategy_zip = incremental_model_strategy.zip + pipeline.inc_model_strategy_zip_algorithm = incremental_model_strategy.zip_algorithm + pipeline.inc_model_strategy_config = incremental_model_strategy.config + pipeline.full_model_interval = full_model_interval self.session.add(pipeline) self.session.commit() pipeline_id = pipeline.pipeline_id @@ -96,31 +126,45 @@ def add_selector_state_metadata_trigger(self, pipeline_id: int, trigger_id: int) pipeline_id, trigger_id, self.session, self.engine, self.hash_partition_modulus ) - def add_trained_model(self, pipeline_id: int, trigger_id: int, model_path: str) -> int: + def add_trained_model( + self, + pipeline_id: int, + trigger_id: int, + model_path: str, + metadata_path: Optional[str] = None, + parent_model: Optional[int] = None, + ) -> int: """Add a trained model to the database. Args: pipeline_id: id of the pipeline it was created from. trigger_id: id of the trigger it was created. model_path: path on the local filesystem on which the model is stored. - + metadata_path: the path on the local filesystem on which metadata to the model are stored. + parent_model: id of the parent model. 
Returns: int: Id of the registered model """ - trained_model = TrainedModel(pipeline_id=pipeline_id, trigger_id=trigger_id, model_path=model_path) + trained_model = TrainedModel( + pipeline_id=pipeline_id, + trigger_id=trigger_id, + model_path=model_path, + metadata_path=metadata_path, + parent_model=parent_model, + ) self.session.add(trained_model) self.session.commit() model_id = trained_model.model_id return model_id - def get_model_configuration(self, pipeline_id: int) -> (str, str): + def get_model_configuration(self, pipeline_id: int) -> tuple[str, str, bool]: """Get the model id and its configuration options for a given pipeline. Args: pipeline_id: id of the pipeline from which we want to extract the model. Returns: - (str, str): the model id and its configuration options. + (str, str, bool): the model id, its configuration options and if amp is enabled. """ pipeline: Pipeline = self.session.query(Pipeline).get(pipeline_id) - return pipeline.model_id, pipeline.model_config + return pipeline.model_id, pipeline.model_config, pipeline.amp diff --git a/modyn/metadata_database/models/pipelines.py b/modyn/metadata_database/models/pipelines.py index 7d78666f5..739160de1 100644 --- a/modyn/metadata_database/models/pipelines.py +++ b/modyn/metadata_database/models/pipelines.py @@ -1,7 +1,7 @@ """Pipeline model.""" from modyn.metadata_database.metadata_base import MetadataBase -from sqlalchemy import Column, Integer, String +from sqlalchemy import Boolean, Column, Integer, String class Pipeline(MetadataBase): @@ -14,6 +14,20 @@ class Pipeline(MetadataBase): num_workers = Column("num_workers", Integer, nullable=False) model_id = Column("model_id", String(length=50), nullable=False) model_config = Column("model_config", String(length=500), nullable=False) + amp = Column("amp", Boolean, nullable=False) + full_model_strategy_name = Column("full_model_strategy_name", String(length=50), nullable=False) + full_model_strategy_zip = Column("full_model_strategy_zip", Boolean, 
default=None, nullable=True) + full_model_strategy_zip_algorithm = Column( + "full_model_strategy_zip_algorithm", String(length=50), default=None, nullable=True + ) + full_model_strategy_config = Column("full_model_strategy_config", String(length=500), default=None, nullable=True) + inc_model_strategy_name = Column("inc_model_strategy_name", String(length=50), default=None, nullable=True) + inc_model_strategy_zip = Column("inc_model_strategy_zip", Boolean, default=None, nullable=True) + inc_model_strategy_zip_algorithm = Column( + "inc_model_strategy_zip_algorithm", String(length=50), default=None, nullable=True + ) + inc_model_strategy_config = Column("inc_model_strategy_config", String(length=500), default=None, nullable=True) + full_model_interval = Column("full_model_interval", Integer, default=None, nullable=True) def __repr__(self) -> str: """Return string representation.""" diff --git a/modyn/metadata_database/models/trained_models.py b/modyn/metadata_database/models/trained_models.py index 27484c416..111f2d40b 100644 --- a/modyn/metadata_database/models/trained_models.py +++ b/modyn/metadata_database/models/trained_models.py @@ -3,7 +3,8 @@ from modyn.metadata_database.metadata_base import MetadataBase from modyn.metadata_database.models.triggers import Trigger -from sqlalchemy import TIMESTAMP, Column, ForeignKeyConstraint, Integer, String +from sqlalchemy import TIMESTAMP, Column, ForeignKey, ForeignKeyConstraint, Integer, String +from sqlalchemy.orm import relationship class TrainedModel(MetadataBase): @@ -16,6 +17,9 @@ class TrainedModel(MetadataBase): trigger_id = Column("trigger_id", Integer) timestamp = Column("timestamp", TIMESTAMP(timezone=False), default=datetime.now()) model_path = Column("model_path", String(length=200), nullable=False) + metadata_path = Column("metadata_path", String(length=200), nullable=True, default=None) + parent_model = Column("parent_model", Integer, ForeignKey(f"{__tablename__}.model_id"), nullable=True, default=None) + 
children = relationship("TrainedModel") __table_args__ = ( ForeignKeyConstraint([pipeline_id, trigger_id], [Trigger.pipeline_id, Trigger.trigger_id]), {"extend_existing": True}, diff --git a/modyn/metadata_database/utils/__init__.py b/modyn/metadata_database/utils/__init__.py new file mode 100644 index 000000000..45e3fbcf4 --- /dev/null +++ b/modyn/metadata_database/utils/__init__.py @@ -0,0 +1,12 @@ +"""This package contains the database classes for the metadata module. + +The models are used to abstract the database operations. +This allows the storage module to be used with different databases. +""" +import os + +from .model_storage_strategy_config import ModelStorageStrategyConfig # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/metadata_database/utils/model_storage_strategy_config.py b/modyn/metadata_database/utils/model_storage_strategy_config.py new file mode 100644 index 000000000..de3663754 --- /dev/null +++ b/modyn/metadata_database/utils/model_storage_strategy_config.py @@ -0,0 +1,13 @@ +from typing import Optional + + +class ModelStorageStrategyConfig: + """ + Helper class to represent the configuration of a model storage strategy. 
+ """ + + def __init__(self, name: str): + self.name = name + self.zip: Optional[bool] = None + self.zip_algorithm: Optional[str] = None + self.config: Optional[str] = None diff --git a/modyn/model_storage/internal/__init__.py b/modyn/model_storage/internal/__init__.py index 3dc30aa40..005c390c4 100644 --- a/modyn/model_storage/internal/__init__.py +++ b/modyn/model_storage/internal/__init__.py @@ -6,6 +6,8 @@ import os +from .model_storage_manager import ModelStorageManager # noqa: F401 + files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") __all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py index 439e4cee3..8a4083551 100644 --- a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py +++ b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.py @@ -14,25 +14,25 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13model_storage.proto\x12\x13modyn.model_storage\"s\n\x14RegisterModelRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x0c\n\x04port\x18\x04 \x01(\x05\x12\x12\n\nmodel_path\x18\x05 \x01(\t\":\n\x15RegisterModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\"%\n\x11\x46\x65tchModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\"9\n\x12\x46\x65tchModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t\"&\n\x12\x44\x65leteModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\"&\n\x13\x44\x65leteModelResponse\x12\x0f\n\x07success\x18\x01 
\x01(\x08\x32\xbd\x02\n\x0cModelStorage\x12h\n\rRegisterModel\x12).modyn.model_storage.RegisterModelRequest\x1a*.modyn.model_storage.RegisterModelResponse\"\x00\x12_\n\nFetchModel\x12&.modyn.model_storage.FetchModelRequest\x1a\'.modyn.model_storage.FetchModelResponse\"\x00\x12\x62\n\x0b\x44\x65leteModel\x12\'.modyn.model_storage.DeleteModelRequest\x1a(.modyn.model_storage.DeleteModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13model_storage.proto\x12\x13modyn.model_storage\"\x85\x01\n\x14RegisterModelRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x0c\n\x04port\x18\x04 \x01(\x05\x12\x12\n\nmodel_path\x18\x05 \x01(\t\x12\x10\n\x08\x63hecksum\x18\x06 \x01(\x0c\":\n\x15RegisterModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\"<\n\x11\x46\x65tchModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\x12\x15\n\rload_metadata\x18\x02 \x01(\x08\"K\n\x12\x46\x65tchModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t\x12\x10\n\x08\x63hecksum\x18\x03 \x01(\x0c\"&\n\x12\x44\x65leteModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\"&\n\x13\x44\x65leteModelResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xbd\x02\n\x0cModelStorage\x12h\n\rRegisterModel\x12).modyn.model_storage.RegisterModelRequest\x1a*.modyn.model_storage.RegisterModelResponse\"\x00\x12_\n\nFetchModel\x12&.modyn.model_storage.FetchModelRequest\x1a\'.modyn.model_storage.FetchModelResponse\"\x00\x12\x62\n\x0b\x44\x65leteModel\x12\'.modyn.model_storage.DeleteModelRequest\x1a(.modyn.model_storage.DeleteModelResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'model_storage_pb2', globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _REGISTERMODELREQUEST._serialized_start=44 - 
_REGISTERMODELREQUEST._serialized_end=159 - _REGISTERMODELRESPONSE._serialized_start=161 - _REGISTERMODELRESPONSE._serialized_end=219 - _FETCHMODELREQUEST._serialized_start=221 - _FETCHMODELREQUEST._serialized_end=258 - _FETCHMODELRESPONSE._serialized_start=260 - _FETCHMODELRESPONSE._serialized_end=317 - _DELETEMODELREQUEST._serialized_start=319 - _DELETEMODELREQUEST._serialized_end=357 - _DELETEMODELRESPONSE._serialized_start=359 - _DELETEMODELRESPONSE._serialized_end=397 - _MODELSTORAGE._serialized_start=400 - _MODELSTORAGE._serialized_end=717 + _REGISTERMODELREQUEST._serialized_start=45 + _REGISTERMODELREQUEST._serialized_end=178 + _REGISTERMODELRESPONSE._serialized_start=180 + _REGISTERMODELRESPONSE._serialized_end=238 + _FETCHMODELREQUEST._serialized_start=240 + _FETCHMODELREQUEST._serialized_end=300 + _FETCHMODELRESPONSE._serialized_start=302 + _FETCHMODELRESPONSE._serialized_end=377 + _DELETEMODELREQUEST._serialized_start=379 + _DELETEMODELREQUEST._serialized_end=417 + _DELETEMODELRESPONSE._serialized_start=419 + _DELETEMODELRESPONSE._serialized_end=457 + _MODELSTORAGE._serialized_start=460 + _MODELSTORAGE._serialized_end=777 # @@protoc_insertion_point(module_scope) diff --git a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi index 9143824a0..019bdd69b 100644 --- a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi +++ b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi @@ -23,11 +23,13 @@ class RegisterModelRequest(google.protobuf.message.Message): HOSTNAME_FIELD_NUMBER: builtins.int PORT_FIELD_NUMBER: builtins.int MODEL_PATH_FIELD_NUMBER: builtins.int + CHECKSUM_FIELD_NUMBER: builtins.int pipeline_id: builtins.int trigger_id: builtins.int hostname: builtins.str port: builtins.int model_path: builtins.str + checksum: builtins.bytes def __init__( self, *, @@ -36,8 +38,9 @@ class RegisterModelRequest(google.protobuf.message.Message): hostname: 
builtins.str = ..., port: builtins.int = ..., model_path: builtins.str = ..., + checksum: builtins.bytes = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["hostname", b"hostname", "model_path", b"model_path", "pipeline_id", b"pipeline_id", "port", b"port", "trigger_id", b"trigger_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["checksum", b"checksum", "hostname", b"hostname", "model_path", b"model_path", "pipeline_id", b"pipeline_id", "port", b"port", "trigger_id", b"trigger_id"]) -> None: ... global___RegisterModelRequest = RegisterModelRequest @@ -64,13 +67,16 @@ class FetchModelRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor MODEL_ID_FIELD_NUMBER: builtins.int + LOAD_METADATA_FIELD_NUMBER: builtins.int model_id: builtins.int + load_metadata: builtins.bool def __init__( self, *, model_id: builtins.int = ..., + load_metadata: builtins.bool = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["load_metadata", b"load_metadata", "model_id", b"model_id"]) -> None: ... global___FetchModelRequest = FetchModelRequest @@ -80,15 +86,18 @@ class FetchModelResponse(google.protobuf.message.Message): SUCCESS_FIELD_NUMBER: builtins.int MODEL_PATH_FIELD_NUMBER: builtins.int + CHECKSUM_FIELD_NUMBER: builtins.int success: builtins.bool model_path: builtins.str + checksum: builtins.bytes def __init__( self, *, success: builtins.bool = ..., model_path: builtins.str = ..., + checksum: builtins.bytes = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_path", b"model_path", "success", b"success"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["checksum", b"checksum", "model_path", b"model_path", "success", b"success"]) -> None: ... 
global___FetchModelResponse = FetchModelResponse diff --git a/modyn/model_storage/internal/grpc/grpc_server.py b/modyn/model_storage/internal/grpc/grpc_server.py index 8580c6434..5d60e5fd2 100644 --- a/modyn/model_storage/internal/grpc/grpc_server.py +++ b/modyn/model_storage/internal/grpc/grpc_server.py @@ -15,14 +15,17 @@ class GRPCServer: """GRPC server context manager.""" - def __init__(self, modyn_config: dict, storage_dir: pathlib.Path) -> None: + def __init__(self, modyn_config: dict, storage_dir: pathlib.Path, ftp_directory: pathlib.Path) -> None: """Initialize the GRPC server. Args: modyn_config (dict): Configuration of the storage module. + storage_dir (path): Path to the model storage directory. + ftp_directory (path): Path to the ftp directory. """ self.modyn_config = modyn_config self.storage_dir = storage_dir + self.ftp_directory = ftp_directory self.server = grpc.server( futures.ThreadPoolExecutor( max_workers=10, @@ -39,7 +42,9 @@ def __enter__(self) -> grpc.Server: Returns: grpc.Server: GRPC server """ - add_ModelStorageServicer_to_server(ModelStorageGRPCServicer(self.modyn_config, self.storage_dir), self.server) + add_ModelStorageServicer_to_server( + ModelStorageGRPCServicer(self.modyn_config, self.storage_dir, self.ftp_directory), self.server + ) port = self.modyn_config["model_storage"]["port"] logger.info(f"Starting GRPC server. 
Listening on port {port}") self.server.add_insecure_port("[::]:" + port) diff --git a/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py b/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py index 62a13694a..0e16f950c 100644 --- a/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py +++ b/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py @@ -5,9 +5,9 @@ import pathlib import grpc -from modyn.common.ftp.ftp_utils import download_file -from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.models.trained_models import TrainedModel +import torch +from modyn.common.ftp.ftp_utils import download_file, get_pretrained_model_callback +from modyn.model_storage.internal import ModelStorageManager # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( @@ -19,7 +19,7 @@ RegisterModelResponse, ) from modyn.model_storage.internal.grpc.generated.model_storage_pb2_grpc import ModelStorageServicer -from modyn.utils import EMIT_MESSAGE_PERCENTAGES, current_time_millis +from modyn.utils import calculate_checksum, current_time_millis logger = logging.getLogger(__name__) @@ -27,16 +27,20 @@ class ModelStorageGRPCServicer(ModelStorageServicer): """GRPC servicer for the storage module.""" - def __init__(self, config: dict, storage_dir: pathlib.Path): + def __init__(self, config: dict, storage_dir: pathlib.Path, ftp_dir: pathlib.Path): """Initialize the model storage GRPC servicer. Args: config (dict): Configuration of the storage module. + storage_dir (path): Path to the model storage directory. + ftp_dir (path): Path to the ftp directory. 
""" super().__init__() self._config = config + self.ftp_dir = ftp_dir self.storage_dir = storage_dir + self.model_storage_manager = ModelStorageManager(self._config, self.storage_dir) def RegisterModel(self, request: RegisterModelRequest, context: grpc.ServicerContext) -> RegisterModelResponse: """Registers a new model at the model storage component by downloading it from a given server. @@ -55,39 +59,31 @@ def RegisterModel(self, request: RegisterModelRequest, context: grpc.ServicerCon logger.info(f"Try to download model from {hostname}:{port}, pipeline {pipeline_id} and trigger {trigger_id}.") local_file_name = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.modyn" - local_model_path = self.storage_dir / local_file_name + local_model_path = self.ftp_dir / local_file_name logger.info(f"Remote model path is {remote_model_path}, storing at {local_model_path}.") - last_progress = 0.0 - - def callback(current_progress: float) -> None: - nonlocal last_progress - for emit_perc in EMIT_MESSAGE_PERCENTAGES: - if last_progress <= emit_perc < current_progress: - logger.info(f"Completed {emit_perc * 100}% of the download.") - last_progress = current_progress - - download_file( + success = download_file( hostname, port, "modyn", "modyn", remote_file_path=pathlib.Path(remote_model_path), local_file_path=local_model_path, - callback=callback, + callback=get_pretrained_model_callback(logger), + checksum=request.checksum, ) - logger.info("Download completed.") + if not success: + logger.error("Downloaded file does not match its checksum.") + return RegisterModelResponse(success=False) - response = RegisterModelResponse() + logger.info("Download completed. 
Invoking model storage manager.") - with MetadataDatabaseConnection(self._config) as database: - model_id = database.add_trained_model(pipeline_id, trigger_id, local_file_name) - response.model_id = model_id - response.success = True + model_id = self.model_storage_manager.store_model(pipeline_id, trigger_id, local_model_path) + os.remove(local_model_path) - return response + return RegisterModelResponse(success=True, model_id=model_id) def FetchModel(self, request: FetchModelRequest, context: grpc.ServicerContext) -> FetchModelResponse: """Fetch a model from the model storage component. @@ -101,21 +97,19 @@ def FetchModel(self, request: FetchModelRequest, context: grpc.ServicerContext) """ logger.info(f"Try to fetch model having id {request.model_id}") - response = FetchModelResponse() - with MetadataDatabaseConnection(self._config) as database: - model: TrainedModel = database.session.get(TrainedModel, request.model_id) - - if model: - response.model_path = model.model_path - response.success = True - - logger.info(f"Trained model {request.model_id} has local path {self.storage_dir / model.model_path}") - else: - response.success = False - - logger.warning(f"Trained model {request.model_id} was not found.") - - return response + model_dict = self.model_storage_manager.load_model(request.model_id, request.load_metadata) + if not model_dict: + logger.error(f"Trained model {request.model_id} could not be fetched.") + return FetchModelResponse(success=False) + model_file_path = self.ftp_dir / f"{current_time_millis()}_{request.model_id}.modyn" + torch.save(model_dict, model_file_path) + + logger.info(f"Trained model {request.model_id} has local path {model_file_path}") + return FetchModelResponse( + success=True, + model_path=str(model_file_path.relative_to(self.ftp_dir)), + checksum=calculate_checksum(model_file_path), + ) def DeleteModel(self, request: DeleteModelRequest, context: grpc.ServicerContext) -> DeleteModelResponse: """Delete model from the model storage 
component. @@ -127,27 +121,15 @@ def DeleteModel(self, request: DeleteModelRequest, context: grpc.ServicerContext Returns: DeleteModelResponse: the response containing information if the model was found in the database. """ - - logger.info(f"Try to delete model having id {request.model_id}") + model_id = request.model_id + logger.info(f"Try to delete model having id {model_id}") response = DeleteModelResponse() - with MetadataDatabaseConnection(self._config) as database: - model: TrainedModel = database.session.get(TrainedModel, request.model_id) - - if model: - local_model_path = self.storage_dir / model.model_path - os.remove(local_model_path) - - database.session.delete(model) - database.session.commit() - - response.success = True - logger.info( - f"Trained model {request.model_id} with path {self.storage_dir / model.model_path} has been removed" - ) - else: - response.success = False - - logger.warning(f"Trained model {request.model_id} was not found.") + success = self.model_storage_manager.delete_model(model_id) + if success: + logger.info(f"Deleted model {request.model_id}.") + else: + logger.error(f"Deletion of model {request.model_id} was not successful.") + response.success = success return response diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py new file mode 100644 index 000000000..811f4e9e7 --- /dev/null +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -0,0 +1,190 @@ +import json +import logging +import os +import pathlib +import tempfile +from typing import Optional + +import torch +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import Pipeline, TrainedModel +from modyn.model_storage.internal.utils import ModelStorageStrategy +from modyn.utils import current_time_millis, dynamic_module_import, unzip_file, zip_file + +logger = logging.getLogger(__name__) + + +class 
class ModelStorageManager:
    """
    Class used as manager of the model storage component. Implements all model storage related functionalities.
    """

    def __init__(self, modyn_config: dict, storage_dir: pathlib.Path):
        """
        Args:
            modyn_config: the modyn configuration.
            storage_dir: directory in which models and their metadata are stored.
        """
        self._modyn_config = modyn_config
        self._storage_dir = storage_dir

    def store_model(self, pipeline_id: int, trigger_id: int, checkpoint_path: pathlib.Path) -> int:
        """
        Store the model contained in the checkpoint and register it in the metadata database.

        Args:
            pipeline_id: the pipeline that trained the model.
            trigger_id: the trigger whose training produced the model.
            checkpoint_path: path to the checkpoint holding model weights and optimizer state.

        Returns:
            int: id under which the trained model was registered.
        """
        checkpoint = torch.load(checkpoint_path)

        model_storage_strategy = self.get_model_storage_strategy(pipeline_id)

        assert "model" in checkpoint
        state_dict = checkpoint["model"]
        local_model_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.model"
        model_path = self._storage_dir / local_model_filename
        parent_id = self._handle_new_model(pipeline_id, trigger_id, state_dict, model_path, model_storage_strategy)
        checkpoint.pop("model")

        # now checkpoint only contains optimizer state and metadata
        local_metadata_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.metadata.zip"
        metadata_path = self._storage_dir / local_metadata_filename

        with tempfile.NamedTemporaryFile() as temp_file:
            torch.save(checkpoint, temp_file)
            zip_file(pathlib.Path(temp_file.name), metadata_path)

        with MetadataDatabaseConnection(self._modyn_config) as database:
            return database.add_trained_model(
                pipeline_id, trigger_id, local_model_filename, local_metadata_filename, parent_id
            )

    def _handle_new_model(
        self,
        pipeline_id: int,
        trigger_id: int,
        state_dict: dict,
        model_path: pathlib.Path,
        model_storage_strategy: ModelStorageStrategy,
    ) -> Optional[int]:
        """
        Store the model state, incrementally (as a delta to the previous model) when possible,
        otherwise as a full model.

        Returns:
            Optional[int]: id of the parent model if stored incrementally, None if stored fully.
        """
        # Incremental storage only applies when configured, and not on full-model-interval triggers.
        if model_storage_strategy.incremental_model_strategy and (
            model_storage_strategy.full_model_interval is None
            or trigger_id % model_storage_strategy.full_model_interval != 0
        ):
            prev_model: Optional[TrainedModel] = self._get_previous_model(pipeline_id, trigger_id)
            if prev_model:
                # handle incremental model storage
                previous_model_state = self._get_base_model_state(pipeline_id)

                # load previous model state
                self._reconstruct_model(prev_model.model_id, previous_model_state, model_storage_strategy)

                # store incremental model
                model_storage_strategy.incremental_model_strategy.save_model(
                    state_dict, previous_model_state, model_path
                )

                return prev_model.model_id
            logger.warning("Previous model is not available! Storing full model...")

        # handle full model storage
        model_storage_strategy.full_model_strategy.save_model(state_dict, model_path)
        return None

    def _get_base_model_state(self, pipeline_id: int) -> dict:
        """Instantiate the pipeline's model architecture on CPU and return its fresh state dict."""
        with MetadataDatabaseConnection(self._modyn_config) as database:
            model_id, model_config, amp = database.get_model_configuration(pipeline_id)
        model_module = dynamic_module_import("modyn.models")
        assert hasattr(model_module, model_id), f"Model {model_id} not available."

        model_handler = getattr(model_module, model_id)
        return model_handler(json.loads(model_config), "cpu", amp).model.state_dict()

    def _reconstruct_model(
        self, model_id: int, model_state: dict, model_storage_strategy: ModelStorageStrategy
    ) -> None:
        """Recursively overwrite model_state in place until it equals the state of the given model."""
        # we recursively overwrite the model state
        with MetadataDatabaseConnection(self._modyn_config) as database:
            model: TrainedModel = database.session.get(TrainedModel, model_id)
        if not model.parent_model:
            # base case: we can load a fully stored model
            model_storage_strategy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path)
            return

        # recursive step: we recurse to load the model state of the parent model
        self._reconstruct_model(model.parent_model, model_state, model_storage_strategy)

        # we apply the incremental strategy to load our model state
        model_storage_strategy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path)

    def _get_previous_model(self, pipeline_id: int, trigger_id: int) -> Optional[TrainedModel]:
        """Return the model produced by the directly preceding trigger of this pipeline, if any."""
        with MetadataDatabaseConnection(self._modyn_config) as database:
            return (
                database.session.query(TrainedModel)
                .filter(TrainedModel.pipeline_id == pipeline_id, TrainedModel.trigger_id == trigger_id - 1)
                .first()
            )

    def load_model(self, model_id: int, metadata: bool) -> Optional[dict]:
        """
        Load a trained model, optionally including its metadata (optimizer state etc.).

        Args:
            model_id: id of the model to load.
            metadata: whether to additionally load the stored metadata.

        Returns:
            Optional[dict]: dict with a "model" key (plus metadata keys if requested), or None on failure.
        """
        with MetadataDatabaseConnection(self._modyn_config) as database:
            model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id)
            if model is None:
                logger.error(f"Model {model_id} does not exist.")
                return None
        model_storage_strategy = self.get_model_storage_strategy(model.pipeline_id)

        model_state = self._get_base_model_state(model.pipeline_id)
        self._reconstruct_model(model_id, model_state, model_storage_strategy)
        model_dict = {"model": model_state}

        if metadata:
            if not model.metadata_path:
                logger.error(f"Metadata not available for model {model_id}")
                return None
            with tempfile.NamedTemporaryFile() as temp_file:
                temp_file_path = pathlib.Path(temp_file.name)
                unzip_file(self._storage_dir / model.metadata_path, temp_file_path)
                metadata_dict = torch.load(temp_file_path)
            model_dict.update(metadata_dict)

        return model_dict

    def delete_model(self, model_id: int) -> bool:
        """
        Delete a model; its children are converted to fully-stored models first so they stay loadable.

        Returns:
            bool: True on success, False if the model does not exist.
        """
        with MetadataDatabaseConnection(self._modyn_config) as database:
            model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id)

            if model is None:
                logger.error(f"Trained model {model_id} was not found.")
                return False
            model_storage_strategy = self.get_model_storage_strategy(model.pipeline_id)
            child_state = self._get_base_model_state(model.pipeline_id)

            child: TrainedModel
            for child in model.children:
                assert child.pipeline_id == model.pipeline_id, "Pipeline does not match for parent and child model"

                # Re-materialize each child as a full model before its parent disappears.
                self._reconstruct_model(child.model_id, child_state, model_storage_strategy)
                model_storage_strategy.full_model_strategy.save_model(child_state, self._storage_dir / child.model_path)
                database.session.query(TrainedModel).filter(TrainedModel.model_id == child.model_id).update(
                    {"parent_model": None}
                )

            os.remove(self._storage_dir / model.model_path)
            if model.metadata_path:
                os.remove(self._storage_dir / model.metadata_path)

            database.session.delete(model)
            database.session.commit()
        logger.info(f"Successfully deleted model {model_id} and converted child models to be fully stored.")
        return True

    def get_model_storage_strategy(self, pipeline_id: int) -> ModelStorageStrategy:
        """Build the model storage strategy configured for the given pipeline."""
        with MetadataDatabaseConnection(self._modyn_config) as database:
            # Consistency fix: use Session.get (as for TrainedModel elsewhere in this class)
            # instead of the legacy Query.get, which is deprecated in SQLAlchemy 1.4+/2.0.
            pipeline: Pipeline = database.session.get(Pipeline, pipeline_id)

        strategy = ModelStorageStrategy(
            pipeline.full_model_strategy_name,
            pipeline.full_model_strategy_zip,
            pipeline.full_model_strategy_zip_algorithm,
            pipeline.full_model_strategy_config,
        )

        if pipeline.inc_model_strategy_name is not None:
            strategy.register_incremental_model_strategy(
                pipeline.inc_model_strategy_name,
                pipeline.inc_model_strategy_zip,
                pipeline.inc_model_strategy_zip_algorithm,
                pipeline.inc_model_strategy_config,
                pipeline.full_model_interval,
            )

        return strategy
+""" + +import os + +from .abstract_difference_operator import AbstractDifferenceOperator # noqa: F401 +from .abstract_model_storage_strategy import AbstractModelStorageStrategy # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py b/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py new file mode 100644 index 000000000..25c4bef79 --- /dev/null +++ b/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py @@ -0,0 +1,41 @@ +from abc import ABC, abstractmethod +from typing import BinaryIO + +import torch + + +class AbstractDifferenceOperator(ABC): + """ + This is the base class for all difference operators. These operators can be used to calculate the difference + between two successive models in the pipeline and later be used in a incremental model storage strategy. + """ + + @staticmethod + @abstractmethod + def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> bytes: + """ + Calculate the difference between two tensors. + + Args: + tensor: the tensor representing some weights of the current model. + tensor_prev: the tensor representing the same weights of the preceding model. + + Returns: + bytes: the byte-level difference. + """ + raise NotImplementedError() + + @staticmethod + @abstractmethod + def restore(tensor_prev: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor: + """ + Restores a weight tensor. + + Args: + tensor_prev: the tensor representing some weights of the preceding model. + bytestream: difference bytes from which to restore the weights of the current model. + + Returns: + tensor: the weight tensor of the current model. 
class AbstractModelStorageStrategy(ABC):
    """
    Base class for all model storage strategies.
    """

    def __init__(self, zip_activated: bool, zip_algorithm_name: str, config: dict):
        """
        Initialize a model storage strategy.

        Args:
            zip_activated: whether the generated file(s) are zipped.
            zip_algorithm_name: name of the zip algorithm.
            config: configuration options for the strategy.
        """
        self.zip = zip_activated
        # Default to DEFLATE; a custom algorithm may replace it below.
        self.zip_algorithm = ZIP_DEFLATED
        self._validate_zip_config(zip_algorithm_name)

        self.validate_config(config)

    @abstractmethod
    def validate_config(self, config: dict) -> None:
        """
        Validate the strategy-dependent configuration options.

        Args:
            config: the configuration options.
        """
        raise NotImplementedError()

    def _validate_zip_config(self, zip_algorithm_name: str) -> None:
        # A custom algorithm is only resolved when zipping is enabled and a name was given.
        if not (self.zip and zip_algorithm_name):
            return
        zip_module = dynamic_module_import("zipfile")
        if not hasattr(zip_module, zip_algorithm_name):
            raise NotImplementedError(f"The zip algorithm {zip_algorithm_name} is unknown!")
        self.zip_algorithm = getattr(zip_module, zip_algorithm_name)
class SubDifferenceOperator(AbstractDifferenceOperator):
    """Difference operator that stores the element-wise subtraction of two weight tensors."""

    @staticmethod
    def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> bytes:
        # Serialize (current - previous) so that adding the delta back restores the current weights.
        return (tensor - tensor_prev).numpy().tobytes()

    @staticmethod
    def restore(tensor_prev: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor:
        delta = read_tensor_from_bytes(tensor_prev, bytestream)
        return tensor_prev + delta
class XorDifferenceOperator(AbstractDifferenceOperator):
    """Difference operator that stores the byte-wise XOR of two weight tensors."""

    @staticmethod
    def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> bytes:
        curr_bytes = tensor.numpy().tobytes()
        prev_bytes = tensor_prev.numpy().tobytes()

        # XOR of equal-length byte strings; XOR-ing with the previous bytes again restores the current ones.
        return bytes(curr ^ prev for curr, prev in zip(curr_bytes, prev_bytes))

    @staticmethod
    def restore(tensor_prev: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor:
        shape = tensor_prev.shape
        # Read exactly as many bytes as the previous tensor occupies.
        total_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor_prev.dtype]
        delta: bytes = bytestream.read(total_bytes)
        prev_bytes = tensor_prev.numpy().tobytes()
        restored = bytes(d ^ p for d, p in zip(delta, prev_bytes))
        np_dtype = np.dtype(torch_dtype_to_numpy_dict[tensor_prev.dtype])
        return create_tensor(restored, np_dtype, shape)
class AbstractFullModelStrategy(AbstractModelStorageStrategy, ABC):
    """
    Base class for all full model strategies, i.e. strategies that persist enough information
    to reproduce a model state on their own.
    """

    @abstractmethod
    def _save_model(self, model_state: dict, file_path: pathlib.Path) -> None:
        """
        Store the model state to the given file.

        Args:
            model_state: the state dictionary of the model.
            file_path: the path to the file in which to store the state.
        """
        raise NotImplementedError()

    def save_model(self, model_state: dict, file_path: pathlib.Path) -> None:
        """Store the model state, zipping the result when zipping is enabled."""
        if not self.zip:
            self._save_model(model_state, file_path)
            return
        with tempfile.NamedTemporaryFile() as temporary_file:
            unzipped_path = pathlib.Path(temporary_file.name)
            self._save_model(model_state, unzipped_path)
            zip_file(unzipped_path, file_path, self.zip_algorithm, remove_file=False)

    @abstractmethod
    def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None:
        """
        Load the model state from the given file.

        Args:
            base_model_state: the base model state which must be overwritten.
            file_path: the path to the file that contains the state information.
        """
        raise NotImplementedError()

    def load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None:
        """Load the model state, transparently unzipping when zipping is enabled."""
        if not self.zip:
            self._load_model(base_model_state, file_path)
            return
        with tempfile.NamedTemporaryFile() as temporary_file:
            unzipped_path = pathlib.Path(temporary_file.name)
            unzip_file(file_path, unzipped_path, compression=self.zip_algorithm, remove_file=False)
            self._load_model(base_model_state, unzipped_path)
class PyTorchFullModel(AbstractFullModelStrategy):
    """
    Full model strategy that naively persists the whole state dictionary on disk
    (default pytorch implementation).
    """

    def _save_model(self, model_state: dict, file_path: pathlib.Path) -> None:
        # Plain torch serialization of the complete state dict.
        torch.save(model_state, file_path)

    def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None:
        # Overwrite the base state with everything stored in the file.
        base_model_state.update(torch.load(file_path))

    def validate_config(self, config: dict) -> None:
        # This strategy takes no configuration options.
        pass
class AbstractIncrementalModelStrategy(AbstractModelStorageStrategy, ABC):
    """
    Base class for all incremental model strategies. They store a delta between two successive
    models, from which the later model can be reproduced given the earlier one.
    """

    @abstractmethod
    def _save_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None:
        """
        Store the delta between two successive models.

        Args:
            model_state: the newer model state.
            prev_model_state: the state of the preceding model.
            file_path: the path to the file in which the delta is stored.
        """
        raise NotImplementedError()

    def save_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None:
        """Store the delta, zipping the result when zipping is enabled."""
        if not self.zip:
            self._save_model(model_state, prev_model_state, file_path)
            return
        with tempfile.NamedTemporaryFile() as temporary_file:
            unzipped_path = pathlib.Path(temporary_file.name)
            self._save_model(model_state, prev_model_state, unzipped_path)
            zip_file(unzipped_path, file_path, self.zip_algorithm, remove_file=False)

    @abstractmethod
    def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None:
        """
        Load a model state by overwriting the state of the preceding model.

        Args:
            prev_model_state: the state of the preceding model.
            file_path: the path to the file which contains the delta.
        """
        raise NotImplementedError()

    def load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None:
        """Load the model state, transparently unzipping when zipping is enabled."""
        if not self.zip:
            self._load_model(prev_model_state, file_path)
            return
        with tempfile.NamedTemporaryFile() as temporary_file:
            unzipped_path = pathlib.Path(temporary_file.name)
            unzip_file(file_path, unzipped_path, compression=self.zip_algorithm, remove_file=False)
            self._load_model(prev_model_state, unzipped_path)
class WeightsDifference(AbstractIncrementalModelStrategy):
    """
    This incremental model strategy stores the delta between two successive model states as difference of their
    weight tensors. It currently supports two difference operators: xor and sub.
    """

    def __init__(self, zip_activated: bool, zip_algorithm_name: str, config: dict):
        # Defaults; validate_config (invoked by the base-class __init__) may override them.
        self.difference_operator = SubDifferenceOperator
        self.split_exponent = False

        super().__init__(zip_activated, zip_algorithm_name, config)

    def _save_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None:
        """Write the concatenated per-tensor differences between the two states to file_path."""
        bytestream = io.BytesIO()

        # Iterates both state dicts in insertion order; assumes identical layer layout — TODO confirm.
        for tensor_model, tensor_prev_model in zip(model_state.values(), prev_model_state.values()):
            bytestream.write(self.difference_operator.calculate_difference(tensor_model, tensor_prev_model))

        with open(file_path, "wb") as file:
            file.write(bytestream.getbuffer().tobytes())

    def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None:
        """Restore the newer model state in place by applying the stored differences to prev_model_state."""
        with open(file_path, "rb") as file:
            for layer_name, tensor in prev_model_state.items():
                prev_model_state[layer_name] = self.difference_operator.restore(tensor, file)

    def validate_config(self, config: dict) -> None:
        """
        Validate the strategy configuration ("operator" and "split_exponent" keys).

        Raises:
            ValueError: if an unknown difference operator is configured.
        """
        if "operator" in config:
            difference_operator_name = config["operator"]
            if difference_operator_name not in available_difference_operators:
                # Fix: report the valid operator names instead of the raw name->class mapping.
                raise ValueError(f"Operator should be one of {list(available_difference_operators)}.")
            self.difference_operator = available_difference_operators[difference_operator_name]
        # Idiomatic dict.get with default replaces the conditional-expression membership test.
        self.split_exponent = config.get("split_exponent", False)
"""
This class provides useful functionalities for different data types and conversions between them.
"""
import math
from typing import BinaryIO

import numpy as np
import torch

# Mapping from torch dtypes to the corresponding numpy dtypes.
torch_dtype_to_numpy_dict = {
    torch.uint8: np.uint8,
    torch.int8: np.int8,
    torch.int16: np.int16,
    torch.int32: np.int32,
    torch.int64: np.int64,
    torch.float16: np.float16,
    torch.float32: np.float32,
    torch.float64: np.float64,
    torch.complex64: np.complex64,
    torch.complex128: np.complex128,
}

# Inverse mapping: numpy dtype -> torch dtype.
numpy_dtype_to_torch_dict = {value: key for (key, value) in torch_dtype_to_numpy_dict.items()}

# Size in bytes of a single element of each torch dtype.
torch_dtype_to_byte_size = {
    torch.uint8: 1,
    torch.int8: 1,
    torch.int16: 2,
    torch.int32: 4,
    torch.int64: 8,
    torch.float16: 2,
    torch.float32: 4,
    torch.float64: 8,
    torch.complex64: 8,
    torch.complex128: 16,
}


def read_tensor_from_bytes(tensor: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor:
    """Read from the stream exactly the bytes needed for a tensor shaped and typed like *tensor*."""
    num_bytes = math.prod(tensor.shape) * torch_dtype_to_byte_size[tensor.dtype]
    raw_bytes = bytestream.read(num_bytes)
    target_dtype = np.dtype(torch_dtype_to_numpy_dict[tensor.dtype])
    return create_tensor(raw_bytes, target_dtype, tensor.shape)


def create_tensor(buffer: bytes, dtype: np.dtype, shape: torch.Size) -> torch.Tensor:
    """Build a tensor of the given shape from little-endian raw bytes."""
    little_endian_dtype = dtype.newbyteorder("<")
    # np.frombuffer yields a read-only view; np.array copies it into a writable buffer.
    flat = np.array(np.frombuffer(buffer, dtype=little_endian_dtype))
    return torch.tensor(flat).reshape(shape)
+ """ + + def __init__( + self, + full_model_strategy_name: str, + full_model_strategy_zip: Optional[bool], + full_model_strategy_zip_algorithm: Optional[str], + full_model_strategy_config: Optional[str], + ) -> None: + self.full_model_strategy: AbstractFullModelStrategy = self._setup_model_storage_strategy( + full_model_strategy_name, + full_model_strategy_zip, + full_model_strategy_zip_algorithm, + full_model_strategy_config, + FULL_MODEL_STRATEGY_MODULE, + ) + + self.incremental_model_strategy: Optional[AbstractIncrementalModelStrategy] = None + self.full_model_interval: Optional[int] = None + + def register_incremental_model_strategy( + self, + name: str, + zip_enabled: Optional[bool], + zip_algorithm: Optional[str], + config: Optional[str], + full_model_interval: Optional[int], + ) -> None: + self.incremental_model_strategy = self._setup_model_storage_strategy( + name, zip_enabled, zip_algorithm, config, INCREMENTAL_MODEL_STRATEGY_MODULE + ) + if full_model_interval is not None: + self._validate_full_model_interval(full_model_interval) + + def _validate_full_model_interval(self, full_model_interval: int) -> None: + if full_model_interval <= 0: + raise ValueError("Full model interval should be positive.") + self.full_model_interval = full_model_interval + + @staticmethod + def _setup_model_storage_strategy( + name: str, zip_enabled: Optional[bool], zip_algorithm: Optional[str], config: Optional[str], module_name: str + ) -> Union[AbstractFullModelStrategy, AbstractIncrementalModelStrategy]: + model_storage_module = dynamic_module_import(module_name) + if not hasattr(model_storage_module, name): + raise NotImplementedError(f"Strategy {name} not implemented!") + model_storage_strategy_handler = getattr(model_storage_module, name) + strategy_config = json.loads(config) if config else {} + return model_storage_strategy_handler(zip_enabled or False, zip_algorithm or "ZIP_DEFLATED", strategy_config) diff --git a/modyn/model_storage/model_storage.py 
b/modyn/model_storage/model_storage.py index 5ac2407ec..3b918a3af 100644 --- a/modyn/model_storage/model_storage.py +++ b/modyn/model_storage/model_storage.py @@ -1,34 +1,31 @@ import os import pathlib -from typing import Optional, Tuple +import shutil +import tempfile -from jsonschema import ValidationError from modyn.common.ftp.ftp_server import FTPServer from modyn.model_storage.internal.grpc.grpc_server import GRPCServer -from modyn.utils import validate_yaml class ModelStorage: def __init__(self, config: dict) -> None: self.config = config + self._setup_model_storage_directories() - valid, errors = self._validate_config() - if not valid: - raise ValueError(f"Invalid configuration: {errors}") + def _setup_model_storage_directories(self) -> None: + self.model_storage_directory = pathlib.Path(os.getcwd()) / "model_storage" + self.ftp_directory = pathlib.Path(tempfile.gettempdir()) / "ftp_model_storage" - self._setup_model_storage_directory() + os.makedirs(self.model_storage_directory, exist_ok=True) - def _validate_config(self) -> Tuple[bool, Optional[ValidationError]]: - schema_path = ( - pathlib.Path(os.path.abspath(__file__)).parent.parent / "config" / "schema" / "modyn_config_schema.yaml" - ) - return validate_yaml(self.config, schema_path) + if self.ftp_directory.exists() and self.ftp_directory.is_dir(): + shutil.rmtree(self.ftp_directory) - def _setup_model_storage_directory(self) -> None: - self.model_storage_directory = pathlib.Path(os.getcwd()) / "model_storage" - os.makedirs(self.model_storage_directory) + self.ftp_directory.mkdir() def run(self) -> None: - with GRPCServer(self.config, self.model_storage_directory) as server: - with FTPServer(self.config["model_storage"]["ftp_port"], self.model_storage_directory): + with GRPCServer(self.config, self.model_storage_directory, self.ftp_directory) as server: + with FTPServer(self.config["model_storage"]["ftp_port"], self.ftp_directory): server.wait_for_termination() + + shutil.rmtree(self.ftp_directory) 
diff --git a/modyn/protos/evaluator.proto b/modyn/protos/evaluator.proto index ff95f1b2a..148e75df9 100644 --- a/modyn/protos/evaluator.proto +++ b/modyn/protos/evaluator.proto @@ -27,14 +27,11 @@ message EvaluateModelRequest { int32 trained_model_id = 1; DatasetInfo dataset_info = 2; string device = 3; - bool amp = 4; - int32 batch_size = 5; - repeated MetricConfiguration metrics = 6; - string model_id = 7; - JsonString model_configuration = 8; - repeated string transform_list = 9; - PythonString bytes_parser = 10; - PythonString label_transformer = 11; + int32 batch_size = 4; + repeated MetricConfiguration metrics = 5; + repeated string transform_list = 6; + PythonString bytes_parser = 7; + PythonString label_transformer = 8; } message EvaluateModelResponse { diff --git a/modyn/protos/model_storage.proto b/modyn/protos/model_storage.proto index 03e177916..bd4184af9 100644 --- a/modyn/protos/model_storage.proto +++ b/modyn/protos/model_storage.proto @@ -14,6 +14,7 @@ message RegisterModelRequest { string hostname = 3; int32 port = 4; string model_path = 5; + bytes checksum = 6; } message RegisterModelResponse { @@ -23,11 +24,13 @@ message RegisterModelResponse { message FetchModelRequest { int32 model_id = 1; + bool load_metadata = 2; } message FetchModelResponse { bool success = 1; string model_path = 2; + bytes checksum = 3; } message DeleteModelRequest { diff --git a/modyn/protos/selector.proto b/modyn/protos/selector.proto index 5c41c0b54..e0beca5e6 100644 --- a/modyn/protos/selector.proto +++ b/modyn/protos/selector.proto @@ -24,6 +24,19 @@ message Empty {} message JsonString { string value = 1; } +message StrategyConfig { + string name = 1; + optional bool zip = 2; + optional string zip_algorithm = 3; + optional JsonString config = 4; +} + +message ModelStorageStrategyInfo { + StrategyConfig full_model_strategy_config = 1; + optional StrategyConfig incremental_model_strategy_config = 2; + optional int32 full_model_interval = 3; +} + message DataInformRequest 
{ int32 pipeline_id = 1; repeated int64 keys = 2; @@ -38,6 +51,8 @@ message RegisterPipelineRequest { JsonString selection_strategy = 2; string model_id = 3; JsonString model_configuration = 4; + bool amp = 5; + ModelStorageStrategyInfo model_storage_strategy = 6; } message PipelineResponse { int32 pipeline_id = 1; } diff --git a/modyn/protos/trainer_server.proto b/modyn/protos/trainer_server.proto index 6698b7617..01be4aee3 100644 --- a/modyn/protos/trainer_server.proto +++ b/modyn/protos/trainer_server.proto @@ -35,23 +35,22 @@ message StartTrainingRequest { int32 pipeline_id = 1; int32 trigger_id = 2; string device = 3; - bool amp = 4; - bool use_pretrained_model = 7; - bool load_optimizer_state = 8; - int32 pretrained_model_id = 9; - int32 batch_size = 10; - JsonString torch_optimizers_configuration = 11; - string torch_criterion = 12; - JsonString criterion_parameters = 13; - Data data_info = 14; - CheckpointInfo checkpoint_info = 15; - PythonString bytes_parser = 16; - repeated string transform_list = 17; - JsonString lr_scheduler = 18; - PythonString label_transformer = 19; - JsonString grad_scaler_configuration = 20; - int32 epochs_per_trigger = 21; - optional int32 seed = 22; + bool use_pretrained_model = 4; + bool load_optimizer_state = 5; + int32 pretrained_model_id = 6; + int32 batch_size = 7; + JsonString torch_optimizers_configuration = 8; + string torch_criterion = 9; + JsonString criterion_parameters = 10; + Data data_info = 11; + CheckpointInfo checkpoint_info = 12; + PythonString bytes_parser = 13; + repeated string transform_list = 14; + JsonString lr_scheduler = 15; + PythonString label_transformer = 16; + JsonString grad_scaler_configuration = 17; + int32 epochs_per_trigger = 18; + optional int32 seed = 19; } message StartTrainingResponse { diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.py b/modyn/selector/internal/grpc/generated/selector_pb2.py index 2bec54a51..a5aa26e7b 100644 --- 
a/modyn/selector/internal/grpc/generated/selector_pb2.py +++ b/modyn/selector/internal/grpc/generated/selector_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"%\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\"\xa5\x01\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\x12\x10\n\x08model_id\x18\x03 \x01(\t\x12\x31\n\x13model_configuration\x18\x04 \x01(\x0b\x32\x14.selector.JsonString\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 
\x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xe9\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12=\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x0f.selector.Empty\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a .selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x9c\x01\n\x0eStrategyConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x03zip\x18\x02 \x01(\x08H\x00\x88\x01\x01\x12\x1a\n\rzip_algorithm\x18\x03 \x01(\tH\x01\x88\x01\x01\x12)\n\x06\x63onfig\x18\x04 \x01(\x0b\x32\x14.selector.JsonStringH\x02\x88\x01\x01\x42\x06\n\x04_zipB\x10\n\x0e_zip_algorithmB\t\n\x07_config\"\x82\x02\n\x18ModelStorageStrategyInfo\x12<\n\x1a\x66ull_model_strategy_config\x18\x01 \x01(\x0b\x32\x18.selector.StrategyConfig\x12H\n!incremental_model_strategy_config\x18\x02 \x01(\x0b\x32\x18.selector.StrategyConfigH\x00\x88\x01\x01\x12 \n\x13\x66ull_model_interval\x18\x03 \x01(\x05H\x01\x88\x01\x01\x42$\n\"_incremental_model_strategy_configB\x16\n\x14_full_model_interval\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"%\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\"\xf6\x01\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\x12\x10\n\x08model_id\x18\x03 \x01(\t\x12\x31\n\x13model_configuration\x18\x04 \x01(\x0b\x32\x14.selector.JsonString\x12\x0b\n\x03\x61mp\x18\x05 \x01(\x08\x12\x42\n\x16model_storage_strategy\x18\x06 \x01(\x0b\x32\".selector.ModelStorageStrategyInfo\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 
\x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xe9\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12=\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x0f.selector.Empty\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a 
.selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'selector_pb2', globals()) @@ -25,46 +25,50 @@ _EMPTY._serialized_end=35 _JSONSTRING._serialized_start=37 _JSONSTRING._serialized_end=64 - _DATAINFORMREQUEST._serialized_start=66 - _DATAINFORMREQUEST._serialized_end=156 - _TRIGGERRESPONSE._serialized_start=158 - _TRIGGERRESPONSE._serialized_end=195 - _REGISTERPIPELINEREQUEST._serialized_start=198 - _REGISTERPIPELINEREQUEST._serialized_end=363 - _PIPELINERESPONSE._serialized_start=365 - _PIPELINERESPONSE._serialized_end=404 - _GETSAMPLESREQUEST._serialized_start=406 - _GETSAMPLESREQUEST._serialized_end=507 - _SAMPLESRESPONSE._serialized_start=509 - _SAMPLESRESPONSE._serialized_end=593 - _GETNUMBEROFSAMPLESREQUEST._serialized_start=595 - _GETNUMBEROFSAMPLESREQUEST._serialized_end=663 - _NUMBEROFSAMPLESRESPONSE._serialized_start=665 - _NUMBEROFSAMPLESRESPONSE._serialized_end=711 - _GETSTATUSBARSCALEREQUEST._serialized_start=713 - _GETSTATUSBARSCALEREQUEST._serialized_end=760 - _STATUSBARSCALERESPONSE._serialized_start=762 - _STATUSBARSCALERESPONSE._serialized_end=812 - _GETNUMBEROFPARTITIONSREQUEST._serialized_start=814 - _GETNUMBEROFPARTITIONSREQUEST._serialized_end=885 - _NUMBEROFPARTITIONSRESPONSE._serialized_start=887 - _NUMBEROFPARTITIONSRESPONSE._serialized_end=939 - 
_GETAVAILABLELABELSREQUEST._serialized_start=941 - _GETAVAILABLELABELSREQUEST._serialized_end=989 - _AVAILABLELABELSRESPONSE._serialized_start=991 - _AVAILABLELABELSRESPONSE._serialized_end=1042 - _GETSELECTIONSTRATEGYREQUEST._serialized_start=1044 - _GETSELECTIONSTRATEGYREQUEST._serialized_end=1094 - _SELECTIONSTRATEGYRESPONSE._serialized_start=1097 - _SELECTIONSTRATEGYRESPONSE._serialized_end=1227 - _USESWEIGHTSREQUEST._serialized_start=1229 - _USESWEIGHTSREQUEST._serialized_end=1270 - _USESWEIGHTSRESPONSE._serialized_start=1272 - _USESWEIGHTSRESPONSE._serialized_end=1315 - _SEEDSELECTORREQUEST._serialized_start=1317 - _SEEDSELECTORREQUEST._serialized_end=1352 - _SEEDSELECTORRESPONSE._serialized_start=1354 - _SEEDSELECTORRESPONSE._serialized_end=1393 - _SELECTOR._serialized_start=1396 - _SELECTOR._serialized_end=2397 + _STRATEGYCONFIG._serialized_start=67 + _STRATEGYCONFIG._serialized_end=223 + _MODELSTORAGESTRATEGYINFO._serialized_start=226 + _MODELSTORAGESTRATEGYINFO._serialized_end=484 + _DATAINFORMREQUEST._serialized_start=486 + _DATAINFORMREQUEST._serialized_end=576 + _TRIGGERRESPONSE._serialized_start=578 + _TRIGGERRESPONSE._serialized_end=615 + _REGISTERPIPELINEREQUEST._serialized_start=618 + _REGISTERPIPELINEREQUEST._serialized_end=864 + _PIPELINERESPONSE._serialized_start=866 + _PIPELINERESPONSE._serialized_end=905 + _GETSAMPLESREQUEST._serialized_start=907 + _GETSAMPLESREQUEST._serialized_end=1008 + _SAMPLESRESPONSE._serialized_start=1010 + _SAMPLESRESPONSE._serialized_end=1094 + _GETNUMBEROFSAMPLESREQUEST._serialized_start=1096 + _GETNUMBEROFSAMPLESREQUEST._serialized_end=1164 + _NUMBEROFSAMPLESRESPONSE._serialized_start=1166 + _NUMBEROFSAMPLESRESPONSE._serialized_end=1212 + _GETSTATUSBARSCALEREQUEST._serialized_start=1214 + _GETSTATUSBARSCALEREQUEST._serialized_end=1261 + _STATUSBARSCALERESPONSE._serialized_start=1263 + _STATUSBARSCALERESPONSE._serialized_end=1313 + _GETNUMBEROFPARTITIONSREQUEST._serialized_start=1315 + 
_GETNUMBEROFPARTITIONSREQUEST._serialized_end=1386 + _NUMBEROFPARTITIONSRESPONSE._serialized_start=1388 + _NUMBEROFPARTITIONSRESPONSE._serialized_end=1440 + _GETAVAILABLELABELSREQUEST._serialized_start=1442 + _GETAVAILABLELABELSREQUEST._serialized_end=1490 + _AVAILABLELABELSRESPONSE._serialized_start=1492 + _AVAILABLELABELSRESPONSE._serialized_end=1543 + _GETSELECTIONSTRATEGYREQUEST._serialized_start=1545 + _GETSELECTIONSTRATEGYREQUEST._serialized_end=1595 + _SELECTIONSTRATEGYRESPONSE._serialized_start=1598 + _SELECTIONSTRATEGYRESPONSE._serialized_end=1728 + _USESWEIGHTSREQUEST._serialized_start=1730 + _USESWEIGHTSREQUEST._serialized_end=1771 + _USESWEIGHTSRESPONSE._serialized_start=1773 + _USESWEIGHTSRESPONSE._serialized_end=1816 + _SEEDSELECTORREQUEST._serialized_start=1818 + _SEEDSELECTORREQUEST._serialized_end=1853 + _SEEDSELECTORRESPONSE._serialized_start=1855 + _SEEDSELECTORRESPONSE._serialized_end=1894 + _SELECTOR._serialized_start=1897 + _SELECTOR._serialized_end=2898 # @@protoc_insertion_point(module_scope) diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.pyi b/modyn/selector/internal/grpc/generated/selector_pb2.pyi index 3639fd9cc..8f3c358bb 100644 --- a/modyn/selector/internal/grpc/generated/selector_pb2.pyi +++ b/modyn/selector/internal/grpc/generated/selector_pb2.pyi @@ -8,6 +8,7 @@ import google.protobuf.descriptor import google.protobuf.internal.containers import google.protobuf.message import sys +import typing if sys.version_info >= (3, 8): import typing as typing_extensions @@ -41,6 +42,66 @@ class JsonString(google.protobuf.message.Message): global___JsonString = JsonString +@typing_extensions.final +class StrategyConfig(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + ZIP_FIELD_NUMBER: builtins.int + ZIP_ALGORITHM_FIELD_NUMBER: builtins.int + CONFIG_FIELD_NUMBER: builtins.int + name: builtins.str + zip: builtins.bool + zip_algorithm: builtins.str + 
@property + def config(self) -> global___JsonString: ... + def __init__( + self, + *, + name: builtins.str = ..., + zip: builtins.bool | None = ..., + zip_algorithm: builtins.str | None = ..., + config: global___JsonString | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_config", b"_config", "_zip", b"_zip", "_zip_algorithm", b"_zip_algorithm", "config", b"config", "zip", b"zip", "zip_algorithm", b"zip_algorithm"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_config", b"_config", "_zip", b"_zip", "_zip_algorithm", b"_zip_algorithm", "config", b"config", "name", b"name", "zip", b"zip", "zip_algorithm", b"zip_algorithm"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_config", b"_config"]) -> typing_extensions.Literal["config"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_zip", b"_zip"]) -> typing_extensions.Literal["zip"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_zip_algorithm", b"_zip_algorithm"]) -> typing_extensions.Literal["zip_algorithm"] | None: ... + +global___StrategyConfig = StrategyConfig + +@typing_extensions.final +class ModelStorageStrategyInfo(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + FULL_MODEL_STRATEGY_CONFIG_FIELD_NUMBER: builtins.int + INCREMENTAL_MODEL_STRATEGY_CONFIG_FIELD_NUMBER: builtins.int + FULL_MODEL_INTERVAL_FIELD_NUMBER: builtins.int + @property + def full_model_strategy_config(self) -> global___StrategyConfig: ... + @property + def incremental_model_strategy_config(self) -> global___StrategyConfig: ... 
+ full_model_interval: builtins.int + def __init__( + self, + *, + full_model_strategy_config: global___StrategyConfig | None = ..., + incremental_model_strategy_config: global___StrategyConfig | None = ..., + full_model_interval: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_full_model_interval", b"_full_model_interval", "_incremental_model_strategy_config", b"_incremental_model_strategy_config", "full_model_interval", b"full_model_interval", "full_model_strategy_config", b"full_model_strategy_config", "incremental_model_strategy_config", b"incremental_model_strategy_config"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_full_model_interval", b"_full_model_interval", "_incremental_model_strategy_config", b"_incremental_model_strategy_config", "full_model_interval", b"full_model_interval", "full_model_strategy_config", b"full_model_strategy_config", "incremental_model_strategy_config", b"incremental_model_strategy_config"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_full_model_interval", b"_full_model_interval"]) -> typing_extensions.Literal["full_model_interval"] | None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_incremental_model_strategy_config", b"_incremental_model_strategy_config"]) -> typing_extensions.Literal["incremental_model_strategy_config"] | None: ... 
+ +global___ModelStorageStrategyInfo = ModelStorageStrategyInfo + @typing_extensions.final class DataInformRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -91,12 +152,17 @@ class RegisterPipelineRequest(google.protobuf.message.Message): SELECTION_STRATEGY_FIELD_NUMBER: builtins.int MODEL_ID_FIELD_NUMBER: builtins.int MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int + AMP_FIELD_NUMBER: builtins.int + MODEL_STORAGE_STRATEGY_FIELD_NUMBER: builtins.int num_workers: builtins.int @property def selection_strategy(self) -> global___JsonString: ... model_id: builtins.str @property def model_configuration(self) -> global___JsonString: ... + amp: builtins.bool + @property + def model_storage_strategy(self) -> global___ModelStorageStrategyInfo: ... def __init__( self, *, @@ -104,9 +170,11 @@ class RegisterPipelineRequest(google.protobuf.message.Message): selection_strategy: global___JsonString | None = ..., model_id: builtins.str = ..., model_configuration: global___JsonString | None = ..., + amp: builtins.bool = ..., + model_storage_strategy: global___ModelStorageStrategyInfo | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "selection_strategy", b"selection_strategy"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "model_id", b"model_id", "num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "model_storage_strategy", b"model_storage_strategy", "selection_strategy", b"selection_strategy"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["amp", b"amp", "model_configuration", b"model_configuration", "model_id", b"model_id", "model_storage_strategy", b"model_storage_strategy", "num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... global___RegisterPipelineRequest = RegisterPipelineRequest diff --git a/modyn/selector/internal/grpc/selector_grpc_servicer.py b/modyn/selector/internal/grpc/selector_grpc_servicer.py index 64dd109c9..0e34c85d9 100644 --- a/modyn/selector/internal/grpc/selector_grpc_servicer.py +++ b/modyn/selector/internal/grpc/selector_grpc_servicer.py @@ -1,10 +1,11 @@ import json import logging -from typing import Iterable +from typing import Iterable, Optional import grpc # pylint: disable=no-name-in-module +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.grpc.generated.selector_pb2 import ( AvailableLabelsResponse, DataInformRequest, @@ -27,6 +28,7 @@ SeedSelectorResponse, SelectionStrategyResponse, StatusBarScaleResponse, + StrategyConfig, TriggerResponse, UsesWeightsRequest, UsesWeightsResponse, @@ -49,11 +51,50 @@ def __init__(self, selector_manager: SelectorManager, sample_batch_size: int): def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.ServicerContext) -> PipelineResponse: logger.info(f"Registering pipeline with request - {str(request)}") + + full_model_strategy = self.get_model_storage_strategy_config( + request.model_storage_strategy.full_model_strategy_config + ) + + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None + if ( + request.model_storage_strategy.HasField("incremental_model_strategy_config") + and request.model_storage_strategy.incremental_model_strategy_config is not None + ): + incremental_model_strategy = self.get_model_storage_strategy_config( + request.model_storage_strategy.incremental_model_strategy_config + ) + + full_model_interval: Optional[int] = None + if ( + 
request.model_storage_strategy.HasField("full_model_interval") + and request.model_storage_strategy.full_model_interval is not None + ): + full_model_interval = request.model_storage_strategy.full_model_interval + pipeline_id = self.selector_manager.register_pipeline( - request.num_workers, request.selection_strategy.value, request.model_id, request.model_configuration.value + request.num_workers, + request.selection_strategy.value, + request.model_id, + request.model_configuration.value, + request.amp, + full_model_strategy, + incremental_model_strategy, + full_model_interval, ) return PipelineResponse(pipeline_id=pipeline_id) + @staticmethod + def get_model_storage_strategy_config(strategy_config: StrategyConfig) -> ModelStorageStrategyConfig: + strategy = ModelStorageStrategyConfig(strategy_config.name) + if strategy_config.HasField("zip") and strategy_config.zip is not None: + strategy.zip = strategy_config.zip + if strategy_config.HasField("zip_algorithm") and strategy_config.zip is not None: + strategy.zip_algorithm = strategy_config.zip_algorithm + if strategy_config.HasField("config") and strategy_config.config is not None: + strategy.config = strategy_config.config.value + return strategy + def get_sample_keys_and_weights( # pylint: disable-next=unused-argument self, request: GetSamplesRequest, context: grpc.ServicerContext ) -> Iterable[SamplesResponse]: diff --git a/modyn/selector/internal/selector_manager.py b/modyn/selector/internal/selector_manager.py index 13762135c..d024cbeb2 100644 --- a/modyn/selector/internal/selector_manager.py +++ b/modyn/selector/internal/selector_manager.py @@ -6,8 +6,10 @@ import shutil from pathlib import Path from threading import Lock +from typing import Optional from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.selector_strategies.abstract_selection_strategy import 
AbstractSelectionStrategy from modyn.selector.selector import Selector from modyn.utils.utils import dynamic_module_import, is_directory_writable @@ -57,7 +59,17 @@ def _init_trigger_sample_directory(self) -> None: + f"Directory info: {os.stat(trigger_sample_directory)}" ) - def register_pipeline(self, num_workers: int, selection_strategy: str, model_id: str, model_config: str) -> int: + def register_pipeline( + self, + num_workers: int, + selection_strategy: str, + model_id: str, + model_config: str, + amp: bool, + full_model_strategy: ModelStorageStrategyConfig, + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None, + full_model_interval: Optional[int] = None, + ) -> int: """ Registers a new pipeline at the Selector. Returns: @@ -70,7 +82,15 @@ def register_pipeline(self, num_workers: int, selection_strategy: str, model_id: with self._next_pipeline_lock: with MetadataDatabaseConnection(self._modyn_config) as database: - pipeline_id = database.register_pipeline(num_workers, model_id, model_config) + pipeline_id = database.register_pipeline( + num_workers, + model_id, + model_config, + amp, + full_model_strategy, + incremental_model_strategy, + full_model_interval, + ) selection_strategy = self._instantiate_strategy(json.loads(selection_strategy), pipeline_id) selector = Selector(selection_strategy, pipeline_id, num_workers, self._selector_cache_size) diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index df49731d8..3dd1d4a7e 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -28,10 +28,12 @@ ) from modyn.selector.internal.grpc.generated.selector_pb2 import JsonString as SelectorJsonString from modyn.selector.internal.grpc.generated.selector_pb2 import ( + ModelStorageStrategyInfo, NumberOfSamplesResponse, RegisterPipelineRequest, SeedSelectorRequest, StatusBarScaleResponse, + StrategyConfig, TriggerResponse, ) from 
modyn.selector.internal.grpc.generated.selector_pb2_grpc import SelectorStub @@ -195,6 +197,11 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: else: model_config = "{}" + model_storage_config = pipeline_config["model_storage"] + incremental_model_strategy: Optional[StrategyConfig] = None + if "incremental_model_strategy" in model_storage_config: + incremental_model_strategy = self.get_model_strategy(model_storage_config["incremental_model_strategy"]) + pipeline_id = self.selector.register_pipeline( RegisterPipelineRequest( num_workers=pipeline_config["training"]["dataloader_workers"], @@ -203,12 +210,31 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: ), model_id=pipeline_config["model"]["id"], model_configuration=SelectorJsonString(value=model_config), + amp=pipeline_config["training"]["amp"] if "amp" in pipeline_config["training"] else False, + model_storage_strategy=ModelStorageStrategyInfo( + full_model_strategy_config=self.get_model_strategy(model_storage_config["full_model_strategy"]), + incremental_model_strategy_config=incremental_model_strategy, + full_model_interval=model_storage_config["full_model_interval"] + if "full_model_interval" in model_storage_config + else None, + ), ) ).pipeline_id logger.info(f"Registered pipeline {pipeline_config['pipeline']['name']} at selector with ID {pipeline_id}") return pipeline_id + @staticmethod + def get_model_strategy(strategy_config: dict) -> StrategyConfig: + return StrategyConfig( + name=strategy_config["name"], + zip=strategy_config["zip"] if "zip" in strategy_config else None, + zip_algorithm=strategy_config["zip_algorithm"] if "zip_algorithm" in strategy_config else None, + config=SelectorJsonString(value=json.dumps(strategy_config["config"])) + if "config" in strategy_config + else None, + ) + # pylint: disable-next=unused-argument def unregister_pipeline_at_selector(self, pipeline_id: int) -> None: # # TODO(#64,#124): Implement. 
@@ -318,8 +344,6 @@ def start_training( else: checkpoint_info = CheckpointInfo(checkpoint_interval=0, checkpoint_path="") - amp = pipeline_config["training"]["amp"] if "amp" in pipeline_config["training"] else False - if "grad_scaler_config" in pipeline_config["training"]: grad_scaler_config = pipeline_config["training"]["grad_scaler_config"] else: @@ -329,7 +353,6 @@ def start_training( "pipeline_id": pipeline_id, "trigger_id": trigger_id, "device": pipeline_config["training"]["device"], - "amp": amp, "use_pretrained_model": previous_model_id is not None, "pretrained_model_id": previous_model_id or -1, "load_optimizer_state": False, # TODO(#137): Think about this. @@ -467,23 +490,14 @@ def start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict if not self.connected_to_evaluator: raise ConnectionError("Tried to start evaluation at evaluator, but there is no gRPC connection.") - model_id = pipeline_config["model"]["id"] - if "config" in pipeline_config["model"]: - model_config = json.dumps(pipeline_config["model"]["config"]) - else: - model_config = "{}" - device = pipeline_config["evaluation"]["device"] - amp = pipeline_config["evaluation"]["amp"] if "amp" in pipeline_config["evaluation"] else False evaluations: dict[int, EvaluationStatusTracker] = {} for dataset in pipeline_config["evaluation"]["datasets"]: dataset_id = dataset["dataset_id"] - req = GRPCHandler._prepare_evaluation_request( - dataset, model_id, model_config, trained_model_id, device, amp - ) + req = GRPCHandler._prepare_evaluation_request(dataset, trained_model_id, device) response: EvaluateModelResponse = self.evaluator.evaluate_model(req) if not response.evaluation_started: @@ -496,9 +510,7 @@ def start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict return evaluations @staticmethod - def _prepare_evaluation_request( - dataset_config: dict, model_id: str, model_config: str, trained_model_id: int, device: str, amp: bool - ) -> EvaluateModelRequest: + 
def _prepare_evaluation_request(dataset_config: dict, trained_model_id: int, device: str) -> EvaluateModelRequest: dataset_id = dataset_config["dataset_id"] if "transformations" in dataset_config: @@ -539,19 +551,14 @@ def _prepare_evaluation_request( "trained_model_id": trained_model_id, "dataset_info": DatasetInfo(dataset_id=dataset_id, num_dataloaders=dataloader_workers), "device": device, - "amp": amp, "batch_size": batch_size, "metrics": metrics, - "model_id": model_id, - "model_configuration": EvaluatorJsonString(value=model_config), "transform_list": transform_list, "bytes_parser": EvaluatorPythonString(value=bytes_parser_function), "label_transformer": EvaluatorPythonString(value=label_transformer), } - cleaned_kwargs = {k: v for k, v in start_evaluation_kwargs.items() if v is not None} - - return EvaluateModelRequest(**cleaned_kwargs) + return EvaluateModelRequest(**start_evaluation_kwargs) def wait_for_evaluation_completion(self, training_id: int, evaluations: dict[int, EvaluationStatusTracker]) -> None: if not self.connected_to_evaluator: diff --git a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py index 53e8fe138..4472c6fe6 100644 --- a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py +++ b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py @@ -1,9 +1,9 @@ # pylint: disable=unused-argument, no-name-in-module, no-value-for-parameter import json import multiprocessing as mp +import os import pathlib import platform -import shutil import tempfile from time import sleep from unittest import mock @@ -23,18 +23,54 @@ ) from modyn.evaluator.internal.metrics import Accuracy, F1Score from modyn.evaluator.internal.utils import EvaluationInfo, EvaluationProcessInfo, EvaluatorMessages +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.utils import 
ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import FetchModelRequest, FetchModelResponse from modyn.storage.internal.grpc.generated.storage_pb2 import GetDatasetSizeRequest, GetDatasetSizeResponse +DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_evaluator.database" + def get_modyn_config(): return { "evaluator": {"hostname": "localhost", "port": "50000"}, "model_storage": {"hostname": "localhost", "port": "50051", "ftp_port": "5223"}, "storage": {"hostname": "storage", "port": "50052"}, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, } +def setup(): + if os.path.exists(DATABASE): + os.remove(DATABASE) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.create_tables() + + database.register_pipeline( + 1, + "ResNet18", + json.dumps({}), + True, + ModelStorageStrategyConfig(name="PyTorchFullModel"), + incremental_model_strategy=None, + full_model_interval=None, + ) + database.add_trained_model(1, 10, "trained_model.modyn") + database.add_trained_model(1, 11, "trained_model2.modyn") + + +def teardown(): + os.remove(DATABASE) + + class DummyModelWrapper: def __init__(self, model_configuration=None) -> None: self.model = None @@ -43,8 +79,8 @@ def __init__(self, model_configuration=None) -> None: class DummyModelStorageStub: # pylint: disable-next=invalid-name def FetchModel(self, request: FetchModelRequest) -> FetchModelResponse: - if request.model_id <= 10: - return FetchModelResponse(success=True, model_path="trained_model.modyn") + if request.model_id == 1: + return FetchModelResponse(success=True, model_path="trained_model.modyn", checksum=bytes(5)) return FetchModelResponse(success=False) @@ -86,12 +122,11 @@ def get_mock_evaluation_transformer(): ) -def get_evaluate_model_request(valid_model: bool): +def get_evaluate_model_request(): return EvaluateModelRequest( - 
trained_model_id=5, + trained_model_id=1, dataset_info=DatasetInfo(dataset_id="MNIST", num_dataloaders=1), device="cpu", - amp=False, batch_size=4, metrics=[ MetricConfiguration( @@ -100,19 +135,20 @@ def get_evaluate_model_request(valid_model: bool): evaluation_transformer=PythonString(value=""), ) ], - model_id="ResNet18" if valid_model else "unknown", - model_configuration=JsonString(value=json.dumps({})), transform_list=[], bytes_parser=PythonString(value=get_mock_bytes_parser()), label_transformer=PythonString(value=""), ) -def get_evaluation_info(evaluation_id, valid_model: bool, model_path: pathlib.Path, config: dict): +def get_evaluation_info(evaluation_id, model_path: pathlib.Path, config: dict): storage_address = f"{config['storage']['hostname']}:{config['storage']['port']}" return EvaluationInfo( - request=get_evaluate_model_request(valid_model), + request=get_evaluate_model_request(), evaluation_id=evaluation_id, + model_id="ResNet18", + amp=False, + model_config="{}", storage_address=storage_address, metrics=[Accuracy("", {}), F1Score("", {"num_classes": 2})], model_path=model_path, @@ -133,51 +169,68 @@ def test_init(test_connect_to_model_storage, test_connect_to_storage): @patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) @patch.object(EvaluatorGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) -def test_evaluate_model_invalid(test_connect_to_model_storage, test_connect_to_storage): +@patch("modyn.evaluator.internal.grpc.evaluator_grpc_servicer.hasattr", return_value=False) +def test_evaluate_model_invalid_model_id(test_has_attribute, test_connect_to_model_storage, test_connect_to_storage): with tempfile.TemporaryDirectory() as modyn_temp: evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) - response = evaluator.evaluate_model(get_evaluate_model_request(False), None) + response = evaluator.evaluate_model(get_evaluate_model_request(), None) assert not 
response.evaluation_started assert not evaluator._evaluation_dict assert evaluator._next_evaluation_id == 0 - req = get_evaluate_model_request(True) + +@patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) +@patch.object(EvaluatorGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) +def test_evaluate_model_invalid(test_connect_to_model_storage, test_connect_to_storage): + with tempfile.TemporaryDirectory() as modyn_temp: + evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) + req = get_evaluate_model_request() req.trained_model_id = 15 resp = evaluator.evaluate_model(req, None) assert not resp.evaluation_started - req = get_evaluate_model_request(True) + req = get_evaluate_model_request() req.dataset_info.dataset_id = "unknown" resp = evaluator.evaluate_model(req, None) assert not resp.evaluation_started assert evaluator._next_evaluation_id == 0 + req = get_evaluate_model_request() + req.trained_model_id = 2 + resp = evaluator.evaluate_model(req, None) + assert not resp.evaluation_started + -@patch("modyn.evaluator.internal.grpc.evaluator_grpc_servicer.download_file") +@patch( + "modyn.evaluator.internal.grpc.evaluator_grpc_servicer.download_trained_model", + return_value=pathlib.Path("downloaded_model.modyn"), +) @patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) @patch.object(EvaluatorGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) -def test_evaluate_model_valid(test_connect_to_model_storage, test_connect_to_storage, download_file_mock: MagicMock): +def test_evaluate_model_valid(test_connect_to_model_storage, test_connect_to_storage, download_model_mock: MagicMock): with tempfile.TemporaryDirectory() as modyn_temp: evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) - with open(pathlib.Path(modyn_temp) / "trained_model.modyn", "wb") as file: - file.write(b"Our trained 
model!") + mock_start = mock.Mock() with patch("multiprocessing.Process.start", mock_start): - resp: EvaluateModelResponse = evaluator.evaluate_model(get_evaluate_model_request(True), None) + resp: EvaluateModelResponse = evaluator.evaluate_model(get_evaluate_model_request(), None) assert 0 in evaluator._evaluation_process_dict assert evaluator._next_evaluation_id == 1 - download_file_mock.assert_called_once() - kwargs = download_file_mock.call_args.kwargs - remote_file_path = kwargs["remote_file_path"] - local_file_path = kwargs["local_file_path"] - - shutil.copyfile(pathlib.Path(modyn_temp) / remote_file_path, local_file_path) + download_model_mock.assert_called_once() + kwargs = download_model_mock.call_args.kwargs + remote_file_path = kwargs["remote_path"] + base_directory = kwargs["base_directory"] + identifier = kwargs["identifier"] - with open(evaluator._evaluation_dict[resp.evaluation_id].model_path, "rb") as file: - assert file.read().decode("utf-8") == "Our trained model!" + assert str(remote_file_path) == "trained_model.modyn" + assert base_directory == evaluator._base_dir + assert identifier == 0 + assert resp.evaluation_started + assert resp.evaluation_id == identifier + assert str(evaluator._evaluation_dict[resp.evaluation_id].model_path) == "downloaded_model.modyn" @patch.object(EvaluatorGRPCServicer, "connect_to_storage", return_value=DummyStorageStub()) @@ -396,7 +449,7 @@ def test_get_evaluation_result_missing_metric(test_is_alive, test_connect_to_mod evaluation_process_info = get_evaluation_process_info() evaluator._evaluation_process_dict[3] = evaluation_process_info config = get_modyn_config() - evaluator._evaluation_dict[3] = get_evaluation_info(3, True, pathlib.Path("trained.model"), config) + evaluator._evaluation_dict[3] = get_evaluation_info(3, pathlib.Path("trained.model"), config) response = evaluator.get_evaluation_result(EvaluationResultRequest(evaluation_id=3), None) assert response.valid assert len(response.evaluation_data) == 0 @@ 
-413,7 +466,7 @@ def test_get_evaluation_result( with tempfile.TemporaryDirectory() as temp: config = get_modyn_config() evaluator = EvaluatorGRPCServicer(config, pathlib.Path(temp)) - evaluator._evaluation_dict[1] = get_evaluation_info(1, True, pathlib.Path(temp) / "trained_model.modyn", config) + evaluator._evaluation_dict[1] = get_evaluation_info(1, pathlib.Path(temp) / "trained_model.modyn", config) assert len(evaluator._evaluation_dict[1].metrics) == 2 assert isinstance(evaluator._evaluation_dict[1].metrics[0], Accuracy) diff --git a/modyn/tests/evaluator/internal/test_pytorch_evaluator.py b/modyn/tests/evaluator/internal/test_pytorch_evaluator.py index b0b3b63da..a00c3a8c7 100644 --- a/modyn/tests/evaluator/internal/test_pytorch_evaluator.py +++ b/modyn/tests/evaluator/internal/test_pytorch_evaluator.py @@ -95,7 +95,6 @@ def get_evaluation_info( trained_model_id=1, dataset_info=DatasetInfo(dataset_id="MNIST", num_dataloaders=1), device="cpu", - amp=False, batch_size=4, metrics=[ MetricConfiguration( @@ -104,14 +103,12 @@ def get_evaluation_info( evaluation_transformer=PythonString(value=get_mock_accuracy_transformer()), ) ], - model_id="model", - model_configuration=JsonString(value=json.dumps({})), transform_list=[], bytes_parser=PythonString(value=get_mock_bytes_parser()), label_transformer=PythonString(value=get_mock_label_transformer() if label_transformer else ""), ) - return EvaluationInfo(request, evaluation_id, storage_address, metrics, trained_model_path) + return EvaluationInfo(request, evaluation_id, "model", "{}", False, storage_address, metrics, trained_model_path) @patch.object(StorageStub, "__init__", noop_constructor_mock) diff --git a/modyn/tests/metadata_database/models/test_pipelines.py b/modyn/tests/metadata_database/models/test_pipelines.py index d270aa682..ada36347e 100644 --- a/modyn/tests/metadata_database/models/test_pipelines.py +++ b/modyn/tests/metadata_database/models/test_pipelines.py @@ -21,7 +21,13 @@ def session(): def 
test_add_pipeline(session): - pipeline = Pipeline(num_workers=10, model_id="ResNet18", model_config=json.dumps({"num_classes": 10})) + pipeline = Pipeline( + num_workers=10, + model_id="ResNet18", + model_config=json.dumps({"num_classes": 10}), + amp=True, + full_model_strategy_name="PyTorchFullModel", + ) session.add(pipeline) session.commit() @@ -31,18 +37,27 @@ def test_add_pipeline(session): assert extracted_pipeline.num_workers == 10 assert extracted_pipeline.model_id == "ResNet18" assert json.loads(extracted_pipeline.model_config)["num_classes"] == 10 + assert extracted_pipeline.amp + assert extracted_pipeline.full_model_strategy_name == "PyTorchFullModel" + assert extracted_pipeline.full_model_strategy_zip is None + assert extracted_pipeline.inc_model_strategy_name is None + assert extracted_pipeline.full_model_strategy_config is None def test_update_pipeline(session): - pipeline = Pipeline(num_workers=10, model_id="ResNet18", model_config="{}") + pipeline = Pipeline( + num_workers=10, model_id="ResNet18", model_config="{}", amp=True, full_model_strategy_name="PyTorchFullModel" + ) session.add(pipeline) session.commit() pipeline.num_workers = 20 + pipeline.amp = False session.commit() assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() is not None assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 20 + assert not session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().amp pipeline.model_id = "test_model" session.commit() @@ -51,7 +66,9 @@ def test_update_pipeline(session): def test_delete_pipeline(session): - pipeline = Pipeline(num_workers=10, model_id="ResNet18", model_config="{}") + pipeline = Pipeline( + num_workers=10, model_id="ResNet18", model_config="{}", amp=False, full_model_strategy_name="PyTorchFullModel" + ) session.add(pipeline) session.commit() diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py 
b/modyn/tests/metadata_database/test_metadata_database_connection.py index 1cbae327a..1e62e9859 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -2,6 +2,7 @@ from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import TrainedModel, Trigger +from modyn.metadata_database.utils import ModelStorageStrategyConfig def get_minimal_modyn_config() -> dict: @@ -26,16 +27,22 @@ def test_database_connection(): def test_register_pipeline(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1, "ResNet18", "{}") + pipeline_id = database.register_pipeline( + 1, "ResNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) assert pipeline_id == 1 - pipeline_id = database.register_pipeline(1, "ResNet18", "{}") + pipeline_id = database.register_pipeline( + 1, "ResNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) assert pipeline_id == 2 def test_add_trained_model(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1, "ResNet18", "{}") + pipeline_id = database.register_pipeline( + 1, "ResNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) trigger = Trigger(pipeline_id=pipeline_id, trigger_id=5) database.session.add(trigger) @@ -46,21 +53,32 @@ def test_add_trained_model(): model_id = database.add_trained_model(pipeline_id, trigger_id, "test_path.modyn") - model: TrainedModel = database.session.get(TrainedModel, model_id) + model_parent: TrainedModel = database.session.get(TrainedModel, model_id) - assert model.model_id == 1 - assert model.model_path == "test_path.modyn" - assert model.pipeline_id == 1 and model.trigger_id == 5 + assert model_parent.model_id == 
1 + assert model_parent.model_path == "test_path.modyn" + assert model_parent.pipeline_id == 1 and model_parent.trigger_id == 5 + assert model_parent.parent_model is None + model_id = database.add_trained_model(pipeline_id, 6, "test_path.modyn", parent_model=model_parent.model_id) + model_child: TrainedModel = database.session.get(TrainedModel, model_id) -def test_get_model_configuration(): + assert model_child.parent_model == model_parent.model_id + assert len(model_parent.children) == 1 + assert model_parent.children[0] == model_child + + +def test_get_model_storage_strategy(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1, "ResNet18", json.dumps({"num_classes": 10})) + pipeline_id = database.register_pipeline( + 1, "ResNet18", json.dumps({"num_classes": 10}), True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) assert pipeline_id == 1 - model_id, model_config = database.get_model_configuration(pipeline_id) + model_id, model_config, amp = database.get_model_configuration(pipeline_id) assert model_id == "ResNet18" assert json.loads(model_config) == {"num_classes": 10} + assert amp diff --git a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py index 7f4b3e1ca..47a7b0276 100644 --- a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py +++ b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_server.py @@ -2,6 +2,7 @@ import pathlib from unittest.mock import patch +from modyn.model_storage.internal import ModelStorageManager from modyn.model_storage.internal.grpc.grpc_server import GRPCServer @@ -10,12 +11,16 @@ def get_modyn_config(): def test_init(): - grpc_server = GRPCServer(get_modyn_config(), pathlib.Path.cwd() / "temp_dir") + grpc_server = GRPCServer(get_modyn_config(), pathlib.Path.cwd() / "storage_dir", pathlib.Path.cwd() 
/ "ftp_dir") assert grpc_server.modyn_config == get_modyn_config() - assert str(grpc_server.storage_dir) == str(pathlib.Path.cwd() / "temp_dir") + assert str(grpc_server.storage_dir) == str(pathlib.Path.cwd() / "storage_dir") + assert str(grpc_server.ftp_directory) == str(pathlib.Path.cwd() / "ftp_dir") @patch("modyn.model_storage.internal.grpc.grpc_server.add_ModelStorageServicer_to_server", return_value=None) -def test_enter(mock_add_model_storage_servicer_to_server): - with GRPCServer(get_modyn_config(), pathlib.Path.cwd() / "temp_dir") as grpc_server: +@patch.object(ModelStorageManager, "__init__", return_value=None) +def test_enter(mock_init_model_storage_manager, mock_add_model_storage_servicer_to_server): + with GRPCServer( + get_modyn_config(), pathlib.Path.cwd() / "storage_dir", pathlib.Path.cwd() / "ftp_dir" + ) as grpc_server: assert grpc_server is not None diff --git a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py index 5fb1a1593..09637d71e 100644 --- a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py +++ b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py @@ -1,12 +1,11 @@ -import json -import os +# pylint: disable=unused-argument import pathlib import shutil import tempfile from unittest.mock import MagicMock, patch -from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.models import TrainedModel, Trigger +import torch +from modyn.model_storage.internal import ModelStorageManager # pylint: disable-next=no-name-in-module from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( @@ -18,52 +17,28 @@ RegisterModelResponse, ) from modyn.model_storage.internal.grpc.model_storage_grpc_servicer import ModelStorageGRPCServicer - -DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_model_storage.database" 
+from modyn.utils import calculate_checksum def get_modyn_config(): return { "model_storage": {"port": "50051", "ftp_port": "5223"}, "trainer_server": {"hostname": "localhost", "ftp_port": "5222"}, - "metadata_database": { - "drivername": "sqlite", - "username": "", - "password": "", - "host": "", - "port": 0, - "database": f"{DATABASE}", - }, } -def setup(): - if os.path.exists(DATABASE): - os.remove(DATABASE) - - with MetadataDatabaseConnection(get_modyn_config()) as database: - database.create_tables() - - pipeline_id = database.register_pipeline(1, "ResNet18", json.dumps({"num_classes": 10})) - trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) - - database.session.add(trigger) - database.session.commit() - - pipeline2 = database.register_pipeline(4, "ResNet18", json.dumps({"num_classes": 10})) - trigger2 = Trigger(trigger_id=50, pipeline_id=pipeline2) - - database.session.add(trigger2) - database.session.commit() - - -def teardown(): - os.remove(DATABASE) - - -@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.download_file") +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.download_file", return_value=True) @patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.current_time_millis", return_value=100) -def test_register_model(current_time_millis, download_file_mock: MagicMock): # pylint: disable=unused-argument +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "store_model", return_value=15) +@patch("os.remove") +def test_register_model( + os_remove_mock: MagicMock, + store_model_mock: MagicMock, + init_manager_mock, + current_time_millis, + download_file_mock: MagicMock, +): config = get_modyn_config() with tempfile.TemporaryDirectory() as storage_dir: storage_path = pathlib.Path(storage_dir) @@ -71,7 +46,7 @@ def test_register_model(current_time_millis, download_file_mock: MagicMock): # with open(storage_path / "test.txt", "wb") as file: 
file.write(b"Our test model") - servicer = ModelStorageGRPCServicer(config, storage_path) + servicer = ModelStorageGRPCServicer(config, storage_path, storage_path) assert servicer is not None req = RegisterModelRequest( @@ -80,75 +55,125 @@ def test_register_model(current_time_millis, download_file_mock: MagicMock): # hostname=config["trainer_server"]["hostname"], port=int(config["trainer_server"]["ftp_port"]), model_path="test.txt", + checksum=calculate_checksum(storage_path / "test.txt"), ) resp: RegisterModelResponse = servicer.RegisterModel(req, None) download_file_mock.assert_called_once() kwargs = download_file_mock.call_args.kwargs + remote_file_path = kwargs["remote_file_path"] local_file_path = kwargs["local_file_path"] shutil.copyfile(storage_path / remote_file_path, local_file_path) assert resp.success + assert resp.model_id == 15 - # download file under path {current_time_millis}_{pipeline_id}_{trigger_id}.modyn - with open(storage_path / f"100_{resp.model_id}_10.modyn", "rb") as file: + # download file under path {current_time_millis}_{pipeline_id}_{trigger_id}.zip + with open(storage_path / "100_1_10.modyn", "rb") as file: assert file.read().decode("utf-8") == "Our test model" + assert calculate_checksum(storage_path / "100_1_10.modyn") == kwargs["checksum"] + os_remove_mock.assert_called_with(storage_path / "100_1_10.modyn") + -def test_fetch_model(): +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.download_file", return_value=False) +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.current_time_millis", return_value=100) +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "store_model") +def test_register_model_invalid( + store_model_mock: MagicMock, init_manager_mock, current_time_millis, download_file_mock: MagicMock +): + config = get_modyn_config() + storage_path = pathlib.Path("storage_dir") + servicer = ModelStorageGRPCServicer(config, storage_path, 
storage_path) + + assert servicer is not None + req = RegisterModelRequest( + pipeline_id=1, + trigger_id=10, + hostname=config["trainer_server"]["hostname"], + port=int(config["trainer_server"]["ftp_port"]), + model_path="test.txt", + checksum=bytes([7, 1, 0]), + ) + + resp: RegisterModelResponse = servicer.RegisterModel(req, None) + download_file_mock.assert_called_once() + + assert not resp.success + store_model_mock.assert_not_called() + + +@patch("modyn.model_storage.internal.grpc.model_storage_grpc_servicer.current_time_millis", return_value=100) +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "load_model", return_value={"model": {"conv_1": 1}, "metadata": True}) +def test_fetch_model(load_model_mock: MagicMock, init_manager_mock, current_time_millis): config = get_modyn_config() with tempfile.TemporaryDirectory() as storage_dir: storage_path = pathlib.Path(storage_dir) - servicer = ModelStorageGRPCServicer(config, storage_path) + servicer = ModelStorageGRPCServicer(config, storage_path, storage_path) assert servicer is not None - with MetadataDatabaseConnection(config) as database: - model_id = database.add_trained_model(2, 50, "test_model.modyn") - - req = FetchModelRequest(model_id=model_id) + req = FetchModelRequest(model_id=10, load_metadata=True) resp: FetchModelResponse = servicer.FetchModel(req, None) assert resp.success - assert resp.model_path == "test_model.modyn" + load_model_mock.assert_called_once_with(10, True) - req_invalid = FetchModelRequest(model_id=142) - resp_invalid: FetchModelResponse = servicer.FetchModel(req_invalid, None) + # store final model to {current_time_millis()}_{model_id}.zip + assert resp.model_path == "100_10.modyn" - assert not resp_invalid.success + assert torch.load(storage_path / resp.model_path) == {"model": {"conv_1": 1}, "metadata": True} -def test_delete_model(): +@patch.object(ModelStorageManager, "__init__", return_value=None) 
+@patch.object(ModelStorageManager, "load_model", return_value=None) +def test_fetch_model_invalid(load_model_mock: MagicMock, init_manager_mock): config = get_modyn_config() with tempfile.TemporaryDirectory() as storage_dir: storage_path = pathlib.Path(storage_dir) - servicer = ModelStorageGRPCServicer(config, storage_path) + servicer = ModelStorageGRPCServicer(config, storage_path, storage_dir) assert servicer is not None - with open(storage_path / "model_to_be_deleted.modyn", "wb") as file: - file.write(b"model that will be deleted") + req = FetchModelRequest(model_id=101, load_metadata=False) + resp: FetchModelResponse = servicer.FetchModel(req, None) + + assert not resp.success - assert os.path.isfile(storage_path / "model_to_be_deleted.modyn") + req = FetchModelRequest(model_id=101, load_metadata=True) + resp: FetchModelResponse = servicer.FetchModel(req, None) - with MetadataDatabaseConnection(config) as database: - model_id = database.add_trained_model(2, 50, "model_to_be_deleted.modyn") + assert not resp.success - req = DeleteModelRequest(model_id=model_id) - resp: DeleteModelResponse = servicer.DeleteModel(req, None) - assert resp.success - assert not os.path.isfile(storage_path / "model_to_be_deleted.modyn") +@patch.object(ModelStorageManager, "__init__", return_value=None) +@patch.object(ModelStorageManager, "delete_model", return_value=True) +def test_delete_model(delete_model_mock: MagicMock, init_manager_mock): + config = get_modyn_config() + servicer = ModelStorageGRPCServicer(config, pathlib.Path("storage_dir"), pathlib.Path("ftp_dir")) + assert servicer is not None - req_invalid = DeleteModelRequest(model_id=model_id) - resp_invalid: DeleteModelResponse = servicer.DeleteModel(req_invalid, None) + req = DeleteModelRequest(model_id=20) + resp: DeleteModelResponse = servicer.DeleteModel(req, None) - assert not resp_invalid.success + assert resp.success + delete_model_mock.assert_called_once_with(20) + + +@patch.object(ModelStorageManager, "__init__", 
return_value=None) +@patch.object(ModelStorageManager, "delete_model", return_value=False) +def test_delete_model_invalid(delete_model_mock: MagicMock, init_manager_mock): + config = get_modyn_config() + servicer = ModelStorageGRPCServicer(config, pathlib.Path("storage_dir"), pathlib.Path("ftp_dir")) + assert servicer is not None - with MetadataDatabaseConnection(config) as database: - model_id = database.session.get(TrainedModel, model_id) + req = DeleteModelRequest(model_id=50) + resp: DeleteModelResponse = servicer.DeleteModel(req, None) - assert not model_id + assert not resp.success + delete_model_mock.assert_called_once_with(50) diff --git a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py new file mode 100644 index 000000000..4bc58d961 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py @@ -0,0 +1,36 @@ +import io + +import torch +from modyn.model_storage.internal.storage_strategies import AbstractDifferenceOperator +from modyn.model_storage.internal.storage_strategies.difference_operators import SubDifferenceOperator + + +def test_inheritance(): + assert issubclass(SubDifferenceOperator.__class__, AbstractDifferenceOperator.__class__) + + +def test_calculate_difference(): + ones = torch.ones(1, dtype=torch.int32) + + difference_operator = SubDifferenceOperator() + assert difference_operator.calculate_difference(ones, ones) == b"\x00\x00\x00\x00" + + twos = ones * 2 + assert difference_operator.calculate_difference(twos, ones) == b"\x01\x00\x00\x00" + + +def test_calculate_restore(): + difference_operator = SubDifferenceOperator() + + ones = torch.ones(1, dtype=torch.int32) + buf = io.BytesIO() + buf.write(b"\x00\x00\x00\x00") + buf.seek(0) + + assert difference_operator.restore(ones, buf).item() == 1 + + buf.seek(0) + 
buf.write(b"\x01\x00\x00\x00") + buf.seek(0) + + assert difference_operator.restore(ones, buf).item() == 2 diff --git a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py new file mode 100644 index 000000000..b949e72b1 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py @@ -0,0 +1,36 @@ +import io + +import torch +from modyn.model_storage.internal.storage_strategies import AbstractDifferenceOperator +from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator + + +def test_inheritance(): + assert issubclass(XorDifferenceOperator.__class__, AbstractDifferenceOperator.__class__) + + +def test_calculate_difference(): + ones = torch.ones(1, dtype=torch.int32) + + difference_operator = XorDifferenceOperator() + assert difference_operator.calculate_difference(ones, ones) == b"\x00\x00\x00\x00" + + twos = ones * 2 + assert difference_operator.calculate_difference(twos, ones) == b"\x03\x00\x00\x00" + + +def test_calculate_restore(): + difference_operator = XorDifferenceOperator() + + ones = torch.ones(1, dtype=torch.int32) + buf = io.BytesIO() + buf.write(b"\x00\x00\x00\x00") + buf.seek(0) + + assert difference_operator.restore(ones, buf).item() == 1 + + buf.seek(0) + buf.write(b"\x03\x00\x00\x00") + buf.seek(0) + + assert difference_operator.restore(ones, buf).item() == 2 diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py new file mode 100644 index 000000000..44acc8fad --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py @@ -0,0 +1,41 @@ +import 
pathlib +import tempfile + +import torch +from modyn.model_storage.internal.storage_strategies.full_model_strategies import CompressedFullModel + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + def forward(self, data): + return data + + +def test_save_model(): + model = MockModel() + full_model_strategy = CompressedFullModel(zip_activated=False, zip_algorithm_name="", config={}) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + full_model_strategy.save_model(model.state_dict(), temp_file_path) + + with open(temp_file_path, "rb") as stored_model_file: + assert stored_model_file.read() == b"\x00\x00\x80\x3f\x00\x00\x80\x3f" + + +def test_load_model(): + model = MockModel() + full_model_strategy = CompressedFullModel(zip_activated=False, zip_algorithm_name="", config={}) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + with open(temp_file_path, "wb") as stored_model_file: + assert stored_model_file.write(b"\x00\x00\x00\x3f\x00\x00\x00\x3f") + + state_dict = model.state_dict() + full_model_strategy.load_model(state_dict, temp_file_path) + + assert state_dict["_weight"][0] == 0.5 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py new file mode 100644 index 000000000..e7cd0ba06 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py @@ -0,0 +1,76 @@ +import pathlib +import tempfile +from zipfile import ZIP_DEFLATED + +import torch +from modyn.model_storage.internal.storage_strategies.full_model_strategies import PyTorchFullModel +from modyn.utils import 
unzip_file, zip_file + + +def test_save_model(): + full_model_strategy = PyTorchFullModel(zip_activated=False, zip_algorithm_name="", config={}) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + full_model_strategy.save_model({"conv_1": True}, temp_file_path) + + loaded_state = torch.load(temp_file_path) + + assert loaded_state["conv_1"] + + +def test_save_model_zipped(): + full_model_strategy = PyTorchFullModel(zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={}) + with tempfile.TemporaryDirectory() as temp_directory: + directory_path = pathlib.Path(temp_directory) + + zipped_file_path = directory_path / "zipped.model" + full_model_strategy.save_model({"conv_1": True}, zipped_file_path) + + unzipped_file_path = pathlib.Path(directory_path / "unzipped.model") + unzip_file(zipped_file_path, unzipped_file_path, compression=ZIP_DEFLATED) + + loaded_state = torch.load(unzipped_file_path) + assert loaded_state["conv_1"] + + +def test_load_model(): + full_model_strategy = PyTorchFullModel(zip_activated=False, zip_algorithm_name="", config={}) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + torch.save({"conv_1": True}, temp_file_path) + + state_dict = {"conv_1": False} + full_model_strategy.load_model(state_dict, temp_file_path) + + assert state_dict["conv_1"] + + +def test_load_model_zipped(): + full_model_strategy = PyTorchFullModel(zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={}) + with tempfile.TemporaryDirectory() as temp_directory: + directory_path = pathlib.Path(temp_directory) + + model_path = directory_path / "basic.model" + torch.save({"conv_1": True}, model_path) + zipped_model_path = directory_path / "zipped.model" + zip_file(model_path, zipped_model_path, compression=ZIP_DEFLATED) + + state_dict = {"conv_1": False} + full_model_strategy.load_model(state_dict, zipped_model_path) + + assert 
state_dict["conv_1"] + + +def test_store_then_load(): + full_model_strategy = PyTorchFullModel(zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={}) + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + model_state = {"conv_1": True} + full_model_strategy.save_model(model_state, temp_file_path) + loaded_state = {"conv_1": False} + full_model_strategy.load_model(loaded_state, temp_file_path) + + assert loaded_state["conv_1"] diff --git a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py new file mode 100644 index 000000000..192dd09c1 --- /dev/null +++ b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py @@ -0,0 +1,87 @@ +import pathlib +import tempfile +from zipfile import ZIP_LZMA + +import torch +from modyn.model_storage.internal.storage_strategies.difference_operators import ( + SubDifferenceOperator, + XorDifferenceOperator, +) +from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import WeightsDifference + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.zeros(2, dtype=torch.float32)) + + def forward(self, data): + return data + + +def get_mock_model_after() -> MockModel: + model_after = MockModel() + model_after._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + return model_after + + +def test_init(): + incremental_strategy = WeightsDifference(zip_activated=False, zip_algorithm_name="", config={}) + + assert isinstance(incremental_strategy.difference_operator, SubDifferenceOperator.__class__) + assert not incremental_strategy.split_exponent + + incremental_strategy = WeightsDifference( + zip_activated=False, zip_algorithm_name="", 
config={"operator": "xor", "split_exponent": True} + ) + + assert not incremental_strategy.zip + assert isinstance(incremental_strategy.difference_operator, XorDifferenceOperator.__class__) + assert incremental_strategy.split_exponent + + incremental_strategy = WeightsDifference( + zip_activated=True, zip_algorithm_name="ZIP_LZMA", config={"operator": "sub", "split_exponent": False} + ) + + assert incremental_strategy.zip + assert incremental_strategy.zip_algorithm == ZIP_LZMA + assert isinstance(incremental_strategy.difference_operator, SubDifferenceOperator.__class__) + assert not incremental_strategy.split_exponent + + +def test_save_model(): + model_before = MockModel() + model_after = get_mock_model_after() + + for operator in ["xor", "sub"]: + incremental_strategy = WeightsDifference( + zip_activated=False, zip_algorithm_name="", config={"operator": operator} + ) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + incremental_strategy.save_model(model_after.state_dict(), model_before.state_dict(), temp_file_path) + + with open(temp_file_path, "rb") as stored_model_file: + assert stored_model_file.read() == b"\x00\x00\x80\x3f\x00\x00\x80\x3f" + + +def test_load_model(): + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + with open(temp_file_path, "wb") as stored_model_file: + stored_model_file.write(b"\x00\x00\x80\x3f\x00\x00\x80\x3f") + + for operator in ["xor", "sub"]: + model = MockModel() + model_state = model.state_dict() + incremental_strategy = WeightsDifference( + zip_activated=False, zip_algorithm_name="", config={"operator": operator} + ) + + incremental_strategy.load_model(model_state, temp_file_path) + + assert model_state["_weight"][0] == 1 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/test_model_storage_manager.py b/modyn/tests/model_storage/internal/test_model_storage_manager.py 
new file mode 100644 index 000000000..f5d01de2a --- /dev/null +++ b/modyn/tests/model_storage/internal/test_model_storage_manager.py @@ -0,0 +1,371 @@ +# pylint: disable=unused-argument +import json +import os +import pathlib +import tempfile +from unittest.mock import MagicMock, patch +from zipfile import ZIP_DEFLATED + +import torch +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models import TrainedModel +from modyn.metadata_database.utils import ModelStorageStrategyConfig +from modyn.model_storage.internal import ModelStorageManager +from modyn.model_storage.internal.storage_strategies.full_model_strategies import PyTorchFullModel +from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import WeightsDifference +from modyn.models import ResNet18 +from modyn.utils import unzip_file, zip_file + +DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_model_storage.database" + + +def get_modyn_config(): + return { + "model_storage": {"port": "50051", "ftp_port": "5223"}, + "trainer_server": {"hostname": "localhost", "ftp_port": "5222"}, + "metadata_database": { + "drivername": "sqlite", + "username": "", + "password": "", + "host": "", + "port": 0, + "database": f"{DATABASE}", + }, + } + + +def setup(): + if os.path.exists(DATABASE): + os.remove(DATABASE) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.create_tables() + + full_model_strategy = ModelStorageStrategyConfig(name="PyTorchFullModel") + inc_model_strategy = ModelStorageStrategyConfig(name="WeightsDifference") + inc_model_strategy.zip = False + inc_model_strategy.config = json.dumps({"operator": "sub"}) + database.register_pipeline( + 1, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy, inc_model_strategy, 5 + ) + + +def teardown(): + os.remove(DATABASE) + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() 
+ self._weight = torch.nn.Parameter(torch.ones(1, dtype=torch.float32)) + + def forward(self, data): + return data + + +def get_mock_model_after() -> MockModel: + model_after = MockModel() + model_after._weight = torch.nn.Parameter(torch.ones(1, dtype=torch.float32) * 3) + + return model_after + + +def test_init(): + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + + assert manager._modyn_config == get_modyn_config() + assert manager._storage_dir == pathlib.Path("storage") + + +def test__get_previous_model(): + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.add_trained_model(10, 2, "model.modyn") + + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + previous_model = manager._get_previous_model(10, 3) + assert previous_model and previous_model.trigger_id == 2 + assert manager._get_previous_model(10, 2) is None + + +def test__get_base_model_state(): + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + model_state = manager._get_base_model_state(1) + + assert len(model_state) == 122 + + +def test__reconstruct_model(): + mock_model = MockModel() + model_state = mock_model.state_dict() + full_model_strategy = PyTorchFullModel(zip_activated=False, zip_algorithm_name="", config={}) + incremental_model_strategy = WeightsDifference(zip_activated=False, zip_algorithm_name="", config={}) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + + prev_model_file_name = "before.model" + full_model_strategy.save_model(model_state, temp_directory_path / prev_model_file_name) + + difference_model_file_name = "difference.model" + incremental_model_strategy.save_model( + get_mock_model_after().state_dict(), model_state, temp_directory_path / difference_model_file_name + ) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + prev_model_id = 
database.add_trained_model(15, 3, prev_model_file_name) + curr_model_id = database.add_trained_model(15, 4, difference_model_file_name, parent_model=prev_model_id) + + manager._reconstruct_model(curr_model_id, model_state, manager.get_model_storage_strategy(1)) + + assert model_state["_weight"].item() == 3 # pylint: disable=unsubscriptable-object + + +def test__handle_new_model_full(): + with MetadataDatabaseConnection(get_modyn_config()) as database: + database.add_trained_model(1, 4, "model.modyn") + + mock_model = MockModel() + model_state = mock_model.state_dict() + + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + parent_id = manager._handle_new_model(1, 5, model_state, temp_file_path, manager.get_model_storage_strategy(1)) + assert parent_id is None + + loaded_state = torch.load(temp_file_path) + assert loaded_state["_weight"].item() == 1 + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +@patch.object(ModelStorageManager, "_reconstruct_model") +@patch.object(ModelStorageManager, "_get_previous_model", return_value=TrainedModel(model_id=101)) +def test__handle_new_model_incremental( + previous_model_mock, reconstruct_model_mock: MagicMock, base_model_state_mock: MagicMock +): + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + parent_id = manager._handle_new_model( + 5, 4, get_mock_model_after().state_dict(), temp_file_path, manager.get_model_storage_strategy(1) + ) + + assert parent_id == 101 + + with open(temp_file_path, "rb") as model_file: + assert model_file.read() == b"\x00\x00\x00\x40" + + base_model_state_mock.assert_called_once_with(5) + previous_model_mock.assert_called_once_with(5, 4) + + +def 
test_get_model_storage_strategy(): + with MetadataDatabaseConnection(get_modyn_config()) as database: + simple_pipeline = database.register_pipeline( + 74, + "ResNet18", + json.dumps({"num_classes": 10}), + True, + ModelStorageStrategyConfig(name="PyTorchFullModel"), + None, + None, + ) + + full_model_strategy = ModelStorageStrategyConfig(name="PyTorchFullModel") + full_model_strategy.zip = True + full_model_strategy.zip_algorithm = "ZIP_DEFLATED" + inc_model_strategy = ModelStorageStrategyConfig(name="WeightsDifference") + inc_model_strategy.zip = False + inc_model_strategy.config = json.dumps({"operator": "sub"}) + complex_pipeline = database.register_pipeline( + 75, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy, inc_model_strategy, 10 + ) + + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + + strategy = manager.get_model_storage_strategy(simple_pipeline) + assert strategy.incremental_model_strategy is None + assert strategy.full_model_interval is None + assert not strategy.full_model_strategy.zip + + complex_strategy = manager.get_model_storage_strategy(complex_pipeline) + assert complex_strategy.full_model_strategy.zip + assert complex_strategy.full_model_strategy.zip_algorithm == ZIP_DEFLATED + assert complex_strategy.incremental_model_strategy + assert not complex_strategy.incremental_model_strategy.zip + + +@patch("modyn.model_storage.internal.model_storage_manager.current_time_millis", return_value=100) +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_store_model(base_model_mock, current_time_mock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + parent_id = database.add_trained_model(1, 128, "before.model") + + model_storage_strategy = 
manager.get_model_storage_strategy(1) + model_storage_strategy.full_model_strategy.save_model( + MockModel().state_dict(), temp_directory_path / "before.model" + ) + + torch.save( + {"model": get_mock_model_after().state_dict(), "metadata": True}, temp_directory_path / "model.modyn" + ) + + model_id = manager.store_model(1, 129, temp_directory_path / "model.modyn") + + with MetadataDatabaseConnection(get_modyn_config()) as database: + model: TrainedModel = database.session.get(TrainedModel, model_id) + + assert model.pipeline_id == 1 + assert model.trigger_id == 129 + assert model.model_path == "100_1_129.model" + assert model.parent_model == parent_id + assert model.metadata_path == "100_1_129.metadata.zip" + + with open(temp_directory_path / model.model_path, "rb") as model_file: + assert model_file.read() == b"\x00\x00\x00\x40" + + unzip_file(temp_directory_path / model.metadata_path, temp_directory_path / "unzipped.metadata") + assert torch.load(temp_directory_path / "unzipped.metadata")["metadata"] + + loaded_model = manager.load_model(model_id, True) + + assert loaded_model["model"]["_weight"].item() == 3 + assert loaded_model["metadata"] + + +def test_store_model_resnet(): + full_model_strategy = ModelStorageStrategyConfig(name="CompressedFullModel") + full_model_strategy.zip = True + + with MetadataDatabaseConnection(get_modyn_config()) as database: + pipeline_id = database.register_pipeline( + 1, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy + ) + + resnet = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + + torch.save({"model": resnet.model.state_dict(), "metadata": True}, temp_directory_path / "model.modyn") + + model_id = manager.store_model(pipeline_id, 1, temp_directory_path / "model.modyn") + loaded_state = 
manager.load_model(model_id, True) + assert loaded_state["metadata"] + + original_state = resnet.model.state_dict() + for layer_name, _ in loaded_state["model"].items(): + assert torch.all(torch.eq(loaded_state["model"][layer_name], original_state[layer_name])) + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_load_model(base_model_mock: MagicMock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + + model_file_name = "mock.model" + with MetadataDatabaseConnection(get_modyn_config()) as database: + model_id = database.add_trained_model(1, 32, model_file_name) + + model_storage_strategy = manager.get_model_storage_strategy(1) + model_storage_strategy.full_model_strategy.save_model( + get_mock_model_after().state_dict(), temp_directory_path / model_file_name + ) + + reconstructed_state = manager.load_model(model_id, False) + + assert reconstructed_state["model"]["_weight"].item() == 3 + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_load_model_metadata(base_model_mock: MagicMock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + + model_file_name = "mock.model" + with MetadataDatabaseConnection(get_modyn_config()) as database: + model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata.zip") + + model_storage_strategy = manager.get_model_storage_strategy(1) + model_storage_strategy.full_model_strategy.save_model( + get_mock_model_after().state_dict(), temp_directory_path / model_file_name + ) + torch.save({"metadata": True}, temp_directory_path / "mock.metadata") + zip_file(temp_directory_path / "mock.metadata", temp_directory_path / "mock.metadata.zip") + + reconstructed_state = 
manager.load_model(model_id, True) + + assert reconstructed_state["model"]["_weight"].item() == 3 + assert reconstructed_state["metadata"] + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_load_model_invalid(base_model_mock: MagicMock): + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + + assert manager.load_model(133, False) is None + + model_file_name = "mock.model" + with MetadataDatabaseConnection(get_modyn_config()) as database: + model_id = database.add_trained_model(1, 23, model_file_name) + + model_storage_strategy = manager.get_model_storage_strategy(1) + model_storage_strategy.full_model_strategy.save_model( + get_mock_model_after().state_dict(), temp_directory_path / model_file_name + ) + + assert manager.load_model(model_id, True) is None + + +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test_delete_model(base_model_mock: MagicMock): + mock_model = MockModel() + model_state = mock_model.state_dict() + model_state_after = get_mock_model_after().state_dict() + + with tempfile.TemporaryDirectory() as temp_dir: + temp_directory_path = pathlib.Path(temp_dir) + + with MetadataDatabaseConnection(get_modyn_config()) as database: + parent_id = database.add_trained_model(1, 52, "parent.modyn") + first_child_id = database.add_trained_model(1, 53, "child1.modyn", parent_model=parent_id) + second_child_id = database.add_trained_model(1, 54, "child2.modyn", parent_model=parent_id) + + manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + model_storage_strategy = manager.get_model_storage_strategy(1) + model_storage_strategy.full_model_strategy.save_model(model_state, temp_directory_path / "parent.modyn") + model_storage_strategy.incremental_model_strategy.save_model( + model_state_after, model_state, 
temp_directory_path / "child1.modyn" + ) + model_storage_strategy.incremental_model_strategy.save_model( + model_state_after, model_state, temp_directory_path / "child2.modyn" + ) + + success = manager.delete_model(parent_id) + + assert success + assert not (temp_directory_path / "parent.modyn").exists() + + with MetadataDatabaseConnection(get_modyn_config()) as database: + first_child: TrainedModel = database.session.get(TrainedModel, first_child_id) + second_child: TrainedModel = database.session.get(TrainedModel, second_child_id) + + assert first_child.parent_model is None + assert second_child.parent_model is None + + assert manager.load_model(first_child_id, False)["model"]["_weight"] == 3 + assert manager.load_model(second_child_id, False)["model"]["_weight"] == 3 + assert not manager.delete_model(-1) diff --git a/modyn/tests/model_storage/internal/utils/test_data_types.py b/modyn/tests/model_storage/internal/utils/test_data_types.py new file mode 100644 index 000000000..6e05fe035 --- /dev/null +++ b/modyn/tests/model_storage/internal/utils/test_data_types.py @@ -0,0 +1,24 @@ +import io + +import numpy as np +import torch +from modyn.model_storage.internal.utils import create_tensor, read_tensor_from_bytes + + +def test_read_tensor_from_bytes(): + buf = io.BytesIO() + buf.write(b"\x01\x00\x00\x00") + buf.write(b"\x02\x00\x00\x00") + buf.write(b"\x03\x00\x00\x00") + buf.write(b"\x04\x00\x00\x00") + buf.seek(0) + res = read_tensor_from_bytes(torch.ones((2, 2), dtype=torch.int32), buf) + + assert res[0, 0] == 1 and res[0, 1] == 2 and res[1, 0] == 3 and res[1, 1] == 4 + + +def test_create_tensor(): + byte_num = bytes(b"\x04\x00\x00\x00") + tensor = create_tensor(byte_num, dtype=np.dtype(np.int32), shape=torch.Size([1])) + + assert tensor.item() == 4 diff --git a/modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py b/modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py new file mode 100644 index 000000000..1a9c4ed45 --- 
/dev/null +++ b/modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py @@ -0,0 +1,56 @@ +import json +from zipfile import ZIP_DEFLATED, ZIP_LZMA + +import pytest +from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator +from modyn.model_storage.internal.utils import ModelStorageStrategy + + +def test_basic_model_storage_strategy(): + model_storage_strategy = ModelStorageStrategy("PyTorchFullModel", None, None, None) + + assert model_storage_strategy.incremental_model_strategy is None + assert model_storage_strategy.full_model_interval is None + assert not model_storage_strategy.full_model_strategy.zip + + +def test_extended_model_storage_strategy(): + model_storage_strategy = ModelStorageStrategy( + full_model_strategy_name="PyTorchFullModel", + full_model_strategy_zip=True, + full_model_strategy_zip_algorithm="ZIP_LZMA", + full_model_strategy_config=None, + ) + model_storage_strategy.register_incremental_model_strategy( + name="WeightsDifference", + zip_enabled=True, + zip_algorithm=None, + config=json.dumps({"operator": "xor", "split_exponent": True}), + full_model_interval=10, + ) + + assert model_storage_strategy.full_model_strategy.zip + assert model_storage_strategy.full_model_strategy.zip_algorithm == ZIP_LZMA + + weights_diff_strategy = model_storage_strategy.incremental_model_strategy + assert weights_diff_strategy.zip + assert weights_diff_strategy.zip_algorithm == ZIP_DEFLATED + assert getattr(weights_diff_strategy, "split_exponent") + assert isinstance(getattr(weights_diff_strategy, "difference_operator"), XorDifferenceOperator.__class__) + + assert model_storage_strategy.full_model_interval == 10 + + +def test_model_storage_strategy_invalid(): + strategy = ModelStorageStrategy( + full_model_strategy_name="PyTorchFullModel", + full_model_strategy_zip=None, + full_model_strategy_zip_algorithm=None, + full_model_strategy_config=None, + ) + + with pytest.raises(ValueError): + 
strategy.register_incremental_model_strategy("WeightsDifference", None, None, None, 0) + + with pytest.raises(NotImplementedError): + strategy.register_incremental_model_strategy("UnknownStrategy", None, None, None, None) diff --git a/modyn/tests/model_storage/test_model_storage.py b/modyn/tests/model_storage/test_model_storage.py index 1bad9fedd..9f5b1cf12 100644 --- a/modyn/tests/model_storage/test_model_storage.py +++ b/modyn/tests/model_storage/test_model_storage.py @@ -1,26 +1,24 @@ -import os import pathlib +import tempfile from unittest.mock import patch -import pytest from modyn.model_storage import ModelStorage from modyn.model_storage.internal.grpc.grpc_server import GRPCServer -modyn_config = ( - pathlib.Path(os.path.abspath(__file__)).parent.parent.parent / "config" / "examples" / "modyn_config.yaml" -) - -def get_invalid_modyn_config() -> dict: - return {"invalid": "not_valid"} +def get_modyn_config(): + return {"model_storage": {"port": "5001", "ftp_port": "5002"}} # pylint: disable=unused-argument -def noop_setup_directory(self): +def noop_setup_directories(self): pass class MockFTPServer: + def __init__(self, ftp_port, ftp_directory): # pylint: disable=unused-argument + pass + def __enter__(self): pass @@ -41,18 +39,20 @@ def __exit__(self, *args, **kwargs): # pylint: disable=unused-argument pass -@patch.object(ModelStorage, "_setup_model_storage_directory", noop_setup_directory) +@patch.object(ModelStorage, "_setup_model_storage_directories", noop_setup_directories) def test_model_storage_init(): - model_storage = ModelStorage(modyn_config) - assert model_storage.config == modyn_config - + model_storage = ModelStorage(get_modyn_config()) + assert model_storage.config == get_modyn_config() -@patch.object(ModelStorage, "_setup_model_storage_directory", noop_setup_directory) -def test_validate_config(): - model_storage = ModelStorage(modyn_config) - assert model_storage._validate_config()[0] +@patch("modyn.model_storage.model_storage.GRPCServer", 
MockGRPCServer) +@patch("modyn.model_storage.model_storage.FTPServer", MockFTPServer) +@patch("os.makedirs") +def test_cleanup_at_exit(test_os_makedirs): + ftp_directory = pathlib.Path(tempfile.gettempdir()) / "ftp_model_storage" + assert not ftp_directory.exists() -def test_invalid_config(): - with pytest.raises(ValueError): - ModelStorage(get_invalid_modyn_config()) + model_storage = ModelStorage(get_modyn_config()) + assert ftp_directory.exists() + model_storage.run() + assert not ftp_directory.exists() diff --git a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py index 8af446797..627655417 100644 --- a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py +++ b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py @@ -11,12 +11,14 @@ GetSamplesRequest, GetSelectionStrategyRequest, JsonString, + ModelStorageStrategyInfo, NumberOfPartitionsResponse, NumberOfSamplesResponse, PipelineResponse, RegisterPipelineRequest, SamplesResponse, SelectionStrategyResponse, + StrategyConfig, TriggerResponse, ) from modyn.selector.internal.grpc.selector_grpc_servicer import SelectorGRPCServicer @@ -49,18 +51,34 @@ def test_register_pipeline(test_register_pipeline: MagicMock): config["selector"]["trigger_sample_directory"] = tmp_dir mgr = SelectorManager(config) servicer = SelectorGRPCServicer(mgr, 8096) + model_storage_strategy = ModelStorageStrategyInfo( + full_model_strategy_config=StrategyConfig(name="PyTorchFullModel") + ) request = RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value="strat"), model_id="ResNet18", model_configuration=JsonString(value="{}"), + amp=True, + model_storage_strategy=model_storage_strategy, ) test_register_pipeline.return_value = 42 response: PipelineResponse = servicer.register_pipeline(request, None) assert response.pipeline_id == 42 - test_register_pipeline.assert_called_once_with(2, "strat", "ResNet18", "{}") + 
test_register_pipeline.assert_called_once() + + arguments = test_register_pipeline.call_args[0] + assert arguments[0] == 2 + assert arguments[1] == "strat" + assert arguments[2] == "ResNet18" + assert arguments[3] == "{}" + assert arguments[4] + assert arguments[5].name == "PyTorchFullModel" + assert arguments[5].zip is None + assert arguments[6] is None + assert arguments[7] is None @patch.object(SelectorManager, "init_metadata_db", noop_init_metadata_db) diff --git a/modyn/tests/selector/internal/test_selector_manager.py b/modyn/tests/selector/internal/test_selector_manager.py index 7bb72a62d..b1ffeab05 100644 --- a/modyn/tests/selector/internal/test_selector_manager.py +++ b/modyn/tests/selector/internal/test_selector_manager.py @@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch import pytest +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.selector_manager import SelectorManager from modyn.selector.internal.selector_strategies.abstract_selection_strategy import AbstractSelectionStrategy from modyn.selector.selector import Selector @@ -45,7 +46,16 @@ def __init__(self, modyn_config: dict): # pylint: disable=super-init-not-called self.current_pipeline_id = 0 # pylint: disable=unused-argument - def register_pipeline(self, number_of_workers: int, model_id: int, model_config: dict) -> Optional[int]: + def register_pipeline( + self, + number_of_workers: int, + model_id: int, + model_config: dict, + amp: bool, + full_model_strategy: ModelStorageStrategyConfig, + incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None, + full_model_interval: Optional[int] = None, + ) -> Optional[int]: pid = self.current_pipeline_id self.current_pipeline_id += 1 return pid @@ -90,13 +100,20 @@ def test_register_pipeline(test__instantiate_strategy: MagicMock): assert len(selec._selectors) == 0 - assert selec.register_pipeline(42, "{}", "RestNet18", "{}") == 0 + assert ( + selec.register_pipeline( + 42, "{}", 
"RestNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) + == 0 + ) assert len(selec._selectors) == 1 assert isinstance(selec._selectors[0]._strategy, MockStrategy) with pytest.raises(ValueError): - selec.register_pipeline(0, "strat", "RestNet18", "{}") + selec.register_pipeline( + 0, "strat", "RestNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) @patch("modyn.selector.internal.selector_manager.MetadataDatabaseConnection", MockDatabaseConnection) @@ -112,7 +129,9 @@ def test_get_sample_keys_and_weights( selec = SelectorManager(config) test__instantiate_strategy.return_value = MockStrategy() - pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "RestNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) with pytest.raises(ValueError): # Non existing pipeline @@ -143,7 +162,9 @@ def test_inform_data(selector_inform_data: MagicMock, test__instantiate_strategy with pytest.raises(ValueError): selec.inform_data(0, [10], [0], [0]) - pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "RestNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_inform_data.return_value = None selec.inform_data(pipe_id, [10], [0], [0]) @@ -165,7 +186,9 @@ def test_inform_data_and_trigger(selector_inform_data_and_trigger: MagicMock, te with pytest.raises(ValueError): selec.inform_data_and_trigger(0, [10], [0], [0]) - pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "RestNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_inform_data_and_trigger.return_value = None selec.inform_data_and_trigger(pipe_id, [10], [0], [0]) @@ -184,7 +207,9 @@ def test_get_available_labels(selector_get_available_labels: MagicMock, test__in selector = SelectorManager(config) 
test__instantiate_strategy.return_value = MockStrategy() - pipe_id = selector.register_pipeline(2, "{}", "RestNet18", "{}") + pipe_id = selector.register_pipeline( + 2, "{}", "RestNet18", "{}", False, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_get_available_labels.return_value = None selector.get_available_labels(pipe_id) @@ -228,7 +253,9 @@ def test_get_number_of_samples(selector_get_number_of_samples: MagicMock, test__ with pytest.raises(ValueError): selec.get_number_of_samples(0, 0) - pipe_id = selec.register_pipeline(2, "{}", "RestNet18", "{}") + pipe_id = selec.register_pipeline( + 2, "{}", "RestNet18", "{}", True, ModelStorageStrategyConfig(name="PyTorchFullModel") + ) selector_get_number_of_samples.return_value = 12 assert selec.get_number_of_samples(pipe_id, 21) == 12 diff --git a/modyn/tests/supervisor/internal/test_grpc_handler.py b/modyn/tests/supervisor/internal/test_grpc_handler.py index a2ae55118..abe69afcb 100644 --- a/modyn/tests/supervisor/internal/test_grpc_handler.py +++ b/modyn/tests/supervisor/internal/test_grpc_handler.py @@ -18,7 +18,6 @@ from modyn.selector.internal.grpc.generated.selector_pb2 import ( DataInformRequest, GetNumberOfSamplesRequest, - JsonString, NumberOfSamplesResponse, PipelineResponse, RegisterPipelineRequest, @@ -337,20 +336,37 @@ def test_register_pipeline_at_selector(test_grpc_connection_established): result = handler.register_pipeline_at_selector( { "pipeline": {"name": "test"}, - "training": {"dataloader_workers": 2, "selection_strategy": {}}, + "training": {"dataloader_workers": 2, "selection_strategy": {}, "amp": True}, "model": {"id": "ResNet18"}, + "model_storage": { + "full_model_strategy": {"name": "PyTorchFullModel", "zip": True, "zip_algorithm": "ZIP_DEFLATED"}, + "incremental_model_strategy": {"name": "WeightsDifference", "config": {"operator": "sub"}}, + "full_model_interval": 10, + }, } ) assert result == 42 - mock.assert_called_once_with( - RegisterPipelineRequest( - num_workers=2, - 
selection_strategy=JsonString(value="{}"), - model_id="ResNet18", - model_configuration=JsonString(value="{}"), - ) + mock.assert_called_once() + + request: RegisterPipelineRequest = mock.call_args.args[0] + assert request.num_workers == 2 + assert request.selection_strategy.value == "{}" + assert request.model_id == "ResNet18" + assert request.model_configuration.value == "{}" + assert request.amp + assert request.model_storage_strategy.full_model_strategy_config.name == "PyTorchFullModel" + assert request.model_storage_strategy.full_model_strategy_config.zip + assert request.model_storage_strategy.full_model_strategy_config.zip_algorithm == "ZIP_DEFLATED" + assert not request.model_storage_strategy.full_model_strategy_config.HasField("config") + assert request.model_storage_strategy.incremental_model_strategy_config.name == "WeightsDifference" + assert not request.model_storage_strategy.incremental_model_strategy_config.HasField("zip") + assert not request.model_storage_strategy.incremental_model_strategy_config.HasField("zip_algorithm") + assert ( + json.loads(request.model_storage_strategy.incremental_model_strategy_config.config.value)["operator"] + == "sub" ) + assert request.model_storage_strategy.full_model_interval == 10 def test_unregister_pipeline_at_selector(): @@ -539,6 +555,19 @@ def test_start_evaluation(test_connection_established): avail_method.assert_called_once() +def test__prepare_evaluation_request(): + pipeline_config = get_minimal_pipeline_config() + request = GRPCHandler._prepare_evaluation_request(pipeline_config["evaluation"]["datasets"][0], 23, "cpu") + + assert request.trained_model_id == 23 + assert request.device == "cpu" + assert request.batch_size == 64 + assert request.dataset_info.dataset_id == "MNIST_eval" + assert request.dataset_info.num_dataloaders == 2 + assert request.metrics[0].name == "Accuracy" + assert request.metrics[0].config.value == "{}" + + @patch("modyn.supervisor.internal.grpc_handler.grpc_connection_established", 
return_value=True) def test_wait_for_evaluation_completion(test_connection_established): mgr = enlighten.get_manager() diff --git a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py index a4c31ba43..77947a975 100644 --- a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py +++ b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py @@ -4,7 +4,6 @@ import os import pathlib import platform -import shutil import tempfile from io import BytesIO from time import sleep @@ -13,6 +12,7 @@ import torch from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( FetchModelRequest, FetchModelResponse, @@ -35,6 +35,7 @@ from modyn.trainer_server.internal.utils.trainer_messages import TrainerMessages from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.trainer_server.internal.utils.training_process_info import TrainingProcessInfo +from modyn.utils import calculate_checksum DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_trainer_server.database" @@ -63,26 +64,6 @@ "model_storage": {"hostname": "model_storage", "port": "5004"}, } -modyn_download_file_config = { - "trainer_server": { - "hostname": "trainer_server", - "port": "5001", - "ftp_port": "3001", - "offline_dataset_directory": "/tmp/offline_dataset", - }, - "metadata_database": { - "drivername": "sqlite", - "username": "", - "password": "", - "host": "", - "port": 0, - "database": f"{DATABASE}", - }, - "storage": {"hostname": "storage", "port": "5002"}, - "selector": {"hostname": "selector", "port": "5003"}, - "model_storage": {"hostname": "localhost", "port": "5004", "ftp_port": "3002"}, -} - def setup(): if os.path.exists(DATABASE): @@ -91,7 
+72,15 @@ def setup(): with MetadataDatabaseConnection(modyn_config) as database: database.create_tables() - database.register_pipeline(1, "model", json.dumps({})) + database.register_pipeline( + 1, + "model", + json.dumps({}), + True, + ModelStorageStrategyConfig(name="PyTorchFullModel"), + incremental_model_strategy=None, + full_model_interval=None, + ) def teardown(): @@ -142,7 +131,6 @@ def get_start_training_request(checkpoint_path=""): pipeline_id=1, trigger_id=1, device="cpu", - amp=False, batch_size=32, torch_optimizers_configuration=JsonString( value=json.dumps( @@ -177,6 +165,7 @@ def get_training_info( training_id, "model", json.dumps({}), + True, storage_address, selector_address, offline_dataset_path, @@ -237,18 +226,19 @@ def test_start_training_invalid_id(test_hasattr, test_connect_to_model_storage): assert not resp.training_started -@patch("modyn.trainer_server.internal.grpc.trainer_server_grpc_servicer.download_file") +@patch( + "modyn.trainer_server.internal.grpc.trainer_server_grpc_servicer.download_trained_model", + return_value=pathlib.Path("downloaded_model.modyn"), +) @patch.object(TrainerServerGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) @patch("modyn.trainer_server.internal.grpc.trainer_server_grpc_servicer.hasattr", return_value=True) @patch( "modyn.trainer_server.internal.utils.training_info.getattr", return_value=DummyModelWrapper, ) -def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storage, download_file_mock: MagicMock): +def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storage, download_model_mock: MagicMock): with tempfile.TemporaryDirectory() as modyn_temp: - trainer_server = TrainerServerGRPCServicer(modyn_download_file_config, modyn_temp) - with open(pathlib.Path(modyn_temp) / "testpath.modyn", "wb") as file: - file.write(b"Our pretrained model!") + trainer_server = TrainerServerGRPCServicer(modyn_config, modyn_temp) mock_start = mock.Mock() 
mock_start.side_effect = noop trainer_server._training_dict[1] = None @@ -263,6 +253,7 @@ def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storag assert trainer_server._next_training_id == 2 assert trainer_server._training_dict[1].model_id == "model" assert trainer_server._training_dict[1].model_configuration_dict == {} + assert trainer_server._training_dict[1].amp request = get_start_training_request() request.use_pretrained_model = True @@ -270,16 +261,20 @@ def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storag resp = trainer_server.start_training(request, None) - download_file_mock.assert_called_once() - kwargs = download_file_mock.call_args.kwargs - remote_file_path = kwargs["remote_file_path"] - local_file_path = kwargs["local_file_path"] - - shutil.copyfile(pathlib.Path(modyn_temp) / remote_file_path, local_file_path) + download_model_mock.assert_called_once() + kwargs = download_model_mock.call_args.kwargs + remote_file_path = kwargs["remote_path"] + base_directory = kwargs["base_directory"] + identifier = kwargs["identifier"] assert resp.training_id == 2 - with open(trainer_server._training_dict[resp.training_id].pretrained_model_path, "rb") as file: - assert file.read().decode("utf-8") == "Our pretrained model!" 
+ assert str(remote_file_path) == "testpath.modyn" + assert base_directory == trainer_server._modyn_base_dir + assert resp.training_started + assert resp.training_id == identifier + assert ( + str(trainer_server._training_dict[resp.training_id].pretrained_model_path) == "downloaded_model.modyn" + ) @patch.object(TrainerServerGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) @@ -569,6 +564,7 @@ def test_store_final_model_found(test_is_alive, test_connect_to_model_storage): checkpoint_file = base_path / "model_final.modyn" torch.save(dict_to_save, checkpoint_file) + checksum = calculate_checksum(checkpoint_file) trainer_server._training_dict[1] = training_info trainer_server._training_process_dict[1] = get_training_process_info() @@ -585,6 +581,7 @@ def test_store_final_model_found(test_is_alive, test_connect_to_model_storage): assert req.hostname == "trainer_server" assert req.port == 3001 assert req.model_path == "model_final.modyn" + assert req.checksum == checksum @patch.object(TrainerServerGRPCServicer, "connect_to_model_storage", return_value=DummyModelStorageStub()) diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index d22e77515..695294d62 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -201,7 +201,6 @@ def get_training_info( pipeline_id=1, trigger_id=1, device="cpu", - amp=False, data_info=Data(dataset_id="MNIST", num_dataloaders=2), torch_optimizers_configuration=JsonString(value=json.dumps(torch_optimizers_configuration)), criterion_parameters=JsonString(value=json.dumps({})), @@ -223,6 +222,7 @@ def get_training_info( training_id, "model", json.dumps({}), + False, storage_address, selector_address, offline_dataset_path, diff --git a/modyn/tests/utils/test_utils.py b/modyn/tests/utils/test_utils.py index 
f65327761..20bb683ae 100644 --- a/modyn/tests/utils/test_utils.py +++ b/modyn/tests/utils/test_utils.py @@ -1,5 +1,6 @@ # pylint: disable=unused-argument,redefined-outer-name import pathlib +import tempfile from unittest.mock import patch import grpc @@ -11,6 +12,7 @@ from modyn.supervisor.internal.grpc_handler import GRPCHandler from modyn.trainer_server.internal.trainer.remote_downsamplers import RemoteLossDownsampling from modyn.utils import ( + calculate_checksum, convert_timestr_to_seconds, current_time_millis, deserialize_function, @@ -22,8 +24,10 @@ package_available_and_can_be_imported, seed_everything, trigger_available, + unzip_file, validate_timestr, validate_yaml, + zip_file, ) from modyn.utils.utils import instantiate_class @@ -202,3 +206,46 @@ def test_instantiate_class_not_existing(): # missing parameters with pytest.raises(TypeError): instantiate_class("modyn.common.trigger_sample", "TriggerSampleStorage") + + +def test_calculate_checksum(): + with tempfile.TemporaryDirectory() as tempdir: + tempdir_path = pathlib.Path(tempdir) + + with open(tempdir_path / "testfile1.txt", "w", encoding="utf-8") as file: + file.write("This is a test") + + with open(tempdir_path / "testfile2.txt", "w", encoding="utf-8") as file: + file.write("This is a test") + + assert calculate_checksum(tempdir_path / "testfile1.txt") == calculate_checksum(tempdir_path / "testfile2.txt") + assert calculate_checksum(tempdir_path / "testfile1.txt", chunk_num_blocks=20) == calculate_checksum( + tempdir_path / "testfile2.txt", chunk_num_blocks=10 + ) + assert calculate_checksum(tempdir_path / "testfile1.txt", hash_func_name="blake2s") != calculate_checksum( + tempdir_path / "testfile2.txt", chunk_num_blocks=10 + ) + + +def test_zip_and_unzip_file(): + with tempfile.TemporaryDirectory() as tempdir: + tempdir_path = pathlib.Path(tempdir) + + text_file_path = tempdir_path / "testfile.txt" + zip_file_path = tempdir_path / "testfile.zip" + + with open(text_file_path, "w", encoding="utf-8") 
as file: + file.write("This is a testfile!") + + zip_file(text_file_path, zip_file_path, remove_file=True) + + assert not text_file_path.exists() + assert zip_file_path.exists() and zip_file_path.is_file() + + unzip_file(zip_file_path, text_file_path, remove_file=True) + + assert not zip_file_path.exists() + assert text_file_path.exists() and text_file_path.is_file() + + with open(text_file_path, "r", encoding="utf-8") as file: + assert file.read() == "This is a testfile!" diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index 9df036ad7..fd8388ca0 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xb8\x05\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 
\n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x42\x07\n\x05_seed\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xab\x05\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x1c\n\x14use_pretrained_model\x18\x04 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x05 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\x06 \x01(\x05\x12\x12\n\nbatch_size\x18\x07 \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x08 \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\t \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\n \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0b \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0c \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\r \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x0e \x03(\t\x12)\n\x0clr_scheduler\x18\x0f 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x11 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x12 \x01(\x05\x12\x11\n\x04seed\x18\x13 \x01(\x05H\x00\x88\x01\x01\x42\x07\n\x05_seed\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', globals()) @@ -34,21 +34,21 @@ _CHECKPOINTINFO._serialized_start=220 _CHECKPOINTINFO._serialized_end=290 _STARTTRAININGREQUEST._serialized_start=293 - _STARTTRAININGREQUEST._serialized_end=989 - _STARTTRAININGRESPONSE._serialized_start=991 - _STARTTRAININGRESPONSE._serialized_end=1061 - _TRAININGSTATUSREQUEST._serialized_start=1063 - _TRAININGSTATUSREQUEST._serialized_end=1107 - _TRAININGSTATUSRESPONSE._serialized_start=1110 - _TRAININGSTATUSRESPONSE._serialized_end=1498 - _STOREFINALMODELREQUEST._serialized_start=1500 - _STOREFINALMODELREQUEST._serialized_end=1545 - _STOREFINALMODELRESPONSE._serialized_start=1547 - _STOREFINALMODELRESPONSE._serialized_end=1611 - _GETLATESTMODELREQUEST._serialized_start=1613 - _GETLATESTMODELREQUEST._serialized_end=1657 - _GETLATESTMODELRESPONSE._serialized_start=1659 - _GETLATESTMODELRESPONSE._serialized_end=1724 - _TRAINERSERVER._serialized_start=1727 - _TRAINERSERVER._serialized_end=2184 + _STARTTRAININGREQUEST._serialized_end=976 + _STARTTRAININGRESPONSE._serialized_start=978 + _STARTTRAININGRESPONSE._serialized_end=1048 + _TRAININGSTATUSREQUEST._serialized_start=1050 + _TRAININGSTATUSREQUEST._serialized_end=1094 + _TRAININGSTATUSRESPONSE._serialized_start=1097 + _TRAININGSTATUSRESPONSE._serialized_end=1485 + _STOREFINALMODELREQUEST._serialized_start=1487 + 
_STOREFINALMODELREQUEST._serialized_end=1532 + _STOREFINALMODELRESPONSE._serialized_start=1534 + _STOREFINALMODELRESPONSE._serialized_end=1598 + _GETLATESTMODELREQUEST._serialized_start=1600 + _GETLATESTMODELREQUEST._serialized_end=1644 + _GETLATESTMODELRESPONSE._serialized_start=1646 + _GETLATESTMODELRESPONSE._serialized_end=1711 + _TRAINERSERVER._serialized_start=1714 + _TRAINERSERVER._serialized_end=2171 # @@protoc_insertion_point(module_scope) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index 2175b8cb0..529d1ebd4 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -115,7 +115,6 @@ class StartTrainingRequest(google.protobuf.message.Message): PIPELINE_ID_FIELD_NUMBER: builtins.int TRIGGER_ID_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int - AMP_FIELD_NUMBER: builtins.int USE_PRETRAINED_MODEL_FIELD_NUMBER: builtins.int LOAD_OPTIMIZER_STATE_FIELD_NUMBER: builtins.int PRETRAINED_MODEL_ID_FIELD_NUMBER: builtins.int @@ -135,7 +134,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int trigger_id: builtins.int device: builtins.str - amp: builtins.bool use_pretrained_model: builtins.bool load_optimizer_state: builtins.bool pretrained_model_id: builtins.int @@ -167,7 +165,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int = ..., trigger_id: builtins.int = ..., device: builtins.str = ..., - amp: builtins.bool = ..., use_pretrained_model: builtins.bool = ..., load_optimizer_state: builtins.bool = ..., pretrained_model_id: builtins.int = ..., @@ -186,7 +183,7 @@ class StartTrainingRequest(google.protobuf.message.Message): seed: builtins.int | None = ..., ) -> None: ... 
def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "seed", b"seed", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... def WhichOneof(self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]) -> typing_extensions.Literal["seed"] | None: ... global___StartTrainingRequest = StartTrainingRequest diff --git a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py index 7fd8f6feb..1ca623d59 100644 --- a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py +++ b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py @@ -10,7 +10,7 @@ import torch # pylint: disable=no-name-in-module -from modyn.common.ftp import download_file, get_pretrained_model_callback +from modyn.common.ftp import download_trained_model from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( FetchModelRequest, @@ -36,6 +36,7 @@ from modyn.trainer_server.internal.utils.training_info import TrainingInfo from modyn.trainer_server.internal.utils.training_process_info import TrainingProcessInfo from modyn.utils import current_time_millis, dynamic_module_import, 
grpc_connection_established +from modyn.utils.utils import calculate_checksum logger = logging.getLogger(__name__) @@ -96,7 +97,7 @@ def start_training( logger.info("Received start training request.") with MetadataDatabaseConnection(self._config) as database: - model_id, model_config = database.get_model_configuration(request.pipeline_id) + model_id, model_config, amp = database.get_model_configuration(request.pipeline_id) if not hasattr(dynamic_module_import("modyn.models"), model_id): logger.error(f"Model {model_id} not available!") @@ -104,7 +105,7 @@ def start_training( pretrained_model_path: Optional[pathlib.Path] = None if request.use_pretrained_model: - fetch_request = FetchModelRequest(model_id=request.pretrained_model_id) + fetch_request = FetchModelRequest(model_id=request.pretrained_model_id, load_metadata=True) fetch_resp: FetchModelResponse = self.model_storage_stub.FetchModel(fetch_request) if not fetch_resp.success: @@ -118,19 +119,18 @@ def start_training( training_id = self._next_training_id self._next_training_id += 1 - pretrained_model_path = self._modyn_base_dir / pathlib.Path(f"pretrained_model_{training_id}.modyn") - - download_file( - hostname=self._config["model_storage"]["hostname"], - port=int(self._config["model_storage"]["ftp_port"]), - user="modyn", - password="modyn", - remote_file_path=pathlib.Path(fetch_resp.model_path), - local_file_path=pretrained_model_path, - callback=get_pretrained_model_callback(logger), + pretrained_model_path = download_trained_model( + logger=logger, + model_storage_config=self._config["model_storage"], + remote_path=pathlib.Path(fetch_resp.model_path), + checksum=fetch_resp.checksum, + identifier=training_id, + base_directory=self._modyn_base_dir, ) - logger.info(f"Completed pretrained model download. 
Local path: {pretrained_model_path}") + if not pretrained_model_path: + return StartTrainingResponse(training_started=False) + else: with self._lock: training_id = self._next_training_id @@ -142,6 +142,7 @@ def start_training( training_id, model_id, model_config, + amp, self._storage_address, self._selector_address, self._offline_dataset_directory, @@ -331,9 +332,9 @@ def store_final_model( logger.error(f"Training with id {training_id} is still running.") return StoreFinalModelResponse(valid_state=False) - final_model_path = self._training_dict[training_id].final_checkpoint_path / "model_final.modyn" - if final_model_path.exists(): - prefix_path = str(final_model_path.relative_to(self._modyn_base_dir)) + final_checkpoint_path = self._prepare_final_model(training_id) + if final_checkpoint_path: + prefix_model_path = final_checkpoint_path.relative_to(self._modyn_base_dir) pipeline_id = self._training_dict[training_id].pipeline_id trigger_id = self._training_dict[training_id].trigger_id @@ -343,24 +344,30 @@ def store_final_model( trigger_id=trigger_id, hostname=self._config["trainer_server"]["hostname"], port=int(self._config["trainer_server"]["ftp_port"]), - model_path=prefix_path, + model_path=str(prefix_model_path), + checksum=calculate_checksum(final_checkpoint_path), ) register_response: RegisterModelResponse = self.model_storage_stub.RegisterModel(register_request) if not register_response.success: - logger.error(f"Could not store final model from training id {training_id}.") + logger.error(f"Could not store final model from training id {training_id} at model storage.") return StoreFinalModelResponse(valid_state=False) - - os.remove(final_model_path) - - logger.info(f"Deleted final model at {final_model_path}") + os.remove(final_checkpoint_path) + logger.info(f"Deleted final model on path {final_checkpoint_path}") return StoreFinalModelResponse(valid_state=True, model_id=register_response.model_id) logger.error(f"Could not find final checkpoint of training with 
ID {training_id}.") return StoreFinalModelResponse(valid_state=False) + def _prepare_final_model(self, training_id: int) -> Optional[pathlib.Path]: + final_checkpoint_path = self._training_dict[training_id].final_checkpoint_path / "model_final.modyn" + if not final_checkpoint_path.exists(): + return None + + return final_checkpoint_path + def get_latest_model( self, request: GetLatestModelRequest, diff --git a/modyn/trainer_server/internal/utils/training_info.py b/modyn/trainer_server/internal/utils/training_info.py index 893990e35..d71050afe 100644 --- a/modyn/trainer_server/internal/utils/training_info.py +++ b/modyn/trainer_server/internal/utils/training_info.py @@ -19,6 +19,7 @@ def __init__( training_id: int, model_id: str, model_config: str, + amp: bool, storage_address: str, selector_address: str, offline_dataset_path: str, @@ -55,7 +56,7 @@ def __init__( self.batch_size = request.batch_size self.torch_criterion = request.torch_criterion - self.amp = request.amp + self.amp = amp self.lr_scheduler = json.loads(request.lr_scheduler.value) diff --git a/modyn/utils/__init__.py b/modyn/utils/__init__.py index 268ae1ac5..fbd37a71b 100644 --- a/modyn/utils/__init__.py +++ b/modyn/utils/__init__.py @@ -12,6 +12,7 @@ LABEL_TRANSFORMER_FUNC_NAME, MAX_MESSAGE_SIZE, DownsamplingMode, + calculate_checksum, convert_timestr_to_seconds, current_time_millis, deserialize_function, @@ -24,8 +25,10 @@ package_available_and_can_be_imported, seed_everything, trigger_available, + unzip_file, validate_timestr, validate_yaml, + zip_file, ) files = os.listdir(os.path.dirname(__file__)) diff --git a/modyn/utils/utils.py b/modyn/utils/utils.py index f1fb54193..1d34f849c 100644 --- a/modyn/utils/utils.py +++ b/modyn/utils/utils.py @@ -1,4 +1,5 @@ import errno +import hashlib import importlib import importlib.util import inspect @@ -13,6 +14,7 @@ from inspect import isfunction from types import ModuleType from typing import Any, Callable, Optional +from zipfile import ZIP_DEFLATED, 
ZipFile import grpc import numpy as np @@ -239,3 +241,67 @@ def get_partition_for_worker(worker_id: int, total_workers: int, total_num_eleme start_index = 0 return start_index, worker_subset_size + + +def calculate_checksum(file_path: pathlib.Path, hash_func_name: str = "blake2b", chunk_num_blocks: int = 128) -> bytes: + """ + Returns the checksum of a file. + + Args: + file_path: the path to the file. + hash_func_name: the name of the hash function. + chunk_num_blocks: size of the update step. + + Returns: + bytes: the checksum that is calculated over the file. + """ + assert file_path.exists() and file_path.is_file() + + hash_func = hashlib.new(hash_func_name) + with open(file_path, "rb") as file: + while chunk := file.read(chunk_num_blocks * hash_func.block_size): + hash_func.update(chunk) + return hash_func.digest() + + +def zip_file( + file_path: pathlib.Path, zipped_file_path: pathlib.Path, compression: int = ZIP_DEFLATED, remove_file: bool = False +) -> None: + """ + Zips a file. + + Args: + file_path: the path to the file that should be zipped. + zipped_file_path: the path to the zipped file. + compression: the compression algorithm to be used. + remove_file: if the file should be removed after zipping. + """ + assert file_path.exists(), "Cannot work with non-existing file" + + with ZipFile(zipped_file_path, "w", compression=compression) as zipfile: + zipfile.write(file_path) + + if remove_file: + os.remove(file_path) + + +def unzip_file( + zipped_file_path: pathlib.Path, file_path: pathlib.Path, compression: int = ZIP_DEFLATED, remove_file: bool = False +) -> None: + """ + Unzips a file. + + Args: + zipped_file_path: path to the zipped file. + file_path: path pointing to the location where the unzipped file should be stored. + compression: the compression algorithm to be used. + remove_file: true if we should remove the zipped file afterwards. 
+ """ + with ZipFile(zipped_file_path, "r", compression=compression) as zipfile: + assert len(zipfile.namelist()) == 1 + + with open(file_path, "wb") as file: + file.write(zipfile.read(zipfile.namelist()[0])) + + if remove_file: + os.remove(zipped_file_path) From 44d7e16316f58b5f5dc54d46f0d156ec0a4aeaa2 Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Wed, 16 Aug 2023 21:03:01 +0200 Subject: [PATCH 03/12] Merge main branch and follow-up * solve merge conflicts and edit pipeline schema --- modyn/config/schema/pipeline-schema.yaml | 9 +++-- .../grpc/generated/trainer_server_pb2.py | 36 +++++++++---------- .../grpc/generated/trainer_server_pb2.pyi | 14 ++------ 3 files changed, 24 insertions(+), 35 deletions(-) diff --git a/modyn/config/schema/pipeline-schema.yaml b/modyn/config/schema/pipeline-schema.yaml index 5cf2a24e2..9919a3b22 100644 --- a/modyn/config/schema/pipeline-schema.yaml +++ b/modyn/config/schema/pipeline-schema.yaml @@ -48,7 +48,7 @@ properties: name: type: string description: | - Name of the full model strategy. We currently support NaiveFullModelStrategy and BinaryFullModelStrategy. + Name of the full model strategy. We currently support PyTorchFullModel and CompressedFullModel. config: type: object description: | @@ -60,7 +60,7 @@ properties: zip_algorithm: type: string description: | - Which zip algorithm to use. Default is DEFLATED. + Which zip algorithm to use. Default is ZIP_DEFLATED. required: - name incremental_model_strategy: @@ -71,8 +71,7 @@ properties: name: type: string description: | - Name of the incremental model strategy. We currently support NaiveDeltaBasedModelStrategy and - BinaryDeltaBasedModelStrategy. + Name of the incremental model strategy. We currently support WeightsDifference. config: type: object description: | @@ -84,7 +83,7 @@ properties: zip_algorithm: type: string description: | - Which zip algorithm to use. Default is DEFLATED. + Which zip algorithm to use. Default is ZIP_DEFLATED. 
required: - name full_model_interval: diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index 300ccddf2..a9f0cec7d 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xb9\x06\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x17 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xe8\x05\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x1c\n\x14use_pretrained_model\x18\x04 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x05 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\x06 \x01(\x05\x12\x12\n\nbatch_size\x18\x07 \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x08 \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\t \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\n \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0b \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0c \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\r \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x0e \x03(\t\x12)\n\x0clr_scheduler\x18\x0f 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x11 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x12 \x01(\x05\x12\x11\n\x04seed\x18\x13 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x14 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', globals()) @@ -34,21 +34,21 @@ _CHECKPOINTINFO._serialized_start=220 _CHECKPOINTINFO._serialized_end=290 _STARTTRAININGREQUEST._serialized_start=293 - _STARTTRAININGREQUEST._serialized_end=1118 - _STARTTRAININGRESPONSE._serialized_start=1120 - _STARTTRAININGRESPONSE._serialized_end=1190 - _TRAININGSTATUSREQUEST._serialized_start=1192 - _TRAININGSTATUSREQUEST._serialized_end=1236 - _TRAININGSTATUSRESPONSE._serialized_start=1239 - _TRAININGSTATUSRESPONSE._serialized_end=1627 - _STOREFINALMODELREQUEST._serialized_start=1629 - _STOREFINALMODELREQUEST._serialized_end=1674 - _STOREFINALMODELRESPONSE._serialized_start=1676 - _STOREFINALMODELRESPONSE._serialized_end=1740 - _GETLATESTMODELREQUEST._serialized_start=1742 - _GETLATESTMODELREQUEST._serialized_end=1786 - _GETLATESTMODELRESPONSE._serialized_start=1788 - _GETLATESTMODELRESPONSE._serialized_end=1853 - _TRAINERSERVER._serialized_start=1856 - _TRAINERSERVER._serialized_end=2313 + _STARTTRAININGREQUEST._serialized_end=1037 + _STARTTRAININGRESPONSE._serialized_start=1039 + _STARTTRAININGRESPONSE._serialized_end=1109 + _TRAININGSTATUSREQUEST._serialized_start=1111 + _TRAININGSTATUSREQUEST._serialized_end=1155 + _TRAININGSTATUSRESPONSE._serialized_start=1158 + _TRAININGSTATUSRESPONSE._serialized_end=1546 + _STOREFINALMODELREQUEST._serialized_start=1548 + 
_STOREFINALMODELREQUEST._serialized_end=1593 + _STOREFINALMODELRESPONSE._serialized_start=1595 + _STOREFINALMODELRESPONSE._serialized_end=1659 + _GETLATESTMODELREQUEST._serialized_start=1661 + _GETLATESTMODELREQUEST._serialized_end=1705 + _GETLATESTMODELRESPONSE._serialized_start=1707 + _GETLATESTMODELRESPONSE._serialized_end=1772 + _TRAINERSERVER._serialized_start=1775 + _TRAINERSERVER._serialized_end=2232 # @@protoc_insertion_point(module_scope) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index 1ecc8cd7f..b7158b914 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -115,9 +115,6 @@ class StartTrainingRequest(google.protobuf.message.Message): PIPELINE_ID_FIELD_NUMBER: builtins.int TRIGGER_ID_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int - AMP_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: builtins.int - MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int USE_PRETRAINED_MODEL_FIELD_NUMBER: builtins.int LOAD_OPTIMIZER_STATE_FIELD_NUMBER: builtins.int PRETRAINED_MODEL_ID_FIELD_NUMBER: builtins.int @@ -138,10 +135,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int trigger_id: builtins.int device: builtins.str - amp: builtins.bool - model_id: builtins.str - @property - def model_configuration(self) -> global___JsonString: ... 
use_pretrained_model: builtins.bool load_optimizer_state: builtins.bool pretrained_model_id: builtins.int @@ -175,9 +168,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int = ..., trigger_id: builtins.int = ..., device: builtins.str = ..., - amp: builtins.bool = ..., - model_id: builtins.str = ..., - model_configuration: global___JsonString | None = ..., use_pretrained_model: builtins.bool = ..., load_optimizer_state: builtins.bool = ..., pretrained_model_id: builtins.int = ..., @@ -196,8 +186,8 @@ class StartTrainingRequest(google.protobuf.message.Message): seed: builtins.int | None = ..., tokenizer: global___PythonString | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... 
- def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "model_id", b"model_id", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]) -> typing_extensions.Literal["seed"] | None: ... 
@typing.overload From 6f3df84bf20980078207211cdf69b5859fd5c4ee Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Mon, 25 Sep 2023 10:29:18 +0200 Subject: [PATCH 04/12] Implement XOR-Full strategy * make metadata required * rename model id to model class name to mitigate confusion * make model storage folder configurable * add documentation --- benchmark/mnist/mnist.yaml | 6 - docker-compose.yml | 5 +- environment.yml | 1 + .../integrationtest_model_storage.py | 24 +- modyn/config/examples/example-pipeline.yaml | 2 +- modyn/config/examples/modyn_config.yaml | 1 + modyn/config/schema/modyn_config_schema.yaml | 4 + modyn/config/schema/pipeline-schema.yaml | 8 +- .../metadata_database_connection.py | 23 +- modyn/metadata_database/models/pipelines.py | 6 +- .../models/trained_models.py | 2 +- .../utils/model_storage_strategy_config.py | 2 +- .../grpc/model_storage_grpc_servicer.py | 11 +- .../internal/model_storage_manager.py | 216 +++++++++++++----- .../abstract_difference_operator.py | 5 +- .../abstract_model_storage_strategy.py | 7 +- .../sub_difference_operator.py | 6 +- .../xor_difference_operator.py | 20 +- .../abstract_full_model_strategy.py | 12 +- .../compressed_full_model.py | 10 +- .../pytorch_full_model.py | 2 +- .../abstract_incremental_model_strategy.py | 14 +- .../weights_difference.py | 125 +++++++++- .../model_storage/internal/utils/__init__.py | 10 +- .../internal/utils/data_types.py | 28 ++- ...ge_strategy.py => model_storage_policy.py} | 19 +- modyn/model_storage/model_storage.py | 28 ++- modyn/protos/selector.proto | 6 +- .../internal/grpc/generated/selector_pb2.py | 92 ++++---- .../internal/grpc/generated/selector_pb2.pyi | 20 +- .../internal/grpc/selector_grpc_servicer.py | 16 +- modyn/selector/internal/selector_manager.py | 4 +- modyn/supervisor/internal/grpc_handler.py | 17 +- .../grpc/test_evaluator_grpc_servicer.py | 4 +- .../models/test_pipelines.py | 22 +- .../models/test_trained_models.py | 10 +- .../test_metadata_database_connection.py | 7 
+- .../test_sub_difference_operator.py | 14 +- .../test_xor_difference_operator.py | 14 +- .../test_compressed_full_model.py | 12 +- .../test_pytorch_full_model.py | 30 ++- .../test_weights_difference.py | 93 +++++++- .../internal/test_model_storage_manager.py | 152 ++++++------ .../internal/utils/test_data_types.py | 12 +- ...rategy.py => test_model_storage_policy.py} | 38 +-- .../tests/model_storage/test_model_storage.py | 23 +- .../grpc/test_selector_grpc_servicer.py | 10 +- .../supervisor/internal/test_grpc_handler.py | 8 +- .../grpc/generated/trainer_server_pb2.py | 36 +-- 49 files changed, 771 insertions(+), 466 deletions(-) rename modyn/model_storage/internal/utils/{model_storage_strategy.py => model_storage_policy.py} (82%) rename modyn/tests/model_storage/internal/utils/{test_model_storage_strategy.py => test_model_storage_policy.py} (50%) diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml index 59f48ae99..4ebb6acdf 100644 --- a/benchmark/mnist/mnist.yaml +++ b/benchmark/mnist/mnist.yaml @@ -11,12 +11,6 @@ model_storage: name: "PyTorchFullModel" zip: True zip_algorithm: ZIP_DEFLATED - incremental_model_strategy: - name: "WeightsDifference" - config: - operator: xor - split_exponent: True - full_model_interval: 10 training: gpus: 1 device: "cuda:0" diff --git a/docker-compose.yml b/docker-compose.yml index 490342082..965867f92 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -71,6 +71,8 @@ services: build: context: . 
dockerfile: docker/Model_Storage/Dockerfile + volumes: + - model_storage-data:/tmp/models evaluator: restart: on-failure depends_on: @@ -160,4 +162,5 @@ services: volumes: storage-data: selector-data: - downsampling-data: \ No newline at end of file + downsampling-data: + model_storage-data: \ No newline at end of file diff --git a/environment.yml b/environment.yml index 4d5f24566..155205684 100644 --- a/environment.yml +++ b/environment.yml @@ -26,6 +26,7 @@ dependencies: - pyaml - numpy - pandas + - bitstring - tensorboard - pyftpdlib - types-protobuf diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index 16023bfe0..0d5e1a6ce 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -1,13 +1,14 @@ # end-to-end testing of the model storage component import io import json +import logging import pathlib import shutil import grpc import torch from integrationtests.utils import get_modyn_config -from modyn.common.ftp import delete_file, download_file, upload_file +from modyn.common.ftp import delete_file, download_trained_model, upload_file from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import Trigger from modyn.metadata_database.utils import ModelStorageStrategyConfig @@ -95,8 +96,8 @@ def delete_data_from_database(config: dict, pipeline_id: int, trigger_id: int): database.session.commit() -def check_loaded_model() -> None: - with open(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, "rb") as state_file: +def check_loaded_model(path: pathlib.Path) -> None: + with open(path, "rb") as state_file: checkpoint = torch.load(io.BytesIO(state_file.read())) assert "model" in checkpoint, "Model state is not stored in file" @@ -146,18 +147,19 @@ def test_model_storage(config: dict): assert response_fetch.success, 
"Could not find model with this id" # download the model (dummy file) from model storage - download_file( - config["model_storage"]["hostname"], - int(config["model_storage"]["ftp_port"]), - "modyn", - "modyn", - remote_file_path=pathlib.Path(response_fetch.model_path), - local_file_path=TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL_RESP, + downloaded_path = download_trained_model( + logging.getLogger(__name__), + config["model_storage"], + remote_path=pathlib.Path(response_fetch.model_path), checksum=response_fetch.checksum, + identifier=42, + base_directory=TEST_MODELS_PATH, ) + assert downloaded_path is not None + # compare if content matches initial dummy file - check_loaded_model() + check_loaded_model(downloaded_path) # delete model on model storage component request_delete = DeleteModelRequest(model_id=model_id) diff --git a/modyn/config/examples/example-pipeline.yaml b/modyn/config/examples/example-pipeline.yaml index f7575f1c2..3c2feb98a 100644 --- a/modyn/config/examples/example-pipeline.yaml +++ b/modyn/config/examples/example-pipeline.yaml @@ -16,7 +16,7 @@ model_storage: config: operator: xor split_exponent: True - full_model_interval: 10 + full_model_interval: 10 training: gpus: 1 device: "cpu" diff --git a/modyn/config/examples/modyn_config.yaml b/modyn/config/examples/modyn_config.yaml index b2e80a1f1..1177384aa 100644 --- a/modyn/config/examples/modyn_config.yaml +++ b/modyn/config/examples/modyn_config.yaml @@ -173,6 +173,7 @@ model_storage: hostname: "model_storage" port: "50059" ftp_port: "50060" + models_directory: "/tmp/models" evaluator: hostname: "evaluator" diff --git a/modyn/config/schema/modyn_config_schema.yaml b/modyn/config/schema/modyn_config_schema.yaml index a5e9d4497..7cf525916 100644 --- a/modyn/config/schema/modyn_config_schema.yaml +++ b/modyn/config/schema/modyn_config_schema.yaml @@ -214,6 +214,10 @@ properties: type: string description: | The port of the FDP server used by the model_storage component. 
+ models_directory: + type: string + description: | + The directory where we store the trained models. required: - hostname - port diff --git a/modyn/config/schema/pipeline-schema.yaml b/modyn/config/schema/pipeline-schema.yaml index c9c2b3d5c..03f93b67c 100644 --- a/modyn/config/schema/pipeline-schema.yaml +++ b/modyn/config/schema/pipeline-schema.yaml @@ -84,12 +84,12 @@ properties: type: string description: | Which zip algorithm to use. Default is ZIP_DEFLATED. + full_model_interval: + type: number + description: | + In which interval we are using the full model strategy. required: - name - full_model_interval: - type: number - description: | - In which interval we are using the full model strategy. required: - full_model_strategy training: diff --git a/modyn/metadata_database/metadata_database_connection.py b/modyn/metadata_database/metadata_database_connection.py index c8683a8a7..6ad76e3f4 100644 --- a/modyn/metadata_database/metadata_database_connection.py +++ b/modyn/metadata_database/metadata_database_connection.py @@ -72,7 +72,7 @@ def create_tables(self) -> None: def register_pipeline( self, num_workers: int, - model_id: str, + model_class_name: str, model_config: str, amp: bool, full_model_strategy: ModelStorageStrategyConfig, @@ -83,18 +83,20 @@ def register_pipeline( Args: num_workers (int): Number of workers in the pipeline. - model_id (str): the model name that is used by the pipeline. + model_class_name (str): the model class name that is used by the pipeline. model_config (str): the serialized model configuration options. amp (bool): whether amp is enabled for the model. full_model_strategy: the strategy used to store full models. - incremental_model_strategy: the strategy used to store models incrementally. - full_model_interval: interval between which the full model strategy is used. + incremental_model_strategy: the (optional) strategy used to store models incrementally. 
+ full_model_interval: the (optional) interval between which the full model strategy is used. If not set, + the first model is stored according to the full model strategy, and the remaining + by using the incremental model strategy. Returns: int: Id of the newly created pipeline. """ pipeline = Pipeline( num_workers=num_workers, - model_id=model_id, + model_class_name=model_class_name, model_config=model_config, amp=amp, full_model_strategy_name=full_model_strategy.name, @@ -131,17 +133,18 @@ def add_trained_model( pipeline_id: int, trigger_id: int, model_path: str, - metadata_path: Optional[str] = None, + metadata_path: str, parent_model: Optional[int] = None, ) -> int: - """Add a trained model to the database. + """Add a trained model to the database. Whenever the parent model is not specified, the model is expected to be + fully stored, i.e., by applying a full model strategy. Args: pipeline_id: id of the pipeline it was created from. trigger_id: id of the trigger it was created. model_path: path on the local filesystem on which the model is stored. metadata_path: the path on the local filesystem on which metadata to the model are stored. - parent_model: id of the parent model. + parent_model: (optional) id of the parent model. Returns: int: Id of the registered model """ @@ -164,7 +167,7 @@ def get_model_configuration(self, pipeline_id: int) -> tuple[str, str, bool]: pipeline_id: id of the pipeline from which we want to extract the model. Returns: - (str, str, bool): the model id, its configuration options and if amp is enabled. + (str, str, bool): the model class name, its configuration options and if amp is enabled. 
""" pipeline: Pipeline = self.session.query(Pipeline).get(pipeline_id) - return pipeline.model_id, pipeline.model_config, pipeline.amp + return pipeline.model_class_name, pipeline.model_config, pipeline.amp diff --git a/modyn/metadata_database/models/pipelines.py b/modyn/metadata_database/models/pipelines.py index 739160de1..2b27c92a1 100644 --- a/modyn/metadata_database/models/pipelines.py +++ b/modyn/metadata_database/models/pipelines.py @@ -12,17 +12,17 @@ class Pipeline(MetadataBase): __table_args__ = {"extend_existing": True} pipeline_id = Column("pipeline_id", Integer, primary_key=True) num_workers = Column("num_workers", Integer, nullable=False) - model_id = Column("model_id", String(length=50), nullable=False) + model_class_name = Column("model_class_name", String(length=50), nullable=False) model_config = Column("model_config", String(length=500), nullable=False) amp = Column("amp", Boolean, nullable=False) full_model_strategy_name = Column("full_model_strategy_name", String(length=50), nullable=False) - full_model_strategy_zip = Column("full_model_strategy_zip", Boolean, default=None, nullable=True) + full_model_strategy_zip = Column("full_model_strategy_zip", Boolean, default=False) full_model_strategy_zip_algorithm = Column( "full_model_strategy_zip_algorithm", String(length=50), default=None, nullable=True ) full_model_strategy_config = Column("full_model_strategy_config", String(length=500), default=None, nullable=True) inc_model_strategy_name = Column("inc_model_strategy_name", String(length=50), default=None, nullable=True) - inc_model_strategy_zip = Column("inc_model_strategy_zip", Boolean, default=None, nullable=True) + inc_model_strategy_zip = Column("inc_model_strategy_zip", Boolean, default=False) inc_model_strategy_zip_algorithm = Column( "inc_model_strategy_zip_algorithm", String(length=50), default=None, nullable=True ) diff --git a/modyn/metadata_database/models/trained_models.py b/modyn/metadata_database/models/trained_models.py index 
111f2d40b..c2da41b22 100644 --- a/modyn/metadata_database/models/trained_models.py +++ b/modyn/metadata_database/models/trained_models.py @@ -17,7 +17,7 @@ class TrainedModel(MetadataBase): trigger_id = Column("trigger_id", Integer) timestamp = Column("timestamp", TIMESTAMP(timezone=False), default=datetime.now()) model_path = Column("model_path", String(length=200), nullable=False) - metadata_path = Column("metadata_path", String(length=200), nullable=True, default=None) + metadata_path = Column("metadata_path", String(length=200), nullable=False) parent_model = Column("parent_model", Integer, ForeignKey(f"{__tablename__}.model_id"), nullable=True, default=None) children = relationship("TrainedModel") __table_args__ = ( diff --git a/modyn/metadata_database/utils/model_storage_strategy_config.py b/modyn/metadata_database/utils/model_storage_strategy_config.py index de3663754..1d2124ff2 100644 --- a/modyn/metadata_database/utils/model_storage_strategy_config.py +++ b/modyn/metadata_database/utils/model_storage_strategy_config.py @@ -3,7 +3,7 @@ class ModelStorageStrategyConfig: """ - Helper class to represent the configuration of a model storage strategy. + This class is used to hold the configuration options of a model storage strategy. """ def __init__(self, name: str): diff --git a/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py b/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py index 0e16f950c..63152a21a 100644 --- a/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py +++ b/modyn/model_storage/internal/grpc/model_storage_grpc_servicer.py @@ -32,15 +32,15 @@ def __init__(self, config: dict, storage_dir: pathlib.Path, ftp_dir: pathlib.Pat Args: config (dict): Configuration of the storage module. - storage_dir (path): Path to the model storage directory. - ftp_dir (path): Path to the ftp directory. + storage_dir (path): Path to the directory, where the trained models are stored. 
+ ftp_dir (path): Path to the temporary FTP directory, where the trained models are served. """ super().__init__() self._config = config self.ftp_dir = ftp_dir self.storage_dir = storage_dir - self.model_storage_manager = ModelStorageManager(self._config, self.storage_dir) + self.model_storage_manager = ModelStorageManager(self._config, self.storage_dir, self.ftp_dir) def RegisterModel(self, request: RegisterModelRequest, context: grpc.ServicerContext) -> RegisterModelResponse: """Registers a new model at the model storage component by downloading it from a given server. @@ -124,12 +124,9 @@ def DeleteModel(self, request: DeleteModelRequest, context: grpc.ServicerContext model_id = request.model_id logger.info(f"Try to delete model having id {model_id}") - response = DeleteModelResponse() success = self.model_storage_manager.delete_model(model_id) - if success: logger.info(f"Deleted model {request.model_id}.") else: logger.error(f"Deletion of model {request.model_id} was not successful.") - response.success = success - return response + return DeleteModelResponse(success=success) diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py index 811f4e9e7..7655c8983 100644 --- a/modyn/model_storage/internal/model_storage_manager.py +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -8,7 +8,7 @@ import torch from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import Pipeline, TrainedModel -from modyn.model_storage.internal.utils import ModelStorageStrategy +from modyn.model_storage.internal.utils import ModelStoragePolicy from modyn.utils import current_time_millis, dynamic_module_import, unzip_file, zip_file logger = logging.getLogger(__name__) @@ -19,30 +19,58 @@ class ModelStorageManager: Class used as manager of the model storage component. Implements all model storage related functionalities. 
""" - def __init__(self, modyn_config: dict, storage_dir: pathlib.Path): + def __init__(self, modyn_config: dict, storage_dir: pathlib.Path, ftp_dir: pathlib.Path): + """ + Constructor of the model storage manager. It establishes a connection to the metadata database in order + to store information related to the trained models. + + Args: + modyn_config: the modyn configuration. + storage_dir: path to the folder, in which the trained models are stored. + ftp_dir: FTP directory, which is used as temporary folder for serving trained models. + """ self._modyn_config = modyn_config self._storage_dir = storage_dir + self._ftp_dir = ftp_dir def store_model(self, pipeline_id: int, trigger_id: int, checkpoint_path: pathlib.Path) -> int: + """ + Store the trained model contained in the checkpoint file to disk. It uses the model storage policy that is + specified for the pipeline. Depending on the trigger id, it is either stored fully (according to full model + strategy) or incrementally by using the incremental model strategy. + + Args: + pipeline_id: the pipeline identifier for the model. + trigger_id: the trigger associated with the model. + checkpoint_path: path to the checkpoint containing the model. + + Returns: + int: the model id which identifies the stored model. + """ checkpoint = torch.load(checkpoint_path) + policy = self.get_model_storage_policy(pipeline_id) - model_storage_strategy = self.get_model_storage_strategy(pipeline_id) - + # split the model (stored under the "model" key) from metadata. assert "model" in checkpoint state_dict = checkpoint["model"] local_model_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.model" model_path = self._storage_dir / local_model_filename - parent_id = self._handle_new_model(pipeline_id, trigger_id, state_dict, model_path, model_storage_strategy) + + # handle the new model according to the model storage policy. If it is stored incrementally, we receive + # the model id of the parent. 
+ parent_id = self._handle_new_model(pipeline_id, trigger_id, state_dict, model_path, policy) checkpoint.pop("model") - # now checkpoint only contains optimizer state and metadata + # now checkpoint only contains optimizer state and metadata. local_metadata_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.metadata.zip" metadata_path = self._storage_dir / local_metadata_filename - with tempfile.NamedTemporaryFile() as temp_file: + # zip the metadata file. + with tempfile.NamedTemporaryFile(dir=self._ftp_dir) as temp_file: torch.save(checkpoint, temp_file) zip_file(pathlib.Path(temp_file.name), metadata_path) + # add the new model to the database. with MetadataDatabaseConnection(self._modyn_config) as database: return database.add_trained_model( pipeline_id, trigger_id, local_model_filename, local_metadata_filename, parent_id @@ -54,33 +82,53 @@ def _handle_new_model( trigger_id: int, state_dict: dict, model_path: pathlib.Path, - model_storage_strategy: ModelStorageStrategy, + policy: ModelStoragePolicy, ) -> Optional[int]: - if model_storage_strategy.incremental_model_strategy and ( - model_storage_strategy.full_model_interval is None - or trigger_id % model_storage_strategy.full_model_interval != 0 + """ + Handle the new model according to the model storage policy. + + Args: + pipeline_id: the pipeline, to which the model belongs. + trigger_id: the trigger identifier associated with the model. + state_dict: the model's state. + model_path: path, under which the model must be stored. + policy: the model storage policy applied to store the model. + Returns: + int: if the model is stored incrementally, the parent model id is returned. + """ + + # check whether we must apply the incremental storage strategy or the full model strategy. 
+ if policy.incremental_model_strategy and ( + policy.full_model_interval is None or trigger_id % policy.full_model_interval != 0 ): - prev_model: Optional[TrainedModel] = self._get_previous_model(pipeline_id, trigger_id) - if prev_model: - # handle incremental model storage - previous_model_state = self._get_base_model_state(pipeline_id) + parent_model_id: Optional[int] = self._get_parent_model_id(pipeline_id, trigger_id) + if parent_model_id is not None: + # store the model according to the incremental model strategy. + parent_model_state = self._get_base_model_state(pipeline_id) - # load previous model state - self._reconstruct_model(prev_model.model_id, previous_model_state, model_storage_strategy) + # load model state of the parent model. + self._reconstruct_model(parent_model_id, parent_model_state, policy) - # store incremental model - model_storage_strategy.incremental_model_strategy.save_model( - state_dict, previous_model_state, model_path - ) + # finally store the model delta. + policy.incremental_model_strategy.store_model(state_dict, parent_model_state, model_path) - return prev_model.model_id + return parent_model_id logger.warning("Previous model is not available! Storing full model...") - # handle full model storage - model_storage_strategy.full_model_strategy.save_model(state_dict, model_path) + # store the model in its entirety. + policy.full_model_strategy.store_model(state_dict, model_path) return None def _get_base_model_state(self, pipeline_id: int) -> dict: + """ + Get the base model state associated with a pipeline. + + Args: + pipeline_id: the involved pipeline. + + Returns: + dict: the plain model state derived from the model architecture of the pipeline's models. 
+ """ with MetadataDatabaseConnection(self._modyn_config) as database: model_id, model_config, amp = database.get_model_configuration(pipeline_id) model_module = dynamic_module_import("modyn.models") @@ -89,47 +137,84 @@ def _get_base_model_state(self, pipeline_id: int) -> dict: model_handler = getattr(model_module, model_id) return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() - def _reconstruct_model( - self, model_id: int, model_state: dict, model_storage_strategy: ModelStorageStrategy - ) -> None: - # we recursively overwrite the model state + def _reconstruct_model(self, model_id: int, model_state: dict, policy: ModelStoragePolicy) -> None: + """ + Reconstruct the model given the model state and the model storage policy. + The function recursively call itself, if the model is stored as a model delta. + In this case it first loads the (fully stored) parent model into the model state before overwriting it + according to the incremental model storage policy. + + Args: + model_id: the identifier of the model to be reconstructed. + model_state: the plain model state (or the loaded parent model state). + policy: the model storage policy containing the strategies. + Returns: + None: the model state is overwritten in order to minimize memory overhead. + """ + + # we recursively overwrite the model state. with MetadataDatabaseConnection(self._modyn_config) as database: model: TrainedModel = database.session.get(TrainedModel, model_id) if not model.parent_model: - # base case: we can load a fully stored model - model_storage_strategy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + # base case: we can load a fully stored model. 
+ policy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path) return - # recursive step: we recurse to load the model state of the parent model - self._reconstruct_model(model.parent_model, model_state, model_storage_strategy) + # recursive step: we recurse to load the model state of the parent model. + self._reconstruct_model(model.parent_model, model_state, policy) - # we apply the incremental strategy to load our model state - model_storage_strategy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + # we apply the incremental strategy to load our model state. + policy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path) - def _get_previous_model(self, pipeline_id: int, trigger_id: int) -> Optional[TrainedModel]: + def _get_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[int]: + """ + Get the id of the parent model given the trigger id of a pipeline. + + Args: + pipeline_id: the pipeline that generated the model. + trigger_id: the trigger associated with the model. + + Returns: + Optional[int]: the parent model id (if it exists). + """ with MetadataDatabaseConnection(self._modyn_config) as database: - return ( + previous_model: TrainedModel = ( database.session.query(TrainedModel) .filter(TrainedModel.pipeline_id == pipeline_id, TrainedModel.trigger_id == trigger_id - 1) .first() ) + if not previous_model: + return None + if previous_model.parent_model is None: + return previous_model.model_id + return previous_model.parent_model + def load_model(self, model_id: int, metadata: bool) -> Optional[dict]: + """ + Loads a given model and optionally, also appends the metadata. + + Args: + model_id: the model identifier of the model. + metadata: whether metadata should be loaded alongside. + + Returns: + Optional[dict]: dictionary containing the model state and metadata if the model exists. 
+ """ with MetadataDatabaseConnection(self._modyn_config) as database: model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id) if model is None: logger.error(f"Model {model_id} does not exist.") return None - model_storage_strategy = self.get_model_storage_strategy(model.pipeline_id) + policy = self.get_model_storage_policy(model.pipeline_id) + # retrieve the model by loading its state dictionary. model_state = self._get_base_model_state(model.pipeline_id) - self._reconstruct_model(model_id, model_state, model_storage_strategy) + self._reconstruct_model(model_id, model_state, policy) model_dict = {"model": model_state} + # append the metadata to the dictionary if specified. if metadata: - if not model.metadata_path: - logger.error(f"Metadata not available for model {model_id}") - return None with tempfile.NamedTemporaryFile() as temp_file: temp_file_path = pathlib.Path(temp_file.name) unzip_file(self._storage_dir / model.metadata_path, temp_file_path) @@ -139,39 +224,52 @@ def load_model(self, model_id: int, metadata: bool) -> Optional[dict]: return model_dict def delete_model(self, model_id: int) -> bool: + """ + Deletes a given model id. Only works, if all depending models (children) are deleted. + + Args: + model_id: the identifier of the model. + + Returns: + bool: True, whenever deletion was successful. 
+ """ with MetadataDatabaseConnection(self._modyn_config) as database: model: Optional[TrainedModel] = database.session.get(TrainedModel, model_id) if model is None: logger.error(f"Trained model {model_id} was not found.") return False - model_storage_strategy = self.get_model_storage_strategy(model.pipeline_id) - child_state = self._get_base_model_state(model.pipeline_id) - - child: TrainedModel - for child in model.children: - assert child.pipeline_id == model.pipeline_id, "Pipeline does not match for parent and child model" - self._reconstruct_model(child.model_id, child_state, model_storage_strategy) - model_storage_strategy.full_model_strategy.save_model(child_state, self._storage_dir / child.model_path) - database.session.query(TrainedModel).filter(TrainedModel.model_id == child.model_id).update( - {"parent_model": None} - ) + children = model.children + if len(children) > 0: + child_ids = [str(child.model_id) for child in children] + logger.info(f"Model {model_id} has depending child models: {', '.join(child_ids)}") + return False os.remove(self._storage_dir / model.model_path) - if model.metadata_path: - os.remove(self._storage_dir / model.metadata_path) + os.remove(self._storage_dir / model.metadata_path) database.session.delete(model) database.session.commit() - logger.info(f"Successfully deleted model {model_id} and converted child models to be fully stored.") + logger.info(f"Successfully deleted model {model_id}.") return True - def get_model_storage_strategy(self, pipeline_id: int) -> ModelStorageStrategy: + def get_model_storage_policy(self, pipeline_id: int) -> ModelStoragePolicy: + """ + Returns the model storage policy associated with the pipeline. + + Args: + pipeline_id: the id of the pipeline, from which the policy is taken. + + Returns: + ModelStoragePolicy: the model storage policy of the pipeline. 
+ """ + with MetadataDatabaseConnection(self._modyn_config) as database: pipeline: Pipeline = database.session.query(Pipeline).get(pipeline_id) - strategy = ModelStorageStrategy( + policy = ModelStoragePolicy( + self._ftp_dir, pipeline.full_model_strategy_name, pipeline.full_model_strategy_zip, pipeline.full_model_strategy_zip_algorithm, @@ -179,7 +277,7 @@ def get_model_storage_strategy(self, pipeline_id: int) -> ModelStorageStrategy: ) if pipeline.inc_model_strategy_name is not None: - strategy.register_incremental_model_strategy( + policy.register_incremental_model_strategy( pipeline.inc_model_strategy_name, pipeline.inc_model_strategy_zip, pipeline.inc_model_strategy_zip_algorithm, @@ -187,4 +285,4 @@ def get_model_storage_strategy(self, pipeline_id: int) -> ModelStorageStrategy: pipeline.full_model_interval, ) - return strategy + return policy diff --git a/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py b/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py index 25c4bef79..161870fd1 100644 --- a/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py +++ b/modyn/model_storage/internal/storage_strategies/abstract_difference_operator.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from typing import BinaryIO import torch @@ -27,13 +26,13 @@ def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> byt @staticmethod @abstractmethod - def restore(tensor_prev: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor: + def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: """ Restores a weight tensor. Args: tensor_prev: the tensor representing some weights of the preceding model. - bytestream: difference bytes from which to restore the weights of the current model. + buffer: difference bytes, from which to restore the weights of the current model. Returns: tensor: the weight tensor of the current model. 
diff --git a/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py b/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py index 3378c6c31..48b458926 100644 --- a/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py +++ b/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py @@ -1,3 +1,4 @@ +import pathlib from abc import ABC, abstractmethod from zipfile import ZIP_DEFLATED @@ -9,15 +10,17 @@ class AbstractModelStorageStrategy(ABC): Base class for all model storage strategies. """ - def __init__(self, zip_activated: bool, zip_algorithm_name: str, config: dict): + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): """ Initialize a model storage strategy. Args: - zip_activated: whether the generated file(s) are zipped. + zipping_dir: directory, in which the model is zipped. + zip_activated: whether the generated file is zipped. zip_algorithm_name: name of the zip algorithm. config: configuration options for the strategy. 
""" + self.zipping_dir = zipping_dir self.zip = zip_activated self.zip_algorithm = ZIP_DEFLATED self._validate_zip_config(zip_algorithm_name) diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py b/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py index 5626a004a..2d4efec4c 100644 --- a/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py @@ -1,5 +1,3 @@ -from typing import BinaryIO - import torch from modyn.model_storage.internal.storage_strategies.abstract_difference_operator import AbstractDifferenceOperator from modyn.model_storage.internal.utils.data_types import read_tensor_from_bytes @@ -12,6 +10,6 @@ def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> byt return diff.numpy().tobytes() @staticmethod - def restore(tensor_prev: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor: - difference_tensor = read_tensor_from_bytes(tensor_prev, bytestream) + def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: + difference_tensor = read_tensor_from_bytes(tensor_prev, buffer) return tensor_prev + difference_tensor diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py b/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py index 7e969124a..8272ef2cd 100644 --- a/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py @@ -1,14 +1,6 @@ -import math -from typing import BinaryIO - -import numpy as np import torch from modyn.model_storage.internal.storage_strategies.abstract_difference_operator import AbstractDifferenceOperator -from 
modyn.model_storage.internal.utils.data_types import ( - create_tensor, - torch_dtype_to_byte_size, - torch_dtype_to_numpy_dict, -) +from modyn.model_storage.internal.utils import read_tensor_from_bytes class XorDifferenceOperator(AbstractDifferenceOperator): @@ -20,11 +12,7 @@ def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> byt return bytes(a ^ b for (a, b) in zip(bytes_curr, bytes_prev)) @staticmethod - def restore(tensor_prev: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor: - shape = tensor_prev.shape - num_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor_prev.dtype] - byte_data: bytes = bytestream.read(num_bytes) + def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: prev_model_data = tensor_prev.numpy().tobytes() - new_model_data = bytes(a ^ b for (a, b) in zip(byte_data, prev_model_data)) - np_dtype = np.dtype(torch_dtype_to_numpy_dict[tensor_prev.dtype]) - return create_tensor(new_model_data, np_dtype, shape) + new_model_data = bytes(a ^ b for (a, b) in zip(prev_model_data, buffer)) + return read_tensor_from_bytes(tensor_prev, new_model_data) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py index 07bc08983..794995d04 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py @@ -13,7 +13,7 @@ class AbstractFullModelStrategy(AbstractModelStorageStrategy, ABC): """ @abstractmethod - def _save_model(self, model_state: dict, file_path: pathlib.Path) -> None: + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: """ Stores the model state to the given file. 
@@ -23,14 +23,14 @@ def _save_model(self, model_state: dict, file_path: pathlib.Path) -> None: """ raise NotImplementedError() - def save_model(self, model_state: dict, file_path: pathlib.Path) -> None: + def store_model(self, model_state: dict, file_path: pathlib.Path) -> None: if self.zip: - with tempfile.NamedTemporaryFile() as temporary_file: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - self._save_model(model_state, temp_file_path) + self._store_model(model_state, temp_file_path) zip_file(temp_file_path, file_path, self.zip_algorithm, remove_file=False) else: - self._save_model(model_state, file_path) + self._store_model(model_state, file_path) @abstractmethod def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: @@ -45,7 +45,7 @@ def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: def load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: if self.zip: - with tempfile.NamedTemporaryFile() as temporary_file: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) unzip_file(file_path, temp_file_path, compression=self.zip_algorithm, remove_file=False) self._load_model(base_model_state, temp_file_path) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py index 3ef7e63e0..9f3a0edd7 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py @@ -1,7 +1,8 @@ +import math import pathlib from modyn.model_storage.internal.storage_strategies.full_model_strategies import AbstractFullModelStrategy -from modyn.model_storage.internal.utils import 
read_tensor_from_bytes +from modyn.model_storage.internal.utils import read_tensor_from_bytes, torch_dtype_to_byte_size class CompressedFullModel(AbstractFullModelStrategy): @@ -9,15 +10,16 @@ class CompressedFullModel(AbstractFullModelStrategy): This full model strategy stores the weights as binary sequence. """ - def _save_model(self, model_state: dict, file_path: pathlib.Path) -> None: + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: with open(file_path, "wb") as file: - for _, tensor in model_state.items(): + for tensor in model_state.values(): file.write(tensor.numpy().tobytes()) def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: with open(file_path, "rb") as file: for layer, tensor in base_model_state.items(): - base_model_state[layer] = read_tensor_from_bytes(tensor, file) + num_bytes = math.prod(tensor.shape) * torch_dtype_to_byte_size[tensor.dtype] + base_model_state[layer] = read_tensor_from_bytes(tensor, file.read(num_bytes)) def validate_config(self, config: dict) -> None: pass diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py index e28f039ef..02e344bb3 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py @@ -11,7 +11,7 @@ class PyTorchFullModel(AbstractFullModelStrategy): This full model strategy naively stores the whole model on disk (default pytorch implementation). 
""" - def _save_model(self, model_state: dict, file_path: pathlib.Path) -> None: + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: torch.save(model_state, file_path) def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py index ce2a21aea..8dbcfc42c 100644 --- a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py @@ -13,9 +13,9 @@ class AbstractIncrementalModelStrategy(AbstractModelStorageStrategy, ABC): """ @abstractmethod - def _save_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: + def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: """ - Stores the delta between to successive models. + Stores the delta between two successive models. Args: model_state: the newer model state. 
@@ -24,14 +24,14 @@ def _save_model(self, model_state: dict, prev_model_state: dict, file_path: path """ raise NotImplementedError() - def save_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: + def store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: if self.zip: - with tempfile.NamedTemporaryFile() as temporary_file: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - self._save_model(model_state, prev_model_state, temp_file_path) + self._store_model(model_state, prev_model_state, temp_file_path) zip_file(temp_file_path, file_path, self.zip_algorithm, remove_file=False) else: - self._save_model(model_state, prev_model_state, file_path) + self._store_model(model_state, prev_model_state, file_path) @abstractmethod def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: @@ -46,7 +46,7 @@ def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: def load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: if self.zip: - with tempfile.NamedTemporaryFile() as temporary_file: + with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) unzip_file(file_path, temp_file_path, compression=self.zip_algorithm, remove_file=False) self._load_model(prev_model_state, temp_file_path) diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py index 92bc23bf7..70299982a 100644 --- a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py @@ -1,6 +1,10 @@ import io +import math import pathlib +from 
typing import BinaryIO, Union +import torch +from bitstring import BitArray from modyn.model_storage.internal.storage_strategies.difference_operators import ( SubDifferenceOperator, XorDifferenceOperator, @@ -8,6 +12,7 @@ from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import ( AbstractIncrementalModelStrategy, ) +from modyn.model_storage.internal.utils import torch_dtype_to_byte_size available_difference_operators = {"xor": XorDifferenceOperator, "sub": SubDifferenceOperator} @@ -18,25 +23,134 @@ class WeightsDifference(AbstractIncrementalModelStrategy): weight tensors. It currently supports two difference operators: xor and sub. """ - def __init__(self, zip_activated: bool, zip_algorithm_name: str, config: dict): + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): self.difference_operator = SubDifferenceOperator self.split_exponent = False + self.rle = False - super().__init__(zip_activated, zip_algorithm_name, config) + super().__init__(zipping_dir, zip_activated, zip_algorithm_name, config) - def _save_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: + def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: bytestream = io.BytesIO() + exponent_bytestream = io.BytesIO() if self.split_exponent else None for tensor_model, tensor_prev_model in zip(model_state.values(), prev_model_state.values()): - bytestream.write(self.difference_operator.calculate_difference(tensor_model, tensor_prev_model)) + difference = self.difference_operator.calculate_difference(tensor_model, tensor_prev_model) + + if exponent_bytestream and tensor_model.dtype == torch.float32: + for i in range(0, len(difference), 4): + reordered_diff = self.reorder_buffer(difference[i : i + 4]) + bytestream.write(reordered_diff[0:3]) + exponent_bytestream.write(reordered_diff[3:4]) + else: + bytestream.write(difference) with 
open(file_path, "wb") as file: + if exponent_bytestream: + exponents = exponent_bytestream.getvalue() + if self.rle: + exponents = self.rle_bytes(exponents) + file.write(len(exponents).to_bytes(8, byteorder="big")) + file.write(exponents) file.write(bytestream.getbuffer().tobytes()) def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: with open(file_path, "rb") as file: + if not self.split_exponent: + for layer_name, tensor in prev_model_state.items(): + shape = tensor.shape + num_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor.dtype] + + prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) + else: + self._load_model_split_exponent(prev_model_state, file) + + def _load_model_split_exponent(self, prev_model_state: dict, file: BinaryIO) -> None: + exponent_bytes_amount = int.from_bytes(file.read(8), byteorder="big") + + with io.BytesIO() as exponent_bytes: + exponent_bytes.write( + self.inv_rle_bytes(file.read(exponent_bytes_amount)) if self.rle else file.read(exponent_bytes_amount) + ) + exponent_bytes.seek(0) + for layer_name, tensor in prev_model_state.items(): - prev_model_state[layer_name] = self.difference_operator.restore(tensor, file) + shape = tensor.shape + num_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor.dtype] + + if tensor.dtype == torch.float32: + buffer = bytearray(num_bytes) + for i in range(0, num_bytes, 4): + buffer[i : i + 3] = file.read(3) + buffer[i + 3 : i + 4] = exponent_bytes.read(1) + + prev_model_state[layer_name] = self.difference_operator.restore(tensor, self.reorder_buffer(buffer)) + else: + prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) + + @staticmethod + def reorder_buffer(buffer: Union[bytes, bytearray]) -> bytes: + bit_array = BitArray(buffer) + array_size = len(bit_array) + + for i in range(0, array_size, 32): + # exchange sign bit with last exponent bit + sign_bit = bit_array[i + 24] + 
bit_array[i + 24] = bit_array[i + 16] + bit_array[i + 16] = sign_bit + + return bit_array.bytes + + @staticmethod + def rle_bytes(buffer: bytes) -> bytes: + """ + Perform byte-wise run-length encoding. + + Args: + buffer: the bytes to be encoded. + + Returns: + bytes: the encoded bytes. + """ + if len(buffer) == 0: + return buffer + bytestream = io.BytesIO() + + curr = buffer[0] + count = 0 + + for byte in buffer: + if byte == curr and count < 255: + count += 1 + else: + bytestream.write(count.to_bytes(1, byteorder="big")) + bytestream.write(curr.to_bytes(1, byteorder="big")) + curr = byte + count = 1 + bytestream.write(count.to_bytes(1, byteorder="big")) + bytestream.write(curr.to_bytes(1, byteorder="big")) + + return bytestream.getvalue() + + @staticmethod + def inv_rle_bytes(buffer: bytes) -> bytes: + """ + Decode run-length encoded bytes. + + Args: + buffer: the encoded bytes. + + Returns: + bytes: the decoded bytes. + """ + assert len(buffer) % 2 == 0, "should be of even length" + bytestream = io.BytesIO() + + for i in range(0, len(buffer), 2): + count = int.from_bytes(buffer[i : i + 1], byteorder="big") + + bytestream.write(count * buffer[i + 1 : i + 2]) + return bytestream.getvalue() def validate_config(self, config: dict) -> None: if "operator" in config: @@ -45,3 +159,4 @@ def validate_config(self, config: dict) -> None: raise ValueError(f"Operator should be one of {available_difference_operators}.") self.difference_operator = available_difference_operators[difference_operator_name] self.split_exponent = config["split_exponent"] if "split_exponent" in config else False + self.rle = config["rle"] if "rle" in config else False diff --git a/modyn/model_storage/internal/utils/__init__.py b/modyn/model_storage/internal/utils/__init__.py index 846887f99..0dc282c60 100644 --- a/modyn/model_storage/internal/utils/__init__.py +++ b/modyn/model_storage/internal/utils/__init__.py @@ -6,14 +6,8 @@ import os -from .data_types import ( # noqa: F401 - create_tensor, - 
numpy_dtype_to_torch_dict, - read_tensor_from_bytes, - torch_dtype_to_byte_size, - torch_dtype_to_numpy_dict, -) -from .model_storage_strategy import ModelStorageStrategy # noqa: F401 +from .data_types import read_tensor_from_bytes, torch_dtype_to_byte_size, torch_dtype_to_numpy_dict # noqa: F401 +from .model_storage_policy import ModelStoragePolicy # noqa: F401 files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") diff --git a/modyn/model_storage/internal/utils/data_types.py b/modyn/model_storage/internal/utils/data_types.py index ae929e9cb..701c20df1 100644 --- a/modyn/model_storage/internal/utils/data_types.py +++ b/modyn/model_storage/internal/utils/data_types.py @@ -1,9 +1,6 @@ """ This class provides useful functionalities for different data types and conversions between them. """ -import math -from typing import BinaryIO - import numpy as np import torch @@ -20,8 +17,6 @@ torch.complex128: np.complex128, } -numpy_dtype_to_torch_dict = {value: key for (key, value) in torch_dtype_to_numpy_dict.items()} - torch_dtype_to_byte_size = { torch.uint8: 1, torch.int8: 1, @@ -36,16 +31,19 @@ } -def read_tensor_from_bytes(tensor: torch.Tensor, bytestream: BinaryIO) -> torch.Tensor: - shape = tensor.shape - num_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor.dtype] - byte_data = bytestream.read(num_bytes) - np_dtype = np.dtype(torch_dtype_to_numpy_dict[tensor.dtype]) - return create_tensor(byte_data, np_dtype, shape) +def read_tensor_from_bytes(tensor: torch.Tensor, buffer: bytes) -> torch.Tensor: + """ + Reconstruct a tensor from bytes. + Args: + tensor: the template for the reconstructed tensor. + buffer: the serialized tensor information. -def create_tensor(buffer: bytes, dtype: np.dtype, shape: torch.Size) -> torch.Tensor: - dtype = dtype.newbyteorder("<") - np_array = np.frombuffer(buffer, dtype=dtype) + Returns: + Tensor: the reconstructed tensor. 
+ """ + np_dtype = np.dtype(torch_dtype_to_numpy_dict[tensor.dtype]) + np_dtype = np_dtype.newbyteorder("<") + np_array = np.frombuffer(buffer, dtype=np_dtype) array_tensor = torch.tensor(np.array(np_array)) - return torch.reshape(array_tensor, shape) + return torch.reshape(array_tensor, tensor.shape) diff --git a/modyn/model_storage/internal/utils/model_storage_strategy.py b/modyn/model_storage/internal/utils/model_storage_policy.py similarity index 82% rename from modyn/model_storage/internal/utils/model_storage_strategy.py rename to modyn/model_storage/internal/utils/model_storage_policy.py index 077f11216..9220be4e2 100644 --- a/modyn/model_storage/internal/utils/model_storage_strategy.py +++ b/modyn/model_storage/internal/utils/model_storage_policy.py @@ -1,5 +1,6 @@ import json import logging +import pathlib from typing import Optional, Union from modyn.model_storage.internal.storage_strategies.full_model_strategies import AbstractFullModelStrategy @@ -14,18 +15,20 @@ INCREMENTAL_MODEL_STRATEGY_MODULE = "modyn.model_storage.internal.storage_strategies.incremental_model_strategies" -class ModelStorageStrategy: +class ModelStoragePolicy: """ - Class used to represent the model storage strategy. It loads the specified strategies. + Class used to represent the model storage policy. It loads the specified strategies. 
""" def __init__( self, + zipping_dir: pathlib.Path, full_model_strategy_name: str, full_model_strategy_zip: Optional[bool], full_model_strategy_zip_algorithm: Optional[str], full_model_strategy_config: Optional[str], ) -> None: + self.zipping_dir = zipping_dir self.full_model_strategy: AbstractFullModelStrategy = self._setup_model_storage_strategy( full_model_strategy_name, full_model_strategy_zip, @@ -56,13 +59,19 @@ def _validate_full_model_interval(self, full_model_interval: int) -> None: raise ValueError("Full model interval should be positive.") self.full_model_interval = full_model_interval - @staticmethod def _setup_model_storage_strategy( - name: str, zip_enabled: Optional[bool], zip_algorithm: Optional[str], config: Optional[str], module_name: str + self, + name: str, + zip_enabled: Optional[bool], + zip_algorithm: Optional[str], + config: Optional[str], + module_name: str, ) -> Union[AbstractFullModelStrategy, AbstractIncrementalModelStrategy]: model_storage_module = dynamic_module_import(module_name) if not hasattr(model_storage_module, name): raise NotImplementedError(f"Strategy {name} not implemented!") model_storage_strategy_handler = getattr(model_storage_module, name) strategy_config = json.loads(config) if config else {} - return model_storage_strategy_handler(zip_enabled or False, zip_algorithm or "ZIP_DEFLATED", strategy_config) + return model_storage_strategy_handler( + self.zipping_dir, zip_enabled or False, zip_algorithm or "ZIP_DEFLATED", strategy_config + ) diff --git a/modyn/model_storage/model_storage.py b/modyn/model_storage/model_storage.py index 3b918a3af..737003829 100644 --- a/modyn/model_storage/model_storage.py +++ b/modyn/model_storage/model_storage.py @@ -1,27 +1,41 @@ import os import pathlib import shutil -import tempfile from modyn.common.ftp.ftp_server import FTPServer from modyn.model_storage.internal.grpc.grpc_server import GRPCServer +from modyn.utils import is_directory_writable class ModelStorage: def __init__(self, 
config: dict) -> None: self.config = config - self._setup_model_storage_directories() + self._init_model_storage_directory() + self._setup_ftp_directory() - def _setup_model_storage_directories(self) -> None: - self.model_storage_directory = pathlib.Path(os.getcwd()) / "model_storage" - self.ftp_directory = pathlib.Path(tempfile.gettempdir()) / "ftp_model_storage" + def _init_model_storage_directory(self) -> None: + self.model_storage_directory = pathlib.Path(self.config["model_storage"]["models_directory"]) - os.makedirs(self.model_storage_directory, exist_ok=True) + if not self.model_storage_directory.exists(): + raise ValueError( + f"The model storage directory {self.model_storage_directory} does not exist. \ + Please create the directory or mount another, existing directory." + ) + + if not is_directory_writable(self.model_storage_directory): + raise ValueError( + f"The model storage directory {self.model_storage_directory} is not writable. \ + Please check the directory permissions and try again.\n" + + f"Directory info: {os.stat(self.model_storage_directory)}" + ) + + def _setup_ftp_directory(self) -> None: + self.ftp_directory = pathlib.Path(os.getcwd()) / "ftp_model_storage" if self.ftp_directory.exists() and self.ftp_directory.is_dir(): shutil.rmtree(self.ftp_directory) - self.ftp_directory.mkdir() + self.ftp_directory.mkdir(exist_ok=False) def run(self) -> None: with GRPCServer(self.config, self.model_storage_directory, self.ftp_directory) as server: diff --git a/modyn/protos/selector.proto b/modyn/protos/selector.proto index b6e7a6a00..7f5658b09 100644 --- a/modyn/protos/selector.proto +++ b/modyn/protos/selector.proto @@ -33,7 +33,7 @@ message StrategyConfig { optional JsonString config = 4; } -message ModelStorageStrategyInfo { +message ModelStoragePolicyInfo { StrategyConfig full_model_strategy_config = 1; optional StrategyConfig incremental_model_strategy_config = 2; optional int32 full_model_interval = 3; @@ -56,10 +56,10 @@ message TriggerResponse 
{ message RegisterPipelineRequest { int32 num_workers = 1; JsonString selection_strategy = 2; - string model_id = 3; + string model_class_name = 3; JsonString model_configuration = 4; bool amp = 5; - ModelStorageStrategyInfo model_storage_strategy = 6; + ModelStoragePolicyInfo model_storage_policy = 6; } message PipelineResponse { int32 pipeline_id = 1; } diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.py b/modyn/selector/internal/grpc/generated/selector_pb2.py index 8091fd68e..9dbf92ebe 100644 --- a/modyn/selector/internal/grpc/generated/selector_pb2.py +++ b/modyn/selector/internal/grpc/generated/selector_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"%\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\"`\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 \x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 
\x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 \x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xe9\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12=\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x0f.selector.Empty\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a 
.selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eselector.proto\x12\x08selector\"\x07\n\x05\x45mpty\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x9c\x01\n\x0eStrategyConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x03zip\x18\x02 \x01(\x08H\x00\x88\x01\x01\x12\x1a\n\rzip_algorithm\x18\x03 \x01(\tH\x01\x88\x01\x01\x12)\n\x06\x63onfig\x18\x04 \x01(\x0b\x32\x14.selector.JsonStringH\x02\x88\x01\x01\x42\x06\n\x04_zipB\x10\n\x0e_zip_algorithmB\t\n\x07_config\"\x80\x02\n\x16ModelStoragePolicyInfo\x12<\n\x1a\x66ull_model_strategy_config\x18\x01 \x01(\x0b\x32\x18.selector.StrategyConfig\x12H\n!incremental_model_strategy_config\x18\x02 \x01(\x0b\x32\x18.selector.StrategyConfigH\x00\x88\x01\x01\x12 \n\x13\x66ull_model_interval\x18\x03 \x01(\x05H\x01\x88\x01\x01\x42$\n\"_incremental_model_strategy_configB\x16\n\x14_full_model_interval\"Z\n\x11\x44\x61taInformRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x0c\n\x04keys\x18\x02 \x03(\x03\x12\x12\n\ntimestamps\x18\x03 \x03(\x03\x12\x0e\n\x06labels\x18\x04 \x03(\x03\"7\n\x12\x44\x61taInformResponse\x12!\n\x03log\x18\x01 \x01(\x0b\x32\x14.selector.JsonString\"H\n\x0fTriggerResponse\x12\x12\n\ntrigger_id\x18\x01 \x01(\x05\x12!\n\x03log\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\"\xfa\x01\n\x17RegisterPipelineRequest\x12\x13\n\x0bnum_workers\x18\x01 
\x01(\x05\x12\x30\n\x12selection_strategy\x18\x02 \x01(\x0b\x32\x14.selector.JsonString\x12\x18\n\x10model_class_name\x18\x03 \x01(\t\x12\x31\n\x13model_configuration\x18\x04 \x01(\x0b\x32\x14.selector.JsonString\x12\x0b\n\x03\x61mp\x18\x05 \x01(\x08\x12>\n\x14model_storage_policy\x18\x06 \x01(\x0b\x32 .selector.ModelStoragePolicyInfo\"\'\n\x10PipelineResponse\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"e\n\x11GetSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x14\n\x0cpartition_id\x18\x03 \x01(\x05\x12\x11\n\tworker_id\x18\x04 \x01(\x05\"T\n\x0fSamplesResponse\x12\x1f\n\x17training_samples_subset\x18\x01 \x03(\x03\x12 \n\x18training_samples_weights\x18\x02 \x03(\x02\"D\n\x19GetNumberOfSamplesRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\".\n\x17NumberOfSamplesResponse\x12\x13\n\x0bnum_samples\x18\x01 \x01(\x05\"/\n\x18GetStatusBarScaleRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"2\n\x16StatusBarScaleResponse\x12\x18\n\x10status_bar_scale\x18\x01 \x01(\x05\"G\n\x1cGetNumberOfPartitionsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\"4\n\x1aNumberOfPartitionsResponse\x12\x16\n\x0enum_partitions\x18\x01 \x01(\x05\"0\n\x19GetAvailableLabelsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"3\n\x17\x41vailableLabelsResponse\x12\x18\n\x10\x61vailable_labels\x18\x01 \x03(\x03\"2\n\x1bGetSelectionStrategyRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"\x82\x01\n\x19SelectionStrategyResponse\x12\x1c\n\x14\x64ownsampling_enabled\x18\x01 \x01(\x08\x12\x15\n\rstrategy_name\x18\x02 \x01(\t\x12\x30\n\x12\x64ownsampler_config\x18\x03 \x01(\x0b\x32\x14.selector.JsonString\")\n\x12UsesWeightsRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\"+\n\x13UsesWeightsResponse\x12\x14\n\x0cuses_weights\x18\x01 \x01(\x08\"#\n\x13SeedSelectorRequest\x12\x0c\n\x04seed\x18\x01 
\x01(\x05\"\'\n\x14SeedSelectorResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x32\xf6\x07\n\x08Selector\x12T\n\x11register_pipeline\x12!.selector.RegisterPipelineRequest\x1a\x1a.selector.PipelineResponse\"\x00\x12Y\n\x1bget_sample_keys_and_weights\x12\x1b.selector.GetSamplesRequest\x1a\x19.selector.SamplesResponse\"\x00\x30\x01\x12J\n\x0binform_data\x12\x1b.selector.DataInformRequest\x1a\x1c.selector.DataInformResponse\"\x00\x12S\n\x17inform_data_and_trigger\x12\x1b.selector.DataInformRequest\x1a\x19.selector.TriggerResponse\"\x00\x12\x61\n\x15get_number_of_samples\x12#.selector.GetNumberOfSamplesRequest\x1a!.selector.NumberOfSamplesResponse\"\x00\x12^\n\x14get_status_bar_scale\x12\".selector.GetStatusBarScaleRequest\x1a .selector.StatusBarScaleResponse\"\x00\x12j\n\x18get_number_of_partitions\x12&.selector.GetNumberOfPartitionsRequest\x1a$.selector.NumberOfPartitionsResponse\"\x00\x12`\n\x14get_available_labels\x12#.selector.GetAvailableLabelsRequest\x1a!.selector.AvailableLabelsResponse\"\x00\x12\x66\n\x16get_selection_strategy\x12%.selector.GetSelectionStrategyRequest\x1a#.selector.SelectionStrategyResponse\"\x00\x12P\n\rseed_selector\x12\x1d.selector.SeedSelectorRequest\x1a\x1e.selector.SeedSelectorResponse\"\x00\x12M\n\x0cuses_weights\x12\x1c.selector.UsesWeightsRequest\x1a\x1d.selector.UsesWeightsResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'selector_pb2', globals()) @@ -25,46 +25,52 @@ _EMPTY._serialized_end=35 _JSONSTRING._serialized_start=37 _JSONSTRING._serialized_end=64 - _DATAINFORMREQUEST._serialized_start=66 - _DATAINFORMREQUEST._serialized_end=156 - _TRIGGERRESPONSE._serialized_start=158 - _TRIGGERRESPONSE._serialized_end=195 - _REGISTERPIPELINEREQUEST._serialized_start=197 - _REGISTERPIPELINEREQUEST._serialized_end=293 - _PIPELINERESPONSE._serialized_start=295 - _PIPELINERESPONSE._serialized_end=334 - _GETSAMPLESREQUEST._serialized_start=336 - 
_GETSAMPLESREQUEST._serialized_end=437 - _SAMPLESRESPONSE._serialized_start=439 - _SAMPLESRESPONSE._serialized_end=523 - _GETNUMBEROFSAMPLESREQUEST._serialized_start=525 - _GETNUMBEROFSAMPLESREQUEST._serialized_end=593 - _NUMBEROFSAMPLESRESPONSE._serialized_start=595 - _NUMBEROFSAMPLESRESPONSE._serialized_end=641 - _GETSTATUSBARSCALEREQUEST._serialized_start=643 - _GETSTATUSBARSCALEREQUEST._serialized_end=690 - _STATUSBARSCALERESPONSE._serialized_start=692 - _STATUSBARSCALERESPONSE._serialized_end=742 - _GETNUMBEROFPARTITIONSREQUEST._serialized_start=744 - _GETNUMBEROFPARTITIONSREQUEST._serialized_end=815 - _NUMBEROFPARTITIONSRESPONSE._serialized_start=817 - _NUMBEROFPARTITIONSRESPONSE._serialized_end=869 - _GETAVAILABLELABELSREQUEST._serialized_start=871 - _GETAVAILABLELABELSREQUEST._serialized_end=919 - _AVAILABLELABELSRESPONSE._serialized_start=921 - _AVAILABLELABELSRESPONSE._serialized_end=972 - _GETSELECTIONSTRATEGYREQUEST._serialized_start=974 - _GETSELECTIONSTRATEGYREQUEST._serialized_end=1024 - _SELECTIONSTRATEGYRESPONSE._serialized_start=1027 - _SELECTIONSTRATEGYRESPONSE._serialized_end=1157 - _USESWEIGHTSREQUEST._serialized_start=1159 - _USESWEIGHTSREQUEST._serialized_end=1200 - _USESWEIGHTSRESPONSE._serialized_start=1202 - _USESWEIGHTSRESPONSE._serialized_end=1245 - _SEEDSELECTORREQUEST._serialized_start=1247 - _SEEDSELECTORREQUEST._serialized_end=1282 - _SEEDSELECTORRESPONSE._serialized_start=1284 - _SEEDSELECTORRESPONSE._serialized_end=1323 - _SELECTOR._serialized_start=1326 - _SELECTOR._serialized_end=2327 + _STRATEGYCONFIG._serialized_start=67 + _STRATEGYCONFIG._serialized_end=223 + _MODELSTORAGEPOLICYINFO._serialized_start=226 + _MODELSTORAGEPOLICYINFO._serialized_end=482 + _DATAINFORMREQUEST._serialized_start=484 + _DATAINFORMREQUEST._serialized_end=574 + _DATAINFORMRESPONSE._serialized_start=576 + _DATAINFORMRESPONSE._serialized_end=631 + _TRIGGERRESPONSE._serialized_start=633 + _TRIGGERRESPONSE._serialized_end=705 + 
_REGISTERPIPELINEREQUEST._serialized_start=708 + _REGISTERPIPELINEREQUEST._serialized_end=958 + _PIPELINERESPONSE._serialized_start=960 + _PIPELINERESPONSE._serialized_end=999 + _GETSAMPLESREQUEST._serialized_start=1001 + _GETSAMPLESREQUEST._serialized_end=1102 + _SAMPLESRESPONSE._serialized_start=1104 + _SAMPLESRESPONSE._serialized_end=1188 + _GETNUMBEROFSAMPLESREQUEST._serialized_start=1190 + _GETNUMBEROFSAMPLESREQUEST._serialized_end=1258 + _NUMBEROFSAMPLESRESPONSE._serialized_start=1260 + _NUMBEROFSAMPLESRESPONSE._serialized_end=1306 + _GETSTATUSBARSCALEREQUEST._serialized_start=1308 + _GETSTATUSBARSCALEREQUEST._serialized_end=1355 + _STATUSBARSCALERESPONSE._serialized_start=1357 + _STATUSBARSCALERESPONSE._serialized_end=1407 + _GETNUMBEROFPARTITIONSREQUEST._serialized_start=1409 + _GETNUMBEROFPARTITIONSREQUEST._serialized_end=1480 + _NUMBEROFPARTITIONSRESPONSE._serialized_start=1482 + _NUMBEROFPARTITIONSRESPONSE._serialized_end=1534 + _GETAVAILABLELABELSREQUEST._serialized_start=1536 + _GETAVAILABLELABELSREQUEST._serialized_end=1584 + _AVAILABLELABELSRESPONSE._serialized_start=1586 + _AVAILABLELABELSRESPONSE._serialized_end=1637 + _GETSELECTIONSTRATEGYREQUEST._serialized_start=1639 + _GETSELECTIONSTRATEGYREQUEST._serialized_end=1689 + _SELECTIONSTRATEGYRESPONSE._serialized_start=1692 + _SELECTIONSTRATEGYRESPONSE._serialized_end=1822 + _USESWEIGHTSREQUEST._serialized_start=1824 + _USESWEIGHTSREQUEST._serialized_end=1865 + _USESWEIGHTSRESPONSE._serialized_start=1867 + _USESWEIGHTSRESPONSE._serialized_end=1910 + _SEEDSELECTORREQUEST._serialized_start=1912 + _SEEDSELECTORREQUEST._serialized_end=1947 + _SEEDSELECTORRESPONSE._serialized_start=1949 + _SEEDSELECTORRESPONSE._serialized_end=1988 + _SELECTOR._serialized_start=1991 + _SELECTOR._serialized_end=3005 # @@protoc_insertion_point(module_scope) diff --git a/modyn/selector/internal/grpc/generated/selector_pb2.pyi b/modyn/selector/internal/grpc/generated/selector_pb2.pyi index edc890338..baf170338 100644 --- 
a/modyn/selector/internal/grpc/generated/selector_pb2.pyi +++ b/modyn/selector/internal/grpc/generated/selector_pb2.pyi @@ -75,7 +75,7 @@ class StrategyConfig(google.protobuf.message.Message): global___StrategyConfig = StrategyConfig @typing_extensions.final -class ModelStorageStrategyInfo(google.protobuf.message.Message): +class ModelStoragePolicyInfo(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor FULL_MODEL_STRATEGY_CONFIG_FIELD_NUMBER: builtins.int @@ -100,7 +100,7 @@ class ModelStorageStrategyInfo(google.protobuf.message.Message): @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_incremental_model_strategy_config", b"_incremental_model_strategy_config"]) -> typing_extensions.Literal["incremental_model_strategy_config"] | None: ... -global___ModelStorageStrategyInfo = ModelStorageStrategyInfo +global___ModelStoragePolicyInfo = ModelStoragePolicyInfo @typing_extensions.final class DataInformRequest(google.protobuf.message.Message): @@ -172,31 +172,31 @@ class RegisterPipelineRequest(google.protobuf.message.Message): NUM_WORKERS_FIELD_NUMBER: builtins.int SELECTION_STRATEGY_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: builtins.int + MODEL_CLASS_NAME_FIELD_NUMBER: builtins.int MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int AMP_FIELD_NUMBER: builtins.int - MODEL_STORAGE_STRATEGY_FIELD_NUMBER: builtins.int + MODEL_STORAGE_POLICY_FIELD_NUMBER: builtins.int num_workers: builtins.int @property def selection_strategy(self) -> global___JsonString: ... - model_id: builtins.str + model_class_name: builtins.str @property def model_configuration(self) -> global___JsonString: ... amp: builtins.bool @property - def model_storage_strategy(self) -> global___ModelStorageStrategyInfo: ... + def model_storage_policy(self) -> global___ModelStoragePolicyInfo: ... 
def __init__( self, *, num_workers: builtins.int = ..., selection_strategy: global___JsonString | None = ..., - model_id: builtins.str = ..., + model_class_name: builtins.str = ..., model_configuration: global___JsonString | None = ..., amp: builtins.bool = ..., - model_storage_strategy: global___ModelStorageStrategyInfo | None = ..., + model_storage_policy: global___ModelStoragePolicyInfo | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "model_storage_strategy", b"model_storage_strategy", "selection_strategy", b"selection_strategy"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["amp", b"amp", "model_configuration", b"model_configuration", "model_id", b"model_id", "model_storage_strategy", b"model_storage_strategy", "num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["model_configuration", b"model_configuration", "model_storage_policy", b"model_storage_policy", "selection_strategy", b"selection_strategy"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["amp", b"amp", "model_class_name", b"model_class_name", "model_configuration", b"model_configuration", "model_storage_policy", b"model_storage_policy", "num_workers", b"num_workers", "selection_strategy", b"selection_strategy"]) -> None: ... 
global___RegisterPipelineRequest = RegisterPipelineRequest diff --git a/modyn/selector/internal/grpc/selector_grpc_servicer.py b/modyn/selector/internal/grpc/selector_grpc_servicer.py index f67fedef3..f7b2b7904 100644 --- a/modyn/selector/internal/grpc/selector_grpc_servicer.py +++ b/modyn/selector/internal/grpc/selector_grpc_servicer.py @@ -5,7 +5,6 @@ import grpc # pylint: disable=no-name-in-module -from modyn.selector.internal.grpc.generated.selector_pb2 import JsonString # noqa: E402, E501 from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.grpc.generated.selector_pb2 import ( AvailableLabelsResponse, @@ -17,6 +16,7 @@ GetSamplesRequest, GetSelectionStrategyRequest, GetStatusBarScaleRequest, + JsonString, NumberOfPartitionsResponse, NumberOfSamplesResponse, PipelineResponse, @@ -51,29 +51,29 @@ def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.Serv logger.info(f"Registering pipeline with request - {str(request)}") full_model_strategy = self.get_model_storage_strategy_config( - request.model_storage_strategy.full_model_strategy_config + request.model_storage_policy.full_model_strategy_config ) incremental_model_strategy: Optional[ModelStorageStrategyConfig] = None if ( - request.model_storage_strategy.HasField("incremental_model_strategy_config") - and request.model_storage_strategy.incremental_model_strategy_config is not None + request.model_storage_policy.HasField("incremental_model_strategy_config") + and request.model_storage_policy.incremental_model_strategy_config is not None ): incremental_model_strategy = self.get_model_storage_strategy_config( - request.model_storage_strategy.incremental_model_strategy_config + request.model_storage_policy.incremental_model_strategy_config ) full_model_interval: Optional[int] = None if ( - request.model_storage_strategy.HasField("full_model_interval") + request.model_storage_policy.HasField("full_model_interval") and 
request.model_storage_strategy.full_model_interval is not None ): - full_model_interval = request.model_storage_strategy.full_model_interval + full_model_interval = request.model_storage_policy.full_model_interval pipeline_id = self.selector_manager.register_pipeline( request.num_workers, request.selection_strategy.value, - request.model_id, + request.model_class_name, request.model_configuration.value, request.amp, full_model_strategy, diff --git a/modyn/selector/internal/selector_manager.py b/modyn/selector/internal/selector_manager.py index 608797bb0..4ae1ef194 100644 --- a/modyn/selector/internal/selector_manager.py +++ b/modyn/selector/internal/selector_manager.py @@ -63,7 +63,7 @@ def register_pipeline( self, num_workers: int, selection_strategy: str, - model_id: str, + model_class_name: str, model_config: str, amp: bool, full_model_strategy: ModelStorageStrategyConfig, @@ -84,7 +84,7 @@ def register_pipeline( with MetadataDatabaseConnection(self._modyn_config) as database: pipeline_id = database.register_pipeline( num_workers, - model_id, + model_class_name, model_config, amp, full_model_strategy, diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index 175f73cfd..78d34886e 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -29,7 +29,7 @@ ) from modyn.selector.internal.grpc.generated.selector_pb2 import JsonString as SelectorJsonString from modyn.selector.internal.grpc.generated.selector_pb2 import ( - ModelStorageStrategyInfo, + ModelStoragePolicyInfo, NumberOfSamplesResponse, RegisterPipelineRequest, SeedSelectorRequest, @@ -209,8 +209,13 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: model_storage_config = pipeline_config["model_storage"] incremental_model_strategy: Optional[StrategyConfig] = None + full_model_interval: Optional[int] = None if "incremental_model_strategy" in model_storage_config: - incremental_model_strategy = 
self.get_model_strategy(model_storage_config["incremental_model_strategy"]) + incremental_strategy = model_storage_config["incremental_model_strategy"] + incremental_model_strategy = self.get_model_strategy(incremental_strategy) + full_model_interval = ( + incremental_strategy["full_model_interval"] if "full_model_interval" in incremental_strategy else None + ) pipeline_id = self.selector.register_pipeline( RegisterPipelineRequest( @@ -218,15 +223,13 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: selection_strategy=SelectorJsonString( value=json.dumps(pipeline_config["training"]["selection_strategy"]) ), - model_id=pipeline_config["model"]["id"], + model_class_name=pipeline_config["model"]["id"], model_configuration=SelectorJsonString(value=model_config), amp=pipeline_config["training"]["amp"] if "amp" in pipeline_config["training"] else False, - model_storage_strategy=ModelStorageStrategyInfo( + model_storage_policy=ModelStoragePolicyInfo( full_model_strategy_config=self.get_model_strategy(model_storage_config["full_model_strategy"]), incremental_model_strategy_config=incremental_model_strategy, - full_model_interval=model_storage_config["full_model_interval"] - if "full_model_interval" in model_storage_config - else None, + full_model_interval=full_model_interval ), ) ).pipeline_id diff --git a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py index 4472c6fe6..768d57b35 100644 --- a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py +++ b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py @@ -63,8 +63,8 @@ def setup(): incremental_model_strategy=None, full_model_interval=None, ) - database.add_trained_model(1, 10, "trained_model.modyn") - database.add_trained_model(1, 11, "trained_model2.modyn") + database.add_trained_model(1, 10, "trained_model.modyn", "trained_model.metadata") + database.add_trained_model(1, 11, 
"trained_model2.modyn", "trained_model.metadata") def teardown(): diff --git a/modyn/tests/metadata_database/models/test_pipelines.py b/modyn/tests/metadata_database/models/test_pipelines.py index ada36347e..f9ad74152 100644 --- a/modyn/tests/metadata_database/models/test_pipelines.py +++ b/modyn/tests/metadata_database/models/test_pipelines.py @@ -23,7 +23,7 @@ def session(): def test_add_pipeline(session): pipeline = Pipeline( num_workers=10, - model_id="ResNet18", + model_class_name="ResNet18", model_config=json.dumps({"num_classes": 10}), amp=True, full_model_strategy_name="PyTorchFullModel", @@ -35,18 +35,22 @@ def test_add_pipeline(session): assert extracted_pipeline is not None assert extracted_pipeline.num_workers == 10 - assert extracted_pipeline.model_id == "ResNet18" + assert extracted_pipeline.model_class_name == "ResNet18" assert json.loads(extracted_pipeline.model_config)["num_classes"] == 10 assert extracted_pipeline.amp assert extracted_pipeline.full_model_strategy_name == "PyTorchFullModel" - assert extracted_pipeline.full_model_strategy_zip is None + assert not extracted_pipeline.full_model_strategy_zip assert extracted_pipeline.inc_model_strategy_name is None assert extracted_pipeline.full_model_strategy_config is None def test_update_pipeline(session): pipeline = Pipeline( - num_workers=10, model_id="ResNet18", model_config="{}", amp=True, full_model_strategy_name="PyTorchFullModel" + num_workers=10, + model_class_name="ResNet18", + model_config="{}", + amp=True, + full_model_strategy_name="PyTorchFullModel", ) session.add(pipeline) session.commit() @@ -59,15 +63,19 @@ def test_update_pipeline(session): assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 20 assert not session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().amp - pipeline.model_id = "test_model" + pipeline.model_class_name = "test_model" session.commit() - assert session.query(Pipeline).filter(Pipeline.pipeline_id == 
1).first().model_id == "test_model" + assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().model_class_name == "test_model" def test_delete_pipeline(session): pipeline = Pipeline( - num_workers=10, model_id="ResNet18", model_config="{}", amp=False, full_model_strategy_name="PyTorchFullModel" + num_workers=10, + model_class_name="ResNet18", + model_config="{}", + amp=False, + full_model_strategy_name="PyTorchFullModel", ) session.add(pipeline) session.commit() diff --git a/modyn/tests/metadata_database/models/test_trained_models.py b/modyn/tests/metadata_database/models/test_trained_models.py index 979485e9c..b34465b98 100644 --- a/modyn/tests/metadata_database/models/test_trained_models.py +++ b/modyn/tests/metadata_database/models/test_trained_models.py @@ -20,7 +20,7 @@ def session(): def test_add_trained_model(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() @@ -29,7 +29,7 @@ def test_add_trained_model(session): def test_get_trained_model(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() @@ -38,11 +38,12 @@ def test_get_trained_model(session): assert fetched_valid.model_id == 1 assert fetched_valid.model_path == "test_path" + assert fetched_valid.metadata_path == "metadata" assert fetched_invalid is None def test_delete_trained_model(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() @@ -50,6 +51,7 @@ def test_delete_trained_model(session): assert fetched.model_id == 1 assert fetched.model_path == "test_path" + assert 
fetched.metadata_path == "metadata" session.query(TrainedModel).filter(TrainedModel.model_id == 1).delete(synchronize_session="fetch") session.commit() @@ -60,7 +62,7 @@ def test_delete_trained_model(session): def test_string_repr(session): - model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path") + model = TrainedModel(pipeline_id=1, trigger_id=1, model_path="test_path", metadata_path="metadata") session.add(model) session.commit() diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py b/modyn/tests/metadata_database/test_metadata_database_connection.py index 1e62e9859..4d95dfef2 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -51,16 +51,19 @@ def test_add_trained_model(): assert pipeline_id == 1 and trigger_id == 5 - model_id = database.add_trained_model(pipeline_id, trigger_id, "test_path.modyn") + model_id = database.add_trained_model(pipeline_id, trigger_id, "test_path.modyn", "test_path.metadata") model_parent: TrainedModel = database.session.get(TrainedModel, model_id) assert model_parent.model_id == 1 assert model_parent.model_path == "test_path.modyn" + assert model_parent.metadata_path == "test_path.metadata" assert model_parent.pipeline_id == 1 and model_parent.trigger_id == 5 assert model_parent.parent_model is None - model_id = database.add_trained_model(pipeline_id, 6, "test_path.modyn", parent_model=model_parent.model_id) + model_id = database.add_trained_model( + pipeline_id, 6, "test_path.modyn", "test_path.metadata", parent_model=model_parent.model_id + ) model_child: TrainedModel = database.session.get(TrainedModel, model_id) assert model_child.parent_model == model_parent.model_id diff --git a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py 
b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py index 4bc58d961..c635a6f2c 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py +++ b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_sub_difference_operator.py @@ -1,5 +1,3 @@ -import io - import torch from modyn.model_storage.internal.storage_strategies import AbstractDifferenceOperator from modyn.model_storage.internal.storage_strategies.difference_operators import SubDifferenceOperator @@ -23,14 +21,6 @@ def test_calculate_restore(): difference_operator = SubDifferenceOperator() ones = torch.ones(1, dtype=torch.int32) - buf = io.BytesIO() - buf.write(b"\x00\x00\x00\x00") - buf.seek(0) - - assert difference_operator.restore(ones, buf).item() == 1 - - buf.seek(0) - buf.write(b"\x01\x00\x00\x00") - buf.seek(0) - assert difference_operator.restore(ones, buf).item() == 2 + assert difference_operator.restore(ones, b"\x00\x00\x00\x00").item() == 1 + assert difference_operator.restore(ones, b"\x01\x00\x00\x00").item() == 2 diff --git a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py index b949e72b1..b36f732c6 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py +++ b/modyn/tests/model_storage/internal/storage_strategies/difference_operators/test_xor_difference_operator.py @@ -1,5 +1,3 @@ -import io - import torch from modyn.model_storage.internal.storage_strategies import AbstractDifferenceOperator from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator @@ -23,14 +21,6 @@ def test_calculate_restore(): difference_operator = XorDifferenceOperator() ones = torch.ones(1, dtype=torch.int32) - 
buf = io.BytesIO() - buf.write(b"\x00\x00\x00\x00") - buf.seek(0) - - assert difference_operator.restore(ones, buf).item() == 1 - - buf.seek(0) - buf.write(b"\x03\x00\x00\x00") - buf.seek(0) - assert difference_operator.restore(ones, buf).item() == 2 + assert difference_operator.restore(ones, b"\x00\x00\x00\x00").item() == 1 + assert difference_operator.restore(ones, b"\x03\x00\x00\x00").item() == 2 diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py index 44acc8fad..6003f0dcb 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py @@ -14,13 +14,15 @@ def forward(self, data): return data -def test_save_model(): +def test_store_model(): model = MockModel() - full_model_strategy = CompressedFullModel(zip_activated=False, zip_algorithm_name="", config={}) + full_model_strategy = CompressedFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - full_model_strategy.save_model(model.state_dict(), temp_file_path) + full_model_strategy.store_model(model.state_dict(), temp_file_path) with open(temp_file_path, "rb") as stored_model_file: assert stored_model_file.read() == b"\x00\x00\x80\x3f\x00\x00\x80\x3f" @@ -28,7 +30,9 @@ def test_save_model(): def test_load_model(): model = MockModel() - full_model_strategy = CompressedFullModel(zip_activated=False, zip_algorithm_name="", config={}) + full_model_strategy = CompressedFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path 
= pathlib.Path(temporary_file.name) diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py index e7cd0ba06..ec45bac99 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py @@ -7,25 +7,29 @@ from modyn.utils import unzip_file, zip_file -def test_save_model(): - full_model_strategy = PyTorchFullModel(zip_activated=False, zip_algorithm_name="", config={}) +def test_store_model(): + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - full_model_strategy.save_model({"conv_1": True}, temp_file_path) + full_model_strategy.store_model({"conv_1": True}, temp_file_path) loaded_state = torch.load(temp_file_path) assert loaded_state["conv_1"] -def test_save_model_zipped(): - full_model_strategy = PyTorchFullModel(zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={}) +def test_store_model_zipped(): + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path(), zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={} + ) with tempfile.TemporaryDirectory() as temp_directory: directory_path = pathlib.Path(temp_directory) zipped_file_path = directory_path / "zipped.model" - full_model_strategy.save_model({"conv_1": True}, zipped_file_path) + full_model_strategy.store_model({"conv_1": True}, zipped_file_path) unzipped_file_path = pathlib.Path(directory_path / "unzipped.model") unzip_file(zipped_file_path, unzipped_file_path, compression=ZIP_DEFLATED) @@ -35,7 +39,9 @@ def test_save_model_zipped(): def test_load_model(): - full_model_strategy = 
PyTorchFullModel(zip_activated=False, zip_algorithm_name="", config={}) + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) @@ -48,9 +54,11 @@ def test_load_model(): def test_load_model_zipped(): - full_model_strategy = PyTorchFullModel(zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={}) with tempfile.TemporaryDirectory() as temp_directory: directory_path = pathlib.Path(temp_directory) + full_model_strategy = PyTorchFullModel( + zipping_dir=directory_path, zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={} + ) model_path = directory_path / "basic.model" torch.save({"conv_1": True}, model_path) @@ -64,12 +72,14 @@ def test_load_model_zipped(): def test_store_then_load(): - full_model_strategy = PyTorchFullModel(zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={}) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) + full_model_strategy = PyTorchFullModel( + zipping_dir=temp_file_path.parent, zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={} + ) model_state = {"conv_1": True} - full_model_strategy.save_model(model_state, temp_file_path) + full_model_strategy.store_model(model_state, temp_file_path) loaded_state = {"conv_1": False} full_model_strategy.load_model(loaded_state, temp_file_path) diff --git a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py index 192dd09c1..3eaebabc5 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py +++ 
b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py @@ -2,6 +2,7 @@ import tempfile from zipfile import ZIP_LZMA +import pytest import torch from modyn.model_storage.internal.storage_strategies.difference_operators import ( SubDifferenceOperator, @@ -19,6 +20,16 @@ def forward(self, data): return data +class MockComplexModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._bias = torch.nn.Parameter(torch.ones(2, dtype=torch.float16)) + self._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + def forward(self, data): + return data + + def get_mock_model_after() -> MockModel: model_after = MockModel() model_after._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) @@ -27,13 +38,18 @@ def get_mock_model_after() -> MockModel: def test_init(): - incremental_strategy = WeightsDifference(zip_activated=False, zip_algorithm_name="", config={}) + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} + ) assert isinstance(incremental_strategy.difference_operator, SubDifferenceOperator.__class__) assert not incremental_strategy.split_exponent incremental_strategy = WeightsDifference( - zip_activated=False, zip_algorithm_name="", config={"operator": "xor", "split_exponent": True} + zipping_dir=pathlib.Path(), + zip_activated=False, + zip_algorithm_name="", + config={"operator": "xor", "split_exponent": True}, ) assert not incremental_strategy.zip @@ -41,7 +57,10 @@ def test_init(): assert incremental_strategy.split_exponent incremental_strategy = WeightsDifference( - zip_activated=True, zip_algorithm_name="ZIP_LZMA", config={"operator": "sub", "split_exponent": False} + zipping_dir=pathlib.Path(), + zip_activated=True, + zip_algorithm_name="ZIP_LZMA", + config={"operator": "sub", "split_exponent": False}, ) assert incremental_strategy.zip @@ -50,19 +69,19 @@ def test_init(): assert not 
incremental_strategy.split_exponent -def test_save_model(): +def test_store_model(): model_before = MockModel() model_after = get_mock_model_after() for operator in ["xor", "sub"]: incremental_strategy = WeightsDifference( - zip_activated=False, zip_algorithm_name="", config={"operator": operator} + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={"operator": operator} ) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - incremental_strategy.save_model(model_after.state_dict(), model_before.state_dict(), temp_file_path) + incremental_strategy.store_model(model_after.state_dict(), model_before.state_dict(), temp_file_path) with open(temp_file_path, "rb") as stored_model_file: assert stored_model_file.read() == b"\x00\x00\x80\x3f\x00\x00\x80\x3f" @@ -79,9 +98,69 @@ def test_load_model(): model = MockModel() model_state = model.state_dict() incremental_strategy = WeightsDifference( - zip_activated=False, zip_algorithm_name="", config={"operator": operator} + zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={"operator": operator} ) incremental_strategy.load_model(model_state, temp_file_path) assert model_state["_weight"][0] == 1 # pylint: disable=unsubscriptable-object + + +def test_rle(): + assert WeightsDifference.rle_bytes(b"") == b"" + + encoded = WeightsDifference.rle_bytes(b"\x00\x00\x02\x01\x01\x01\x00") + assert encoded == b"\x02\x00\x01\x02\x03\x01\x01\x00" + + encoded = WeightsDifference.rle_bytes(512 * b"\x00" + b"\x01") + assert encoded == b"\xff\x00\xff\x00\x02\x00\x01\x01" + + +def test_inv_rle(): + assert WeightsDifference.inv_rle_bytes(b"") == b"" + + encoded = WeightsDifference.inv_rle_bytes(b"\x02\x00\x01\x02\x03\x01\x01\x00") + assert encoded == b"\x00\x00\x02\x01\x01\x01\x00" + + encoded = WeightsDifference.inv_rle_bytes(b"\xff\x00\xff\x00\x02\x00\x01\x01") + assert encoded == 512 * b"\x00" + b"\x01" + + with 
pytest.raises(AssertionError): + WeightsDifference.inv_rle_bytes(b"\x02\x00\x01") + + +def test_store_then_load_model(): + model_before = MockComplexModel() + before_state = model_before.state_dict() + model_after = MockComplexModel() + model_after._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) + + incremental_strategy = WeightsDifference( + zipping_dir=pathlib.Path(), + zip_activated=False, + zip_algorithm_name="", + config={"operator": "xor", "split_exponent": True, "rle": True}, + ) + + with tempfile.NamedTemporaryFile() as temporary_file: + temp_file_path = pathlib.Path(temporary_file.name) + + incremental_strategy.store_model(model_after.state_dict(), before_state, temp_file_path) + + with open(temp_file_path, "rb") as stored_model_file: + # we store 2 exponent bytes. + assert stored_model_file.read(8) == b"\x00\x00\x00\x00\x00\x00\x00\x02" + + # twice the xor difference between 2 and 1 in the exponent byte. + assert stored_model_file.read(2) == b"\x02\xff" + + # xor difference of the float16 tensors. + assert stored_model_file.read(4) == b"\x00\x00\x00\x00" + + # xor difference of the remaining float32 bytes. 
+ assert stored_model_file.read(8) == b"\x00\x00\x00\x00\x00\x00" + + incremental_strategy.load_model(before_state, temp_file_path) + + assert before_state["_bias"][0].item() == 1 # pylint: disable=unsubscriptable-object + assert before_state["_weight"][0].item() == 2 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/test_model_storage_manager.py b/modyn/tests/model_storage/internal/test_model_storage_manager.py index f5d01de2a..9fdf09eda 100644 --- a/modyn/tests/model_storage/internal/test_model_storage_manager.py +++ b/modyn/tests/model_storage/internal/test_model_storage_manager.py @@ -71,24 +71,24 @@ def get_mock_model_after() -> MockModel: def test_init(): - manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) assert manager._modyn_config == get_modyn_config() assert manager._storage_dir == pathlib.Path("storage") + assert manager._ftp_dir == pathlib.Path("ftp") -def test__get_previous_model(): +def test__get_parent_model(): with MetadataDatabaseConnection(get_modyn_config()) as database: - database.add_trained_model(10, 2, "model.modyn") + model_id = database.add_trained_model(10, 2, "model.modyn", "model.metadata") - manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) - previous_model = manager._get_previous_model(10, 3) - assert previous_model and previous_model.trigger_id == 2 - assert manager._get_previous_model(10, 2) is None + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) + assert manager._get_parent_model_id(10, 3) == model_id + assert manager._get_parent_model_id(10, 2) is None def test__get_base_model_state(): - manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) model_state = 
manager._get_base_model_state(1) assert len(model_state) == 122 @@ -97,43 +97,49 @@ def test__get_base_model_state(): def test__reconstruct_model(): mock_model = MockModel() model_state = mock_model.state_dict() - full_model_strategy = PyTorchFullModel(zip_activated=False, zip_algorithm_name="", config={}) - incremental_model_strategy = WeightsDifference(zip_activated=False, zip_algorithm_name="", config={}) + full_model_strategy = PyTorchFullModel( + zipping_dir=pathlib.Path("ftp"), zip_activated=False, zip_algorithm_name="", config={} + ) + incremental_model_strategy = WeightsDifference( + zipping_dir=pathlib.Path("ftp"), zip_activated=False, zip_algorithm_name="", config={} + ) with tempfile.TemporaryDirectory() as temp_dir: temp_directory_path = pathlib.Path(temp_dir) - manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, pathlib.Path("ftp")) prev_model_file_name = "before.model" - full_model_strategy.save_model(model_state, temp_directory_path / prev_model_file_name) + full_model_strategy.store_model(model_state, temp_directory_path / prev_model_file_name) difference_model_file_name = "difference.model" - incremental_model_strategy.save_model( + incremental_model_strategy.store_model( get_mock_model_after().state_dict(), model_state, temp_directory_path / difference_model_file_name ) with MetadataDatabaseConnection(get_modyn_config()) as database: - prev_model_id = database.add_trained_model(15, 3, prev_model_file_name) - curr_model_id = database.add_trained_model(15, 4, difference_model_file_name, parent_model=prev_model_id) + prev_model_id = database.add_trained_model(15, 3, prev_model_file_name, "model.metadata") + curr_model_id = database.add_trained_model( + 15, 4, difference_model_file_name, "model.metadata", parent_model=prev_model_id + ) - manager._reconstruct_model(curr_model_id, model_state, manager.get_model_storage_strategy(1)) + 
manager._reconstruct_model(curr_model_id, model_state, manager.get_model_storage_policy(1)) assert model_state["_weight"].item() == 3 # pylint: disable=unsubscriptable-object def test__handle_new_model_full(): with MetadataDatabaseConnection(get_modyn_config()) as database: - database.add_trained_model(1, 4, "model.modyn") + database.add_trained_model(1, 4, "model.modyn", "model.metadata") mock_model = MockModel() model_state = mock_model.state_dict() - manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - parent_id = manager._handle_new_model(1, 5, model_state, temp_file_path, manager.get_model_storage_strategy(1)) + parent_id = manager._handle_new_model(1, 5, model_state, temp_file_path, manager.get_model_storage_policy(1)) assert parent_id is None loaded_state = torch.load(temp_file_path) @@ -142,17 +148,17 @@ def test__handle_new_model_full(): @patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) @patch.object(ModelStorageManager, "_reconstruct_model") -@patch.object(ModelStorageManager, "_get_previous_model", return_value=TrainedModel(model_id=101)) +@patch.object(ModelStorageManager, "_get_parent_model_id", return_value=101) def test__handle_new_model_incremental( previous_model_mock, reconstruct_model_mock: MagicMock, base_model_state_mock: MagicMock ): - manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) parent_id = manager._handle_new_model( - 5, 4, get_mock_model_after().state_dict(), temp_file_path, manager.get_model_storage_strategy(1) + 5, 4, 
get_mock_model_after().state_dict(), temp_file_path, manager.get_model_storage_policy(1) ) assert parent_id == 101 @@ -164,7 +170,7 @@ def test__handle_new_model_incremental( previous_model_mock.assert_called_once_with(5, 4) -def test_get_model_storage_strategy(): +def test_get_model_storage_policy(): with MetadataDatabaseConnection(get_modyn_config()) as database: simple_pipeline = database.register_pipeline( 74, @@ -186,18 +192,18 @@ def test_get_model_storage_strategy(): 75, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy, inc_model_strategy, 10 ) - manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage")) + manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) - strategy = manager.get_model_storage_strategy(simple_pipeline) - assert strategy.incremental_model_strategy is None - assert strategy.full_model_interval is None - assert not strategy.full_model_strategy.zip + policy = manager.get_model_storage_policy(simple_pipeline) + assert policy.incremental_model_strategy is None + assert policy.full_model_interval is None + assert not policy.full_model_strategy.zip - complex_strategy = manager.get_model_storage_strategy(complex_pipeline) - assert complex_strategy.full_model_strategy.zip - assert complex_strategy.full_model_strategy.zip_algorithm == ZIP_DEFLATED - assert complex_strategy.incremental_model_strategy - assert not complex_strategy.incremental_model_strategy.zip + complex_policy = manager.get_model_storage_policy(complex_pipeline) + assert complex_policy.full_model_strategy.zip + assert complex_policy.full_model_strategy.zip_algorithm == ZIP_DEFLATED + assert complex_policy.incremental_model_strategy + assert not complex_policy.incremental_model_strategy.zip @patch("modyn.model_storage.internal.model_storage_manager.current_time_millis", return_value=100) @@ -205,15 +211,13 @@ def test_get_model_storage_strategy(): def test_store_model(base_model_mock, current_time_mock): 
with tempfile.TemporaryDirectory() as temp_dir: temp_directory_path = pathlib.Path(temp_dir) - manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) with MetadataDatabaseConnection(get_modyn_config()) as database: - parent_id = database.add_trained_model(1, 128, "before.model") + parent_id = database.add_trained_model(1, 128, "before.model", "before.metadata") - model_storage_strategy = manager.get_model_storage_strategy(1) - model_storage_strategy.full_model_strategy.save_model( - MockModel().state_dict(), temp_directory_path / "before.model" - ) + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model(MockModel().state_dict(), temp_directory_path / "before.model") torch.save( {"model": get_mock_model_after().state_dict(), "metadata": True}, temp_directory_path / "model.modyn" @@ -254,7 +258,7 @@ def test_store_model_resnet(): resnet = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) with tempfile.TemporaryDirectory() as temp_dir: temp_directory_path = pathlib.Path(temp_dir) - manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) torch.save({"model": resnet.model.state_dict(), "metadata": True}, temp_directory_path / "model.modyn") @@ -264,21 +268,22 @@ def test_store_model_resnet(): original_state = resnet.model.state_dict() for layer_name, _ in loaded_state["model"].items(): - assert torch.all(torch.eq(loaded_state["model"][layer_name], original_state[layer_name])) + original_layer = original_state[layer_name] # pylint: disable=unsubscriptable-object + assert torch.all(torch.eq(loaded_state["model"][layer_name], original_layer)) @patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) def test_load_model(base_model_mock: MagicMock): with 
tempfile.TemporaryDirectory() as temp_dir: temp_directory_path = pathlib.Path(temp_dir) - manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) model_file_name = "mock.model" with MetadataDatabaseConnection(get_modyn_config()) as database: - model_id = database.add_trained_model(1, 32, model_file_name) + model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata") - model_storage_strategy = manager.get_model_storage_strategy(1) - model_storage_strategy.full_model_strategy.save_model( + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model( get_mock_model_after().state_dict(), temp_directory_path / model_file_name ) @@ -291,14 +296,14 @@ def test_load_model(base_model_mock: MagicMock): def test_load_model_metadata(base_model_mock: MagicMock): with tempfile.TemporaryDirectory() as temp_dir: temp_directory_path = pathlib.Path(temp_dir) - manager = ModelStorageManager(get_modyn_config(), temp_directory_path) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) model_file_name = "mock.model" with MetadataDatabaseConnection(get_modyn_config()) as database: model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata.zip") - model_storage_strategy = manager.get_model_storage_strategy(1) - model_storage_strategy.full_model_strategy.save_model( + policy = manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model( get_mock_model_after().state_dict(), temp_directory_path / model_file_name ) torch.save({"metadata": True}, temp_directory_path / "mock.metadata") @@ -314,21 +319,10 @@ def test_load_model_metadata(base_model_mock: MagicMock): def test_load_model_invalid(base_model_mock: MagicMock): with tempfile.TemporaryDirectory() as temp_dir: temp_directory_path = pathlib.Path(temp_dir) - manager = ModelStorageManager(get_modyn_config(), 
temp_directory_path) + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) assert manager.load_model(133, False) is None - model_file_name = "mock.model" - with MetadataDatabaseConnection(get_modyn_config()) as database: - model_id = database.add_trained_model(1, 23, model_file_name) - - model_storage_strategy = manager.get_model_storage_strategy(1) - model_storage_strategy.full_model_strategy.save_model( - get_mock_model_after().state_dict(), temp_directory_path / model_file_name - ) - - assert manager.load_model(model_id, True) is None - @patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) def test_delete_model(base_model_mock: MagicMock): @@ -340,32 +334,28 @@ def test_delete_model(base_model_mock: MagicMock): temp_directory_path = pathlib.Path(temp_dir) with MetadataDatabaseConnection(get_modyn_config()) as database: - parent_id = database.add_trained_model(1, 52, "parent.modyn") - first_child_id = database.add_trained_model(1, 53, "child1.modyn", parent_model=parent_id) - second_child_id = database.add_trained_model(1, 54, "child2.modyn", parent_model=parent_id) - - manager = ModelStorageManager(get_modyn_config(), temp_directory_path) - model_storage_strategy = manager.get_model_storage_strategy(1) - model_storage_strategy.full_model_strategy.save_model(model_state, temp_directory_path / "parent.modyn") - model_storage_strategy.incremental_model_strategy.save_model( - model_state_after, model_state, temp_directory_path / "child1.modyn" - ) - model_storage_strategy.incremental_model_strategy.save_model( - model_state_after, model_state, temp_directory_path / "child2.modyn" + parent_id = database.add_trained_model(1, 52, "parent.modyn", "parent.metadata") + child_id = database.add_trained_model(1, 53, "child.modyn", "child.metadata", parent_model=parent_id) + + manager = ModelStorageManager(get_modyn_config(), temp_directory_path, temp_directory_path) + policy = 
manager.get_model_storage_policy(1) + policy.full_model_strategy.store_model(model_state, temp_directory_path / "parent.modyn") + torch.save({"metadata": True}, temp_directory_path / "parent.metadata") + policy.incremental_model_strategy.store_model( + model_state_after, model_state, temp_directory_path / "child.modyn" ) + torch.save({"metadata": True}, temp_directory_path / "child.metadata") success = manager.delete_model(parent_id) + assert not success + success = manager.delete_model(child_id) + assert success + assert not (temp_directory_path / "child.modyn").exists() + success = manager.delete_model(parent_id) assert success assert not (temp_directory_path / "parent.modyn").exists() with MetadataDatabaseConnection(get_modyn_config()) as database: - first_child: TrainedModel = database.session.get(TrainedModel, first_child_id) - second_child: TrainedModel = database.session.get(TrainedModel, second_child_id) - - assert first_child.parent_model is None - assert second_child.parent_model is None - - assert manager.load_model(first_child_id, False)["model"]["_weight"] == 3 - assert manager.load_model(second_child_id, False)["model"]["_weight"] == 3 - assert not manager.delete_model(-1) + assert not database.session.get(TrainedModel, child_id) + assert not database.session.get(TrainedModel, parent_id) diff --git a/modyn/tests/model_storage/internal/utils/test_data_types.py b/modyn/tests/model_storage/internal/utils/test_data_types.py index 6e05fe035..27234c9d2 100644 --- a/modyn/tests/model_storage/internal/utils/test_data_types.py +++ b/modyn/tests/model_storage/internal/utils/test_data_types.py @@ -1,8 +1,7 @@ import io -import numpy as np import torch -from modyn.model_storage.internal.utils import create_tensor, read_tensor_from_bytes +from modyn.model_storage.internal.utils import read_tensor_from_bytes def test_read_tensor_from_bytes(): @@ -12,13 +11,6 @@ def test_read_tensor_from_bytes(): buf.write(b"\x03\x00\x00\x00") buf.write(b"\x04\x00\x00\x00") 
buf.seek(0) - res = read_tensor_from_bytes(torch.ones((2, 2), dtype=torch.int32), buf) + res = read_tensor_from_bytes(torch.ones((2, 2), dtype=torch.int32), buf.getvalue()) assert res[0, 0] == 1 and res[0, 1] == 2 and res[1, 0] == 3 and res[1, 1] == 4 - - -def test_create_tensor(): - byte_num = bytes(b"\x04\x00\x00\x00") - tensor = create_tensor(byte_num, dtype=np.dtype(np.int32), shape=torch.Size([1])) - - assert tensor.item() == 4 diff --git a/modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py b/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py similarity index 50% rename from modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py rename to modyn/tests/model_storage/internal/utils/test_model_storage_policy.py index 1a9c4ed45..7594e5910 100644 --- a/modyn/tests/model_storage/internal/utils/test_model_storage_strategy.py +++ b/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py @@ -1,27 +1,29 @@ import json +import pathlib from zipfile import ZIP_DEFLATED, ZIP_LZMA import pytest from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator -from modyn.model_storage.internal.utils import ModelStorageStrategy +from modyn.model_storage.internal.utils import ModelStoragePolicy -def test_basic_model_storage_strategy(): - model_storage_strategy = ModelStorageStrategy("PyTorchFullModel", None, None, None) +def test_basic_model_storage_policy(): + policy = ModelStoragePolicy(pathlib.Path(), "PyTorchFullModel", None, None, None) - assert model_storage_strategy.incremental_model_strategy is None - assert model_storage_strategy.full_model_interval is None - assert not model_storage_strategy.full_model_strategy.zip + assert policy.incremental_model_strategy is None + assert policy.full_model_interval is None + assert not policy.full_model_strategy.zip -def test_extended_model_storage_strategy(): - model_storage_strategy = ModelStorageStrategy( +def 
test_extended_model_storage_policy(): + policy = ModelStoragePolicy( + zipping_dir=pathlib.Path(), full_model_strategy_name="PyTorchFullModel", full_model_strategy_zip=True, full_model_strategy_zip_algorithm="ZIP_LZMA", full_model_strategy_config=None, ) - model_storage_strategy.register_incremental_model_strategy( + policy.register_incremental_model_strategy( name="WeightsDifference", zip_enabled=True, zip_algorithm=None, @@ -29,20 +31,22 @@ def test_extended_model_storage_strategy(): full_model_interval=10, ) - assert model_storage_strategy.full_model_strategy.zip - assert model_storage_strategy.full_model_strategy.zip_algorithm == ZIP_LZMA + assert policy.zipping_dir == pathlib.Path("") + assert policy.full_model_strategy.zip + assert policy.full_model_strategy.zip_algorithm == ZIP_LZMA - weights_diff_strategy = model_storage_strategy.incremental_model_strategy + weights_diff_strategy = policy.incremental_model_strategy assert weights_diff_strategy.zip assert weights_diff_strategy.zip_algorithm == ZIP_DEFLATED assert getattr(weights_diff_strategy, "split_exponent") assert isinstance(getattr(weights_diff_strategy, "difference_operator"), XorDifferenceOperator.__class__) - assert model_storage_strategy.full_model_interval == 10 + assert policy.full_model_interval == 10 -def test_model_storage_strategy_invalid(): - strategy = ModelStorageStrategy( +def test_model_storage_policy_invalid(): + policy = ModelStoragePolicy( + zipping_dir=pathlib.Path(), full_model_strategy_name="PyTorchFullModel", full_model_strategy_zip=None, full_model_strategy_zip_algorithm=None, @@ -50,7 +54,7 @@ def test_model_storage_strategy_invalid(): ) with pytest.raises(ValueError): - strategy.register_incremental_model_strategy("WeightsDifference", None, None, None, 0) + policy.register_incremental_model_strategy("WeightsDifference", None, None, None, 0) with pytest.raises(NotImplementedError): - strategy.register_incremental_model_strategy("UnknownStrategy", None, None, None, None) + 
policy.register_incremental_model_strategy("UnknownStrategy", None, None, None, None) diff --git a/modyn/tests/model_storage/test_model_storage.py b/modyn/tests/model_storage/test_model_storage.py index 9f5b1cf12..202fa28ce 100644 --- a/modyn/tests/model_storage/test_model_storage.py +++ b/modyn/tests/model_storage/test_model_storage.py @@ -1,4 +1,3 @@ -import pathlib import tempfile from unittest.mock import patch @@ -11,7 +10,7 @@ def get_modyn_config(): # pylint: disable=unused-argument -def noop_setup_directories(self): +def noop_setup_directory(self): pass @@ -39,7 +38,8 @@ def __exit__(self, *args, **kwargs): # pylint: disable=unused-argument pass -@patch.object(ModelStorage, "_setup_model_storage_directories", noop_setup_directories) +@patch.object(ModelStorage, "_init_model_storage_directory", noop_setup_directory) +@patch.object(ModelStorage, "_setup_ftp_directory", noop_setup_directory) def test_model_storage_init(): model_storage = ModelStorage(get_modyn_config()) assert model_storage.config == get_modyn_config() @@ -47,12 +47,11 @@ def test_model_storage_init(): @patch("modyn.model_storage.model_storage.GRPCServer", MockGRPCServer) @patch("modyn.model_storage.model_storage.FTPServer", MockFTPServer) -@patch("os.makedirs") -def test_cleanup_at_exit(test_os_makedirs): - ftp_directory = pathlib.Path(tempfile.gettempdir()) / "ftp_model_storage" - assert not ftp_directory.exists() - - model_storage = ModelStorage(get_modyn_config()) - assert ftp_directory.exists() - model_storage.run() - assert not ftp_directory.exists() +def test_cleanup_at_exit(): + with tempfile.TemporaryDirectory() as temp_dir: + config = get_modyn_config() + config["model_storage"]["models_directory"] = temp_dir + model_storage = ModelStorage(config) + assert model_storage.ftp_directory.exists() + model_storage.run() + assert not model_storage.ftp_directory.exists() diff --git a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py 
b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py index 854acfaf1..69335f96e 100644 --- a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py +++ b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py @@ -11,7 +11,7 @@ GetSamplesRequest, GetSelectionStrategyRequest, JsonString, - ModelStorageStrategyInfo, + ModelStoragePolicyInfo, NumberOfPartitionsResponse, NumberOfSamplesResponse, PipelineResponse, @@ -51,16 +51,14 @@ def test_register_pipeline(test_register_pipeline: MagicMock): config["selector"]["trigger_sample_directory"] = tmp_dir mgr = SelectorManager(config) servicer = SelectorGRPCServicer(mgr, 8096) - model_storage_strategy = ModelStorageStrategyInfo( - full_model_strategy_config=StrategyConfig(name="PyTorchFullModel") - ) + policy = ModelStoragePolicyInfo(full_model_strategy_config=StrategyConfig(name="PyTorchFullModel")) request = RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value="strat"), - model_id="ResNet18", + model_class_name="ResNet18", model_configuration=JsonString(value="{}"), amp=True, - model_storage_strategy=model_storage_strategy, + model_storage_policy=policy, ) test_register_pipeline.return_value = 42 diff --git a/modyn/tests/supervisor/internal/test_grpc_handler.py b/modyn/tests/supervisor/internal/test_grpc_handler.py index 5d6b35b36..541db266e 100644 --- a/modyn/tests/supervisor/internal/test_grpc_handler.py +++ b/modyn/tests/supervisor/internal/test_grpc_handler.py @@ -19,6 +19,7 @@ DataInformRequest, DataInformResponse, GetNumberOfSamplesRequest, + JsonString, NumberOfSamplesResponse, PipelineResponse, RegisterPipelineRequest, @@ -343,8 +344,11 @@ def test_register_pipeline_at_selector(test_grpc_connection_established): "model": {"id": "ResNet18"}, "model_storage": { "full_model_strategy": {"name": "PyTorchFullModel", "zip": True, "zip_algorithm": "ZIP_DEFLATED"}, - "incremental_model_strategy": {"name": "WeightsDifference", "config": {"operator": "sub"}}, - 
"full_model_interval": 10, + "incremental_model_strategy": { + "name": "WeightsDifference", + "config": {"operator": "sub"}, + "full_model_interval": 10, + }, }, } ) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index 300ccddf2..53f4b49bd 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xb9\x06\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 
\x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x17 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\x84\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12\x16\n\texception\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x07 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x08 \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\t \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\n \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xe8\x05\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x1c\n\x14use_pretrained_model\x18\x04 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x05 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\x06 \x01(\x05\x12\x12\n\nbatch_size\x18\x07 \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x08 \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\t \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\n \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0b \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0c \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\r \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x0e \x03(\t\x12)\n\x0clr_scheduler\x18\x0f 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x11 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x12 \x01(\x05\x12\x11\n\x04seed\x18\x13 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x14 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', globals()) @@ -34,21 +34,21 @@ _CHECKPOINTINFO._serialized_start=220 _CHECKPOINTINFO._serialized_end=290 _STARTTRAININGREQUEST._serialized_start=293 - _STARTTRAININGREQUEST._serialized_end=1118 - _STARTTRAININGRESPONSE._serialized_start=1120 - _STARTTRAININGRESPONSE._serialized_end=1190 - _TRAININGSTATUSREQUEST._serialized_start=1192 - _TRAININGSTATUSREQUEST._serialized_end=1236 - _TRAININGSTATUSRESPONSE._serialized_start=1239 - _TRAININGSTATUSRESPONSE._serialized_end=1627 - _STOREFINALMODELREQUEST._serialized_start=1629 - _STOREFINALMODELREQUEST._serialized_end=1674 - _STOREFINALMODELRESPONSE._serialized_start=1676 - _STOREFINALMODELRESPONSE._serialized_end=1740 - _GETLATESTMODELREQUEST._serialized_start=1742 - _GETLATESTMODELREQUEST._serialized_end=1786 - _GETLATESTMODELRESPONSE._serialized_start=1788 - _GETLATESTMODELRESPONSE._serialized_end=1853 - _TRAINERSERVER._serialized_start=1856 - _TRAINERSERVER._serialized_end=2313 + _STARTTRAININGREQUEST._serialized_end=1037 + _STARTTRAININGRESPONSE._serialized_start=1039 + _STARTTRAININGRESPONSE._serialized_end=1109 + _TRAININGSTATUSREQUEST._serialized_start=1111 + _TRAININGSTATUSREQUEST._serialized_end=1155 + _TRAININGSTATUSRESPONSE._serialized_start=1158 + _TRAININGSTATUSRESPONSE._serialized_end=1580 + _STOREFINALMODELREQUEST._serialized_start=1582 + 
_STOREFINALMODELREQUEST._serialized_end=1627 + _STOREFINALMODELRESPONSE._serialized_start=1629 + _STOREFINALMODELRESPONSE._serialized_end=1693 + _GETLATESTMODELREQUEST._serialized_start=1695 + _GETLATESTMODELREQUEST._serialized_end=1739 + _GETLATESTMODELRESPONSE._serialized_start=1741 + _GETLATESTMODELRESPONSE._serialized_end=1806 + _TRAINERSERVER._serialized_start=1809 + _TRAINERSERVER._serialized_end=2266 # @@protoc_insertion_point(module_scope) From bd8786b4a7d5482983e9add434073b9b87fa835f Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Mon, 25 Sep 2023 10:46:02 +0200 Subject: [PATCH 05/12] Follow-up * fix grpc test --- modyn/supervisor/internal/grpc_handler.py | 2 +- .../supervisor/internal/test_grpc_handler.py | 21 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index 78d34886e..ac3c7e834 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -229,7 +229,7 @@ def register_pipeline_at_selector(self, pipeline_config: dict) -> int: model_storage_policy=ModelStoragePolicyInfo( full_model_strategy_config=self.get_model_strategy(model_storage_config["full_model_strategy"]), incremental_model_strategy_config=incremental_model_strategy, - full_model_interval=full_model_interval + full_model_interval=full_model_interval, ), ) ).pipeline_id diff --git a/modyn/tests/supervisor/internal/test_grpc_handler.py b/modyn/tests/supervisor/internal/test_grpc_handler.py index 541db266e..44a8e6697 100644 --- a/modyn/tests/supervisor/internal/test_grpc_handler.py +++ b/modyn/tests/supervisor/internal/test_grpc_handler.py @@ -359,21 +359,20 @@ def test_register_pipeline_at_selector(test_grpc_connection_established): request: RegisterPipelineRequest = mock.call_args.args[0] assert request.num_workers == 2 assert request.selection_strategy.value == "{}" - assert request.model_id == "ResNet18" + assert 
request.model_class_name == "ResNet18" assert request.model_configuration.value == "{}" assert request.amp - assert request.model_storage_strategy.full_model_strategy_config.name == "PyTorchFullModel" - assert request.model_storage_strategy.full_model_strategy_config.zip - assert request.model_storage_strategy.full_model_strategy_config.zip_algorithm == "ZIP_DEFLATED" - assert not request.model_storage_strategy.full_model_strategy_config.HasField("config") - assert request.model_storage_strategy.incremental_model_strategy_config.name == "WeightsDifference" - assert not request.model_storage_strategy.incremental_model_strategy_config.HasField("zip") - assert not request.model_storage_strategy.incremental_model_strategy_config.HasField("zip_algorithm") + assert request.model_storage_policy.full_model_strategy_config.name == "PyTorchFullModel" + assert request.model_storage_policy.full_model_strategy_config.zip + assert request.model_storage_policy.full_model_strategy_config.zip_algorithm == "ZIP_DEFLATED" + assert not request.model_storage_policy.full_model_strategy_config.HasField("config") + assert request.model_storage_policy.incremental_model_strategy_config.name == "WeightsDifference" + assert not request.model_storage_policy.incremental_model_strategy_config.HasField("zip") + assert not request.model_storage_policy.incremental_model_strategy_config.HasField("zip_algorithm") assert ( - json.loads(request.model_storage_strategy.incremental_model_strategy_config.config.value)["operator"] - == "sub" + json.loads(request.model_storage_policy.incremental_model_strategy_config.config.value)["operator"] == "sub" ) - assert request.model_storage_strategy.full_model_interval == 10 + assert request.model_storage_policy.full_model_interval == 10 def test_unregister_pipeline_at_selector(): From 02ed1843f5701f881c86d20288321bf48b96215f Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Mon, 25 Sep 2023 15:40:26 +0200 Subject: [PATCH 06/12] Rename model_id to model_class_name 
* fix integration tests --- docker/Model_Storage/Dockerfile | 2 + .../integrationtest_model_storage.py | 4 +- .../selector/integrationtest_selector.py | 42 +++++++++---------- .../internal/grpc/evaluator_grpc_servicer.py | 23 ++++++---- .../internal/grpc/generated/evaluator_pb2.py | 32 +++++++------- .../internal/grpc/generated/evaluator_pb2.pyi | 8 ++-- .../internal/utils/evaluation_info.py | 8 ++-- .../grpc/generated/model_storage_pb2.pyi | 6 +-- .../internal/model_storage_manager.py | 6 +-- modyn/protos/evaluator.proto | 2 +- modyn/supervisor/internal/grpc_handler.py | 8 ++-- modyn/supervisor/supervisor.py | 6 +-- .../grpc/test_evaluator_grpc_servicer.py | 8 ++-- .../internal/test_pytorch_evaluator.py | 2 +- .../test_metadata_database_connection.py | 6 +-- .../supervisor/internal/test_grpc_handler.py | 6 +-- .../grpc/test_trainer_server_grpc_servicer.py | 2 +- .../grpc/generated/trainer_server_pb2.pyi | 2 +- .../grpc/trainer_server_grpc_servicer.py | 8 ++-- .../internal/utils/training_info.py | 6 +-- 20 files changed, 98 insertions(+), 89 deletions(-) diff --git a/docker/Model_Storage/Dockerfile b/docker/Model_Storage/Dockerfile index 6b26823dd..144555e9e 100644 --- a/docker/Model_Storage/Dockerfile +++ b/docker/Model_Storage/Dockerfile @@ -1,6 +1,8 @@ FROM modynbase:latest RUN chmod a+x /src/modyn/model_storage/modyn-model-storage +RUN mkdir -p /tmp/models +RUN chown appuser /tmp/models # During debugging, this entry point will be overridden. 
For more information, please refer to https://aka.ms/vscode-docker-python-debug CMD mamba run -n modyn --no-capture-output ./modyn/model_storage/modyn-model-storage ./modyn/config/examples/modyn_config.yaml \ No newline at end of file diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index 0d5e1a6ce..cd75996fb 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -117,9 +117,9 @@ def test_model_storage(config: dict): pipeline_id, trigger_id = insert_trigger_into_database(config) with MetadataDatabaseConnection(config) as database: - model_id, model_config, amp = database.get_model_configuration(pipeline_id) + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) - assert model_id == "ResNet18" + assert model_class_name == "ResNet18" assert json.loads(model_config) == {"num_classes": 10} assert not amp diff --git a/integrationtests/selector/integrationtest_selector.py b/integrationtests/selector/integrationtest_selector.py index a4bae54af..5d8410dd9 100644 --- a/integrationtests/selector/integrationtest_selector.py +++ b/integrationtests/selector/integrationtest_selector.py @@ -8,7 +8,7 @@ GetNumberOfPartitionsRequest, GetSamplesRequest, JsonString, - ModelStorageStrategyInfo, + ModelStoragePolicyInfo, RegisterPipelineRequest, SamplesResponse, StrategyConfig, @@ -31,8 +31,8 @@ def connect_to_selector_servicer() -> grpc.Channel: return selector_channel -def get_model_storage_strategy() -> ModelStorageStrategyInfo: - return ModelStorageStrategyInfo(full_model_strategy_config=StrategyConfig(name="PyTorchFullModel")) +def get_model_storage_policy() -> ModelStoragePolicyInfo: + return ModelStoragePolicyInfo(full_model_strategy_config=StrategyConfig(name="PyTorchFullModel")) def test_label_balanced_presampling_huge() -> None: @@ -53,10 +53,10 @@ def 
test_label_balanced_presampling_huge() -> None: RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -140,10 +140,10 @@ def test_label_balanced_force_same_size(): RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -231,10 +231,10 @@ def test_label_balanced_force_all_samples(): RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -328,10 +328,10 @@ def test_newdata() -> None: RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -474,10 +474,10 @@ def test_abstract_downsampler(reset_after_trigger) -> None: RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ 
-630,10 +630,10 @@ def test_empty_triggers() -> None: RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -805,10 +805,10 @@ def test_many_samples_evenly_distributed(): RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -882,10 +882,10 @@ def test_many_samples_unevenly_distributed(): RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id @@ -960,10 +960,10 @@ def test_get_available_labels(reset_after_trigger: bool): RegisterPipelineRequest( num_workers=2, selection_strategy=JsonString(value=json.dumps(strategy_config)), - model_id="ResNet10", + model_class_name="ResNet10", model_configuration=JsonString(value="{}"), amp=False, - model_storage_strategy=get_model_storage_strategy(), + model_storage_policy=get_model_storage_policy(), ) ).pipeline_id diff --git a/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py b/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py index 5868339f0..1ffc517d2 100644 --- a/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py +++ b/modyn/evaluator/internal/grpc/evaluator_grpc_servicer.py @@ -94,23 +94,23 @@ def evaluate_model(self, request: EvaluateModelRequest, context: 
grpc.ServicerCo logger.info("Received evaluate model request.") with MetadataDatabaseConnection(self._config) as database: - trained_model: Optional[TrainedModel] = database.session.get(TrainedModel, request.trained_model_id) + trained_model: Optional[TrainedModel] = database.session.get(TrainedModel, request.model_id) if not trained_model: - logger.error(f"Trained model {request.trained_model_id} does not exist!") + logger.error(f"Trained model {request.model_id} does not exist!") return EvaluateModelResponse(evaluation_started=False) - model_id, model_config, amp = database.get_model_configuration(trained_model.pipeline_id) + model_class_name, model_config, amp = database.get_model_configuration(trained_model.pipeline_id) - if not hasattr(dynamic_module_import("modyn.models"), model_id): - logger.error(f"Model {model_id} not available!") + if not hasattr(dynamic_module_import("modyn.models"), model_class_name): + logger.error(f"Model {model_class_name} not available!") return EvaluateModelResponse(evaluation_started=False) - fetch_request = FetchModelRequest(model_id=request.trained_model_id, load_metadata=False) + fetch_request = FetchModelRequest(model_id=request.model_id, load_metadata=False) fetch_resp: FetchModelResponse = self._model_storage_stub.FetchModel(fetch_request) if not fetch_resp.success: logger.error( - f"Trained model {request.trained_model_id} cannot be fetched from model storage. " + f"Trained model {request.model_id} cannot be fetched from model storage. " f"Evaluation cannot be started." 
) return EvaluateModelResponse(evaluation_started=False) @@ -143,7 +143,14 @@ def evaluate_model(self, request: EvaluateModelRequest, context: grpc.ServicerCo metrics = self._setup_metrics(request.metrics) evaluation_info = EvaluationInfo( - request, evaluation_id, model_id, model_config, amp, self._storage_address, metrics, trained_model_path + request, + evaluation_id, + model_class_name, + model_config, + amp, + self._storage_address, + metrics, + trained_model_path, ) self._evaluation_dict[evaluation_id] = evaluation_info self._run_evaluation(evaluation_id) diff --git a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py index 0f0d19ca6..1b4ea738a 100644 --- a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py +++ b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65valuator.proto\x12\x0fmodyn.evaluator\":\n\x0b\x44\x61tasetInfo\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x8f\x01\n\x13MetricConfiguration\x12\x0c\n\x04name\x18\x01 \x01(\t\x12+\n\x06\x63onfig\x18\x02 \x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12=\n\x16\x65valuation_transformer\x18\x03 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"\xc6\x02\n\x14\x45valuateModelRequest\x12\x18\n\x10trained_model_id\x18\x01 \x01(\x05\x12\x32\n\x0c\x64\x61taset_info\x18\x02 \x01(\x0b\x32\x1c.modyn.evaluator.DatasetInfo\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\x05\x12\x35\n\x07metrics\x18\x05 \x03(\x0b\x32$.modyn.evaluator.MetricConfiguration\x12\x16\n\x0etransform_list\x18\x06 \x03(\t\x12\x33\n\x0c\x62ytes_parser\x18\x07 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\x12\x38\n\x11label_transformer\x18\x08 
\x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"`\n\x15\x45valuateModelResponse\x12\x1a\n\x12\x65valuation_started\x18\x01 \x01(\x08\x12\x15\n\revaluation_id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64\x61taset_size\x18\x03 \x01(\x03\"0\n\x17\x45valuationStatusRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"\xe5\x01\n\x18\x45valuationStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x17\n\x0fstate_available\x18\x03 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x04 \x01(\x08\x12\x16\n\texception\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x06 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x07 \x01(\x03H\x02\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seen\"0\n\x0e\x45valuationData\x12\x0e\n\x06metric\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x02\"0\n\x17\x45valuationResultRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"c\n\x18\x45valuationResultResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x38\n\x0f\x65valuation_data\x18\x02 \x03(\x0b\x32\x1f.modyn.evaluator.EvaluationData2\xce\x02\n\tEvaluator\x12\x61\n\x0e\x65valuate_model\x12%.modyn.evaluator.EvaluateModelRequest\x1a&.modyn.evaluator.EvaluateModelResponse\"\x00\x12n\n\x15get_evaluation_status\x12(.modyn.evaluator.EvaluationStatusRequest\x1a).modyn.evaluator.EvaluationStatusResponse\"\x00\x12n\n\x15get_evaluation_result\x12(.modyn.evaluator.EvaluationResultRequest\x1a).modyn.evaluator.EvaluationResultResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65valuator.proto\x12\x0fmodyn.evaluator\":\n\x0b\x44\x61tasetInfo\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x8f\x01\n\x13MetricConfiguration\x12\x0c\n\x04name\x18\x01 \x01(\t\x12+\n\x06\x63onfig\x18\x02 
\x01(\x0b\x32\x1b.modyn.evaluator.JsonString\x12=\n\x16\x65valuation_transformer\x18\x03 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"\xbe\x02\n\x14\x45valuateModelRequest\x12\x10\n\x08model_id\x18\x01 \x01(\x05\x12\x32\n\x0c\x64\x61taset_info\x18\x02 \x01(\x0b\x32\x1c.modyn.evaluator.DatasetInfo\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\x05\x12\x35\n\x07metrics\x18\x05 \x03(\x0b\x32$.modyn.evaluator.MetricConfiguration\x12\x16\n\x0etransform_list\x18\x06 \x03(\t\x12\x33\n\x0c\x62ytes_parser\x18\x07 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\x12\x38\n\x11label_transformer\x18\x08 \x01(\x0b\x32\x1d.modyn.evaluator.PythonString\"`\n\x15\x45valuateModelResponse\x12\x1a\n\x12\x65valuation_started\x18\x01 \x01(\x08\x12\x15\n\revaluation_id\x18\x02 \x01(\x05\x12\x14\n\x0c\x64\x61taset_size\x18\x03 \x01(\x03\"0\n\x17\x45valuationStatusRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"\xe5\x01\n\x18\x45valuationStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x17\n\x0fstate_available\x18\x03 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x04 \x01(\x08\x12\x16\n\texception\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x06 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\x07 \x01(\x03H\x02\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seen\"0\n\x0e\x45valuationData\x12\x0e\n\x06metric\x18\x01 \x01(\t\x12\x0e\n\x06result\x18\x02 \x01(\x02\"0\n\x17\x45valuationResultRequest\x12\x15\n\revaluation_id\x18\x01 \x01(\x05\"c\n\x18\x45valuationResultResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x38\n\x0f\x65valuation_data\x18\x02 
\x03(\x0b\x32\x1f.modyn.evaluator.EvaluationData2\xce\x02\n\tEvaluator\x12\x61\n\x0e\x65valuate_model\x12%.modyn.evaluator.EvaluateModelRequest\x1a&.modyn.evaluator.EvaluateModelResponse\"\x00\x12n\n\x15get_evaluation_status\x12(.modyn.evaluator.EvaluationStatusRequest\x1a).modyn.evaluator.EvaluationStatusResponse\"\x00\x12n\n\x15get_evaluation_result\x12(.modyn.evaluator.EvaluationResultRequest\x1a).modyn.evaluator.EvaluationResultResponse\"\x00\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'evaluator_pb2', globals()) @@ -30,19 +30,19 @@ _METRICCONFIGURATION._serialized_start=157 _METRICCONFIGURATION._serialized_end=300 _EVALUATEMODELREQUEST._serialized_start=303 - _EVALUATEMODELREQUEST._serialized_end=629 - _EVALUATEMODELRESPONSE._serialized_start=631 - _EVALUATEMODELRESPONSE._serialized_end=727 - _EVALUATIONSTATUSREQUEST._serialized_start=729 - _EVALUATIONSTATUSREQUEST._serialized_end=777 - _EVALUATIONSTATUSRESPONSE._serialized_start=780 - _EVALUATIONSTATUSRESPONSE._serialized_end=1009 - _EVALUATIONDATA._serialized_start=1011 - _EVALUATIONDATA._serialized_end=1059 - _EVALUATIONRESULTREQUEST._serialized_start=1061 - _EVALUATIONRESULTREQUEST._serialized_end=1109 - _EVALUATIONRESULTRESPONSE._serialized_start=1111 - _EVALUATIONRESULTRESPONSE._serialized_end=1210 - _EVALUATOR._serialized_start=1213 - _EVALUATOR._serialized_end=1547 + _EVALUATEMODELREQUEST._serialized_end=621 + _EVALUATEMODELRESPONSE._serialized_start=623 + _EVALUATEMODELRESPONSE._serialized_end=719 + _EVALUATIONSTATUSREQUEST._serialized_start=721 + _EVALUATIONSTATUSREQUEST._serialized_end=769 + _EVALUATIONSTATUSRESPONSE._serialized_start=772 + _EVALUATIONSTATUSRESPONSE._serialized_end=1001 + _EVALUATIONDATA._serialized_start=1003 + _EVALUATIONDATA._serialized_end=1051 + _EVALUATIONRESULTREQUEST._serialized_start=1053 + _EVALUATIONRESULTREQUEST._serialized_end=1101 + _EVALUATIONRESULTRESPONSE._serialized_start=1103 + 
_EVALUATIONRESULTRESPONSE._serialized_end=1202 + _EVALUATOR._serialized_start=1205 + _EVALUATOR._serialized_end=1539 # @@protoc_insertion_point(module_scope) diff --git a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi index 3b16f97f0..b6b1803f7 100644 --- a/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi +++ b/modyn/evaluator/internal/grpc/generated/evaluator_pb2.pyi @@ -93,7 +93,7 @@ global___MetricConfiguration = MetricConfiguration class EvaluateModelRequest(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - TRAINED_MODEL_ID_FIELD_NUMBER: builtins.int + MODEL_ID_FIELD_NUMBER: builtins.int DATASET_INFO_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int BATCH_SIZE_FIELD_NUMBER: builtins.int @@ -101,7 +101,7 @@ class EvaluateModelRequest(google.protobuf.message.Message): TRANSFORM_LIST_FIELD_NUMBER: builtins.int BYTES_PARSER_FIELD_NUMBER: builtins.int LABEL_TRANSFORMER_FIELD_NUMBER: builtins.int - trained_model_id: builtins.int + model_id: builtins.int @property def dataset_info(self) -> global___DatasetInfo: ... device: builtins.str @@ -117,7 +117,7 @@ class EvaluateModelRequest(google.protobuf.message.Message): def __init__( self, *, - trained_model_id: builtins.int = ..., + model_id: builtins.int = ..., dataset_info: global___DatasetInfo | None = ..., device: builtins.str = ..., batch_size: builtins.int = ..., @@ -127,7 +127,7 @@ class EvaluateModelRequest(google.protobuf.message.Message): label_transformer: global___PythonString | None = ..., ) -> None: ... def HasField(self, field_name: typing_extensions.Literal["bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "label_transformer", b"label_transformer"]) -> builtins.bool: ... 
- def ClearField(self, field_name: typing_extensions.Literal["batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "device", b"device", "label_transformer", b"label_transformer", "metrics", b"metrics", "trained_model_id", b"trained_model_id", "transform_list", b"transform_list"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "dataset_info", b"dataset_info", "device", b"device", "label_transformer", b"label_transformer", "metrics", b"metrics", "model_id", b"model_id", "transform_list", b"transform_list"]) -> None: ... global___EvaluateModelRequest = EvaluateModelRequest diff --git a/modyn/evaluator/internal/utils/evaluation_info.py b/modyn/evaluator/internal/utils/evaluation_info.py index aa12e904e..a7c804d8e 100644 --- a/modyn/evaluator/internal/utils/evaluation_info.py +++ b/modyn/evaluator/internal/utils/evaluation_info.py @@ -17,14 +17,14 @@ def __init__( self, request: EvaluateModelRequest, evaluation_id: int, - model_id: str, + model_class_name: str, model_config: str, amp: bool, storage_address: str, metrics: list[AbstractEvaluationMetric], model_path: pathlib.Path, ) -> None: - self.trained_model_id = request.trained_model_id + self.model_id = request.model_id self.dataset_id = request.dataset_info.dataset_id self.num_dataloaders = request.dataset_info.num_dataloaders @@ -33,9 +33,9 @@ def __init__( self.batch_size = request.batch_size self.metrics = metrics - self.model_id = model_id + self.model_class_name = model_class_name model_module = dynamic_module_import("modyn.models") - self.model_handler = getattr(model_module, self.model_id) + self.model_handler = getattr(model_module, self.model_class_name) self.model_configuration_dict = json.loads(model_config) self.transform_list = list(request.transform_list) diff --git a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi 
b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi index 019bdd69b..b03e736f3 100644 --- a/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi +++ b/modyn/model_storage/internal/grpc/generated/model_storage_pb2.pyi @@ -58,7 +58,7 @@ class RegisterModelResponse(google.protobuf.message.Message): success: builtins.bool = ..., model_id: builtins.int = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id", "success", b"success"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["model_class_name", b"model_class_name", "success", b"success"]) -> None: ... global___RegisterModelResponse = RegisterModelResponse @@ -76,7 +76,7 @@ class FetchModelRequest(google.protobuf.message.Message): model_id: builtins.int = ..., load_metadata: builtins.bool = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["load_metadata", b"load_metadata", "model_id", b"model_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["load_metadata", b"load_metadata", "model_class_name", b"model_class_name"]) -> None: ... global___FetchModelRequest = FetchModelRequest @@ -112,7 +112,7 @@ class DeleteModelRequest(google.protobuf.message.Message): *, model_id: builtins.int = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["model_class_name", b"model_class_name"]) -> None: ... 
global___DeleteModelRequest = DeleteModelRequest diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py index 7655c8983..78efaa94c 100644 --- a/modyn/model_storage/internal/model_storage_manager.py +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -130,11 +130,11 @@ def _get_base_model_state(self, pipeline_id: int) -> dict: dict: the plain model state derived from the model architecture of the pipeline's models. """ with MetadataDatabaseConnection(self._modyn_config) as database: - model_id, model_config, amp = database.get_model_configuration(pipeline_id) + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) model_module = dynamic_module_import("modyn.models") - assert hasattr(model_module, model_id), f"Model {model_id} not available." + assert hasattr(model_module, model_class_name), f"Model {model_class_name} not available." - model_handler = getattr(model_module, model_id) + model_handler = getattr(model_module, model_class_name) return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() def _reconstruct_model(self, model_id: int, model_state: dict, policy: ModelStoragePolicy) -> None: diff --git a/modyn/protos/evaluator.proto b/modyn/protos/evaluator.proto index 148e75df9..12cb64d02 100644 --- a/modyn/protos/evaluator.proto +++ b/modyn/protos/evaluator.proto @@ -24,7 +24,7 @@ message MetricConfiguration { } message EvaluateModelRequest { - int32 trained_model_id = 1; + int32 model_id = 1; DatasetInfo dataset_info = 2; string device = 3; int32 batch_size = 4; diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index ac3c7e834..d1daa2674 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -511,7 +511,7 @@ def seed_selector(self, seed: int) -> None: assert success, "Something went wrong while seeding the selector" - def 
start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict[int, EvaluationStatusTracker]: + def start_evaluation(self, model_id: int, pipeline_config: dict) -> dict[int, EvaluationStatusTracker]: if not self.connected_to_evaluator: raise ConnectionError("Tried to start evaluation at evaluator, but there is no gRPC connection.") @@ -522,7 +522,7 @@ def start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict for dataset in pipeline_config["evaluation"]["datasets"]: dataset_id = dataset["dataset_id"] - req = GRPCHandler._prepare_evaluation_request(dataset, trained_model_id, device) + req = GRPCHandler._prepare_evaluation_request(dataset, model_id, device) response: EvaluateModelResponse = self.evaluator.evaluate_model(req) if not response.evaluation_started: @@ -535,7 +535,7 @@ def start_evaluation(self, trained_model_id: int, pipeline_config: dict) -> dict return evaluations @staticmethod - def _prepare_evaluation_request(dataset_config: dict, trained_model_id: int, device: str) -> EvaluateModelRequest: + def _prepare_evaluation_request(dataset_config: dict, model_id: int, device: str) -> EvaluateModelRequest: dataset_id = dataset_config["dataset_id"] if "transformations" in dataset_config: @@ -573,7 +573,7 @@ def _prepare_evaluation_request(dataset_config: dict, trained_model_id: int, dev ) start_evaluation_kwargs = { - "trained_model_id": trained_model_id, + "model_id": model_id, "dataset_info": DatasetInfo(dataset_id=dataset_id, num_dataloaders=dataloader_workers), "device": device, "batch_size": batch_size, diff --git a/modyn/supervisor/supervisor.py b/modyn/supervisor/supervisor.py index 30ba489de..d95c1ead6 100644 --- a/modyn/supervisor/supervisor.py +++ b/modyn/supervisor/supervisor.py @@ -453,17 +453,17 @@ def _run_training(self, trigger_id: int) -> None: # We store the trained model for evaluation in any case. 
self._sw.start("store_trained_model", overwrite=True) - trained_model_id = self.grpc.store_trained_model(self.current_training_id) + model_id = self.grpc.store_trained_model(self.current_training_id) self.pipeline_log["supervisor"]["triggers"][trigger_id]["store_trained_model_time"] = self._sw.stop() # Only if the pipeline actually wants to continue the training on it, we set previous model. if self.pipeline_config["training"]["use_previous_model"]: - self.previous_model_id = trained_model_id + self.previous_model_id = model_id # Start evaluation if "evaluation" in self.pipeline_config: # TODO(#300) Add evaluator to pipeline log - evaluations = self.grpc.start_evaluation(trained_model_id, self.pipeline_config) + evaluations = self.grpc.start_evaluation(model_id, self.pipeline_config) self.grpc.wait_for_evaluation_completion(self.current_training_id, evaluations) writer_names: set[str] = set(self.pipeline_config["evaluation"]["result_writers"]) diff --git a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py index 768d57b35..42dd36fe6 100644 --- a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py +++ b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py @@ -124,7 +124,7 @@ def get_mock_evaluation_transformer(): def get_evaluate_model_request(): return EvaluateModelRequest( - trained_model_id=1, + model_id=1, dataset_info=DatasetInfo(dataset_id="MNIST", num_dataloaders=1), device="cpu", batch_size=4, @@ -146,7 +146,7 @@ def get_evaluation_info(evaluation_id, model_path: pathlib.Path, config: dict): return EvaluationInfo( request=get_evaluate_model_request(), evaluation_id=evaluation_id, - model_id="ResNet18", + model_class_name="ResNet18", amp=False, model_config="{}", storage_address=storage_address, @@ -185,7 +185,7 @@ def test_evaluate_model_invalid(test_connect_to_model_storage, test_connect_to_s with tempfile.TemporaryDirectory() as modyn_temp: 
evaluator = EvaluatorGRPCServicer(get_modyn_config(), pathlib.Path(modyn_temp)) req = get_evaluate_model_request() - req.trained_model_id = 15 + req.model_id = 15 resp = evaluator.evaluate_model(req, None) assert not resp.evaluation_started @@ -196,7 +196,7 @@ def test_evaluate_model_invalid(test_connect_to_model_storage, test_connect_to_s assert evaluator._next_evaluation_id == 0 req = get_evaluate_model_request() - req.trained_model_id = 2 + req.model_id = 2 resp = evaluator.evaluate_model(req, None) assert not resp.evaluation_started diff --git a/modyn/tests/evaluator/internal/test_pytorch_evaluator.py b/modyn/tests/evaluator/internal/test_pytorch_evaluator.py index a00c3a8c7..d72f46399 100644 --- a/modyn/tests/evaluator/internal/test_pytorch_evaluator.py +++ b/modyn/tests/evaluator/internal/test_pytorch_evaluator.py @@ -92,7 +92,7 @@ def get_evaluation_info( ): model_dynamic_module_patch.return_value = MockModule() request = EvaluateModelRequest( - trained_model_id=1, + model_id=1, dataset_info=DatasetInfo(dataset_id="MNIST", num_dataloaders=1), device="cpu", batch_size=4, diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py b/modyn/tests/metadata_database/test_metadata_database_connection.py index 4d95dfef2..d10be6772 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -71,7 +71,7 @@ def test_add_trained_model(): assert model_parent.children[0] == model_child -def test_get_model_storage_strategy(): +def test_get_model_configuration(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() pipeline_id = database.register_pipeline( @@ -80,8 +80,8 @@ def test_get_model_storage_strategy(): assert pipeline_id == 1 - model_id, model_config, amp = database.get_model_configuration(pipeline_id) + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) - assert model_id 
== "ResNet18" + assert model_class_name == "ResNet18" assert json.loads(model_config) == {"num_classes": 10} assert amp diff --git a/modyn/tests/supervisor/internal/test_grpc_handler.py b/modyn/tests/supervisor/internal/test_grpc_handler.py index 44a8e6697..d0cc9137a 100644 --- a/modyn/tests/supervisor/internal/test_grpc_handler.py +++ b/modyn/tests/supervisor/internal/test_grpc_handler.py @@ -557,7 +557,7 @@ def test_start_evaluation(test_connection_established): handler = GRPCHandler(get_simple_config(), mgr, pbar) assert handler.evaluator is not None - trained_model_id = 10 + model_id = 10 pipeline_config = get_minimal_pipeline_config() with patch.object( @@ -565,7 +565,7 @@ def test_start_evaluation(test_connection_established): "evaluate_model", return_value=EvaluateModelResponse(evaluation_started=True, evaluation_id=12, dataset_size=1000), ) as avail_method: - evaluations = handler.start_evaluation(trained_model_id, pipeline_config) + evaluations = handler.start_evaluation(model_id, pipeline_config) assert len(evaluations) == 1 assert evaluations[12].dataset_id == "MNIST_eval" @@ -577,7 +577,7 @@ def test__prepare_evaluation_request(): pipeline_config = get_minimal_pipeline_config() request = GRPCHandler._prepare_evaluation_request(pipeline_config["evaluation"]["datasets"][0], 23, "cpu") - assert request.trained_model_id == 23 + assert request.model_id == 23 assert request.device == "cpu" assert request.batch_size == 64 assert request.dataset_info.dataset_id == "MNIST_eval" diff --git a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py index dccd79d48..fc501c4f2 100644 --- a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py +++ b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py @@ -252,7 +252,7 @@ def test_start_training(test_getattr, test_hasattr, test_connect_to_model_storag 
trainer_server.start_training(get_start_training_request(), None) assert 1 in trainer_server._training_process_dict assert trainer_server._next_training_id == 2 - assert trainer_server._training_dict[1].model_id == "model" + assert trainer_server._training_dict[1].model_class_name == "model" assert trainer_server._training_dict[1].model_configuration_dict == {} assert trainer_server._training_dict[1].amp diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index 703bd8d09..7615ba1fc 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -314,7 +314,7 @@ class StoreFinalModelResponse(google.protobuf.message.Message): valid_state: builtins.bool = ..., model_id: builtins.int = ..., ) -> None: ... - def ClearField(self, field_name: typing_extensions.Literal["model_id", b"model_id", "valid_state", b"valid_state"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["model_class_name", b"model_class_name", "valid_state", b"valid_state"]) -> None: ... 
global___StoreFinalModelResponse = StoreFinalModelResponse diff --git a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py index 70f9d0d39..a32124e03 100644 --- a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py +++ b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py @@ -98,10 +98,10 @@ def start_training( logger.info("Received start training request.") with MetadataDatabaseConnection(self._config) as database: - model_id, model_config, amp = database.get_model_configuration(request.pipeline_id) + model_class_name, model_config, amp = database.get_model_configuration(request.pipeline_id) - if not hasattr(dynamic_module_import("modyn.models"), model_id): - logger.error(f"Model {model_id} not available!") + if not hasattr(dynamic_module_import("modyn.models"), model_class_name): + logger.error(f"Model {model_class_name} not available!") return StartTrainingResponse(training_started=False) pretrained_model_path: Optional[pathlib.Path] = None @@ -142,7 +142,7 @@ def start_training( training_info = TrainingInfo( request, training_id, - model_id, + model_class_name, model_config, amp, self._storage_address, diff --git a/modyn/trainer_server/internal/utils/training_info.py b/modyn/trainer_server/internal/utils/training_info.py index b9d490260..e9ec414ab 100644 --- a/modyn/trainer_server/internal/utils/training_info.py +++ b/modyn/trainer_server/internal/utils/training_info.py @@ -17,7 +17,7 @@ def __init__( self, request: StartTrainingRequest, training_id: int, - model_id: str, + model_class_name: str, model_config: str, amp: bool, storage_address: str, @@ -44,9 +44,9 @@ def __init__( self.bytes_parser = request.bytes_parser.value self.label_transformer = request.label_transformer.value - self.model_id = model_id + self.model_class_name = model_class_name model_module = dynamic_module_import("modyn.models") - self.model_handler = getattr(model_module, 
self.model_id) + self.model_handler = getattr(model_module, self.model_class_name) self.use_pretrained_model = request.use_pretrained_model self.load_optimizer_state = request.load_optimizer_state From d0d536e9d929455ab2758c1cfa285e9872a053fe Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Mon, 9 Oct 2023 14:02:51 +0200 Subject: [PATCH 07/12] Rework on model storage integration test * fix bug in supervisor * remove data types class * add comprehensive docstrings for model storage manager * improve testing of model storage components * test end-to-end pipeline execution --- .../criteo_1TB/pipelines/exp0_finetune.yml | 3 + .../pipelines/exp1_finetune_ablation.yml | 3 + .../pipelines/exp2_retrain_keep_model.yml | 3 + .../pipelines/exp3_retrain_new_model.yml | 3 + .../pipelines/exp4_current_day_only.yml | 3 + benchmark/mnist/mnist.yaml | 2 - .../example_pipelines/arxiv.yaml | 3 + .../example_pipelines/fmow.yaml | 3 + .../example_pipelines/huffpost.yaml | 3 + .../example_pipelines/yearbook.yaml | 3 + .../integrationtest_model_storage.py | 211 ++++++++++++------ modyn/config/examples/example-pipeline.yaml | 5 +- modyn/config/schema/modyn_config_schema.yaml | 2 + modyn/config/schema/pipeline-schema.yaml | 3 +- modyn/evaluator/internal/pytorch_evaluator.py | 2 +- .../metadata_database_connection.py | 2 +- .../utils/model_storage_strategy_config.py | 27 ++- .../internal/model_storage_manager.py | 48 ++-- .../abstract_model_storage_strategy.py | 17 +- .../sub_difference_operator.py | 4 +- .../xor_difference_operator.py | 4 +- .../full_model_strategies/__init__.py | 2 +- ...sed_full_model.py => binary_full_model.py} | 16 +- .../pytorch_full_model.py | 13 +- .../weights_difference.py | 47 ++-- .../model_storage/internal/utils/__init__.py | 1 - .../internal/utils/data_types.py | 49 ---- modyn/model_storage/model_storage.py | 21 +- modyn/protos/selector.proto | 1 + .../internal/grpc/selector_grpc_servicer.py | 21 +- .../downsampling_strategies/utils.py | 2 +- 
.../presampling_strategies/utils.py | 2 +- modyn/supervisor/supervisor.py | 2 +- ...ull_model.py => test_binary_full_model.py} | 6 +- .../test_pytorch_full_model.py | 73 +++--- .../test_weights_difference.py | 14 +- .../internal/test_model_storage_manager.py | 29 ++- .../internal/utils/test_data_types.py | 16 -- .../utils/test_model_storage_policy.py | 11 +- .../tests/model_storage/test_model_storage.py | 1 - .../grpc/test_selector_grpc_servicer.py | 2 +- modyn/tests/storage/test_storage.py | 2 + modyn/tests/supervisor/test_supervisor.py | 17 +- modyn/tests/utils/test_utils.py | 45 +++- .../internal/dataset/online_dataset.py | 2 +- .../grpc/trainer_server_grpc_servicer.py | 4 +- .../internal/trainer/pytorch_trainer.py | 2 +- modyn/utils/__init__.py | 3 + modyn/utils/utils.py | 36 +++ 49 files changed, 436 insertions(+), 358 deletions(-) rename modyn/model_storage/internal/storage_strategies/full_model_strategies/{compressed_full_model.py => binary_full_model.py} (54%) delete mode 100644 modyn/model_storage/internal/utils/data_types.py rename modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/{test_compressed_full_model.py => test_binary_full_model.py} (91%) delete mode 100644 modyn/tests/model_storage/internal/utils/test_data_types.py diff --git a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml index 0df038fb6..b8c2c8f66 100644 --- a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml +++ b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml index 62495ba37..c8e291af0 100644 --- a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml +++ b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml @@ 
-40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml index 780656bd4..dfa8a74d1 100644 --- a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml +++ b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml index 1646e561b..e06ba28af 100644 --- a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml +++ b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml index 1a4ec65a1..f7782c920 100644 --- a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml +++ b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml @@ -40,6 +40,9 @@ model: cat_23: 12022 cat_24: 97 cat_25: 35 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml index 4ebb6acdf..f8a9be21c 100644 --- a/benchmark/mnist/mnist.yaml +++ b/benchmark/mnist/mnist.yaml @@ -9,8 +9,6 @@ model: model_storage: full_model_strategy: name: "PyTorchFullModel" - zip: True - zip_algorithm: ZIP_DEFLATED training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml index 
0415e6a61..d2161e813 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml @@ -6,6 +6,9 @@ model: id: ArticleNet config: num_classes: 172 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml index 80bd1aa28..bf25c56ec 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml @@ -6,6 +6,9 @@ model: id: FmowNet config: num_classes: 62 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml index 667522f18..556e92fed 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml @@ -6,6 +6,9 @@ model: id: ArticleNet config: num_classes: 55 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml index d35dd09b5..2dd650266 100644 --- a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml +++ b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml @@ -7,6 +7,9 @@ model: config: num_input_channels: 1 num_classes: 2 +model_storage: + full_model_strategy: + name: "PyTorchFullModel" training: gpus: 1 device: "cuda:0" diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index cd75996fb..5a577e058 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ 
b/integrationtests/model_storage/integrationtest_model_storage.py @@ -4,13 +4,14 @@ import logging import pathlib import shutil +from typing import Optional import grpc import torch from integrationtests.utils import get_modyn_config from modyn.common.ftp import delete_file, download_trained_model, upload_file from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.models import Trigger +from modyn.metadata_database.models import Pipeline, Trigger from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.model_storage.internal.grpc.generated.model_storage_pb2 import ( DeleteModelRequest, @@ -25,16 +26,19 @@ from modyn.utils import calculate_checksum, grpc_connection_established TEST_MODELS_PATH = pathlib.Path("/app") / "model_storage" / "test_models" -TEST_FILE_NAME_LOCAL = "test_model_local.modyn" -TEST_FILE_NAME_LOCAL_RESP = "test_model_local_response.modyn" -TEST_FILE_NAME_REMOTE = "test_model_remote.modyn" -SAMPLE_MODEL = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + +FILE_NAME_PARENT = "test_parent.modyn" +MODEL_PARENT = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) + +FILE_NAME_CHILD = "test_child.modyn" +MODEL_CHILD = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) def create_dummy_file(): pathlib.Path(TEST_MODELS_PATH).mkdir(parents=True, exist_ok=True) - torch.save({"model": SAMPLE_MODEL.model.state_dict(), "metadata": True}, TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL) + for model, file_name in [(MODEL_PARENT, FILE_NAME_PARENT), (MODEL_CHILD, FILE_NAME_CHILD)]: + torch.save({"model": model.model.state_dict(), "metadata": True}, TEST_MODELS_PATH / file_name) def cleanup_models_dir() -> None: @@ -51,52 +55,59 @@ def connect_to_model_storage(config: dict) -> grpc.Channel: return model_storage_channel -def upload_dummy_file_to_trainer(config: dict): - upload_file( - 
config["trainer_server"]["hostname"], - int(config["trainer_server"]["ftp_port"]), - "modyn", - "modyn", - local_file_path=TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL, - remote_file_path=pathlib.Path(TEST_FILE_NAME_REMOTE), - ) - +def upload_dummy_files_to_trainer(config: dict): + for file_name in [FILE_NAME_PARENT, FILE_NAME_CHILD]: + upload_file( + config["trainer_server"]["hostname"], + int(config["trainer_server"]["ftp_port"]), + "modyn", + "modyn", + local_file_path=TEST_MODELS_PATH / file_name, + remote_file_path=pathlib.Path(file_name), + ) -def delete_dummy_file_from_trainer(config: dict): - delete_file( - config["trainer_server"]["hostname"], - int(config["trainer_server"]["ftp_port"]), - "modyn", - "modyn", - pathlib.Path(TEST_FILE_NAME_REMOTE), - ) +def delete_dummy_files_from_trainer(config: dict): + for file_name in [FILE_NAME_PARENT, FILE_NAME_CHILD]: + delete_file( + config["trainer_server"]["hostname"], + int(config["trainer_server"]["ftp_port"]), + "modyn", + "modyn", + pathlib.Path(file_name), + ) -def insert_trigger_into_database(config: dict) -> (int, int): - model_storage_strategy = ModelStorageStrategyConfig("CompressedFullModel") - model_storage_strategy.zip = True - with MetadataDatabaseConnection(config) as database: +def insert_triggers_into_database( + modyn_config: dict, + full_strategy: ModelStorageStrategyConfig, + inc_strategy: Optional[ModelStorageStrategyConfig], + full_model_interval: Optional[int], +) -> (int, int): + with MetadataDatabaseConnection(modyn_config) as database: pipeline_id = database.register_pipeline( - 2, "ResNet18", json.dumps({"num_classes": 10}), False, model_storage_strategy + 2, "ResNet18", json.dumps({"num_classes": 10}), False, full_strategy, inc_strategy, full_model_interval ) - trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) - database.session.add(trigger) + trigger_parent = Trigger(trigger_id=0, pipeline_id=pipeline_id) + trigger_child = Trigger(trigger_id=1, pipeline_id=pipeline_id) + 
database.session.add(trigger_parent) + database.session.add(trigger_child) database.session.commit() - return trigger.pipeline_id, trigger.trigger_id + return pipeline_id, trigger_parent.trigger_id, trigger_child.trigger_id -def delete_data_from_database(config: dict, pipeline_id: int, trigger_id: int): - with MetadataDatabaseConnection(config) as database: +def delete_data_from_database(modyn_config: dict, pipeline_id: int): + with MetadataDatabaseConnection(modyn_config) as database: database.session.query(Trigger).filter( - Trigger.pipeline_id == pipeline_id and Trigger.trigger_id == trigger_id + Trigger.pipeline_id == pipeline_id, ).delete() + database.session.query(Pipeline).filter(Pipeline.pipeline_id == pipeline_id).delete() database.session.commit() -def check_loaded_model(path: pathlib.Path) -> None: +def check_loaded_model(path: pathlib.Path, original_model_state: dict) -> None: with open(path, "rb") as state_file: checkpoint = torch.load(io.BytesIO(state_file.read())) @@ -107,33 +118,26 @@ def check_loaded_model(path: pathlib.Path) -> None: assert checkpoint["metadata"] loaded_state = resnet.model.state_dict() - original_state = SAMPLE_MODEL.model.state_dict() for layer_name, _ in resnet.model.state_dict().items(): - assert torch.all(torch.eq(loaded_state[layer_name], original_state[layer_name])) - - -def test_model_storage(config: dict): - # register pipeline and trigger - pipeline_id, trigger_id = insert_trigger_into_database(config) - - with MetadataDatabaseConnection(config) as database: - model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) - - assert model_class_name == "ResNet18" - assert json.loads(model_config) == {"num_classes": 10} - assert not amp - - model_storage_channel = connect_to_model_storage(config) - model_storage = ModelStorageStub(model_storage_channel) - - # try to register a new model in the model storage + assert torch.allclose(loaded_state[layer_name], original_model_state[layer_name], rtol=1e-04, 
atol=1e-05) + + +def download_and_check_model( + pipeline_id: int, + trigger_id: int, + modyn_config: dict, + model_storage: ModelStorageStub, + file_name: str, + original_model_state: dict, +) -> int: + # try to register a new model at model storage request_register = RegisterModelRequest( pipeline_id=pipeline_id, trigger_id=trigger_id, - hostname=config["trainer_server"]["hostname"], - port=int(config["trainer_server"]["ftp_port"]), - model_path=str(TEST_FILE_NAME_REMOTE), - checksum=calculate_checksum(TEST_MODELS_PATH / TEST_FILE_NAME_LOCAL), + hostname=modyn_config["trainer_server"]["hostname"], + port=int(modyn_config["trainer_server"]["ftp_port"]), + model_path=file_name, + checksum=calculate_checksum(TEST_MODELS_PATH / file_name), ) response_register: RegisterModelResponse = model_storage.RegisterModel(request_register) @@ -149,7 +153,7 @@ def test_model_storage(config: dict): # download the model (dummy file) from model storage downloaded_path = download_trained_model( logging.getLogger(__name__), - config["model_storage"], + modyn_config["model_storage"], remote_path=pathlib.Path(response_fetch.model_path), checksum=response_fetch.checksum, identifier=42, @@ -158,39 +162,106 @@ def test_model_storage(config: dict): assert downloaded_path is not None - # compare if content matches initial dummy file - check_loaded_model(downloaded_path) + # compare if content matches initial dummy file & delete it + check_loaded_model(downloaded_path, original_model_state) + downloaded_path.unlink() + + return model_id + + +def test_model_storage( + modyn_config: dict, + full_strategy: ModelStorageStrategyConfig, + inc_strategy: Optional[ModelStorageStrategyConfig], + full_model_interval: Optional[int], +): + # register pipeline and trigger + pipeline_id, parent_trigger, child_trigger = insert_triggers_into_database( + modyn_config, full_strategy, inc_strategy, full_model_interval + ) + + with MetadataDatabaseConnection(modyn_config) as database: + model_class_name, 
model_config, amp = database.get_model_configuration(pipeline_id) + + assert model_class_name == "ResNet18" + assert json.loads(model_config) == {"num_classes": 10} + assert not amp + + model_storage_channel = connect_to_model_storage(modyn_config) + model_storage = ModelStorageStub(model_storage_channel) + + parent_id = download_and_check_model( + pipeline_id, parent_trigger, modyn_config, model_storage, FILE_NAME_PARENT, MODEL_PARENT.model.state_dict() + ) + + child_id = download_and_check_model( + pipeline_id, child_trigger, modyn_config, model_storage, FILE_NAME_CHILD, MODEL_CHILD.model.state_dict() + ) + + if inc_strategy is not None: + # try to delete parent on model storage + request_delete = DeleteModelRequest(model_id=parent_id) + response_delete: DeleteModelResponse = model_storage.DeleteModel(request_delete) + + assert not response_delete.success - # delete model on model storage component - request_delete = DeleteModelRequest(model_id=model_id) + # delete child on model storage + request_delete = DeleteModelRequest(model_id=child_id) response_delete: DeleteModelResponse = model_storage.DeleteModel(request_delete) assert response_delete.success # fetch a (now) invalid model - request_invalid_fetch = FetchModelRequest(model_id=model_id) + request_invalid_fetch = FetchModelRequest(model_id=child_id) response_invalid_fetch: FetchModelResponse = model_storage.FetchModel(request_invalid_fetch) assert not response_invalid_fetch.success # delete a (now) invalid model - request_invalid_delete = DeleteModelRequest(model_id=model_id) + request_invalid_delete = DeleteModelRequest(model_id=child_id) response_invalid_delete: DeleteModelResponse = model_storage.DeleteModel(request_invalid_delete) assert not response_invalid_delete.success + # delete parent on model storage + request_delete = DeleteModelRequest(model_id=parent_id) + response_delete: DeleteModelResponse = model_storage.DeleteModel(request_delete) + + assert response_delete.success + # clean-up database 
- delete_data_from_database(config, pipeline_id, trigger_id) + delete_data_from_database(modyn_config, pipeline_id) def main() -> None: modyn_config = get_modyn_config() + + pytorch_full = ModelStorageStrategyConfig("PyTorchFullModel") + + compressed_full = ModelStorageStrategyConfig("BinaryFullModel") + compressed_full.zip = True + compressed_full.zip_algorithm = "ZIP_LZMA" + + sub_delta_inc = ModelStorageStrategyConfig("WeightsDifference") + sub_delta_inc.config = json.dumps({"operator": "sub"}) + + xor_full = ModelStorageStrategyConfig("WeightsDifference") + xor_full.zip = True + xor_full.config = json.dumps({"operator": "xor", "split_exponent": True, "rle": True}) + + policies = [ + (pytorch_full, None, None), + (compressed_full, sub_delta_inc, 5), + (pytorch_full, xor_full, 5), + ] try: create_dummy_file() - upload_dummy_file_to_trainer(modyn_config) - test_model_storage(modyn_config) + upload_dummy_files_to_trainer(modyn_config) + + for policy in policies: + test_model_storage(modyn_config, *policy) finally: - delete_dummy_file_from_trainer(modyn_config) + delete_dummy_files_from_trainer(modyn_config) cleanup_models_dir() diff --git a/modyn/config/examples/example-pipeline.yaml b/modyn/config/examples/example-pipeline.yaml index 3c2feb98a..66e664093 100644 --- a/modyn/config/examples/example-pipeline.yaml +++ b/modyn/config/examples/example-pipeline.yaml @@ -9,13 +9,12 @@ model: model_storage: full_model_strategy: name: "PyTorchFullModel" - zip: True - zip_algorithm: ZIP_DEFLATED incremental_model_strategy: name: "WeightsDifference" + zip: True + zip_algorithm: ZIP_DEFLATED config: operator: xor - split_exponent: True full_model_interval: 10 training: gpus: 1 diff --git a/modyn/config/schema/modyn_config_schema.yaml b/modyn/config/schema/modyn_config_schema.yaml index 7cf525916..5227ad24b 100644 --- a/modyn/config/schema/modyn_config_schema.yaml +++ b/modyn/config/schema/modyn_config_schema.yaml @@ -367,6 +367,8 @@ properties: required: - project - storage + 
- evaluator + - model_storage - metadata_database - selector - trainer_server \ No newline at end of file diff --git a/modyn/config/schema/pipeline-schema.yaml b/modyn/config/schema/pipeline-schema.yaml index 03f93b67c..6f898613a 100644 --- a/modyn/config/schema/pipeline-schema.yaml +++ b/modyn/config/schema/pipeline-schema.yaml @@ -48,7 +48,7 @@ properties: name: type: string description: | - Name of the full model strategy. We currently support PyTorchFullModel and CompressedFullModel. + Name of the full model strategy. We currently support PyTorchFullModel and BinaryFullModel. config: type: object description: | @@ -495,6 +495,7 @@ properties: required: - pipeline - model + - model_storage - training - data - trigger diff --git a/modyn/evaluator/internal/pytorch_evaluator.py b/modyn/evaluator/internal/pytorch_evaluator.py index 68dd6b4d9..7115966d3 100644 --- a/modyn/evaluator/internal/pytorch_evaluator.py +++ b/modyn/evaluator/internal/pytorch_evaluator.py @@ -89,7 +89,7 @@ def _load_state(self, path: pathlib.Path) -> None: self._model.model.load_state_dict(checkpoint["model"]) # delete trained model from disk - os.remove(path) + path.unlink() def send_status_to_server(self, batch_number: int) -> None: self._status_response_queue.put({"num_batches": batch_number, "num_samples": self._num_samples}) diff --git a/modyn/metadata_database/metadata_database_connection.py b/modyn/metadata_database/metadata_database_connection.py index 6ad76e3f4..f85ef9618 100644 --- a/modyn/metadata_database/metadata_database_connection.py +++ b/modyn/metadata_database/metadata_database_connection.py @@ -143,7 +143,7 @@ def add_trained_model( pipeline_id: id of the pipeline it was created from. trigger_id: id of the trigger it was created. model_path: path on the local filesystem on which the model is stored. - metadata_path: the path on the local filesystem on which metadata to the model are stored. + metadata_path: the path on the local filesystem where model metadata is stored. 
parent_model: (optional) id of the parent model. Returns: int: Id of the registered model diff --git a/modyn/metadata_database/utils/model_storage_strategy_config.py b/modyn/metadata_database/utils/model_storage_strategy_config.py index 1d2124ff2..f109ab978 100644 --- a/modyn/metadata_database/utils/model_storage_strategy_config.py +++ b/modyn/metadata_database/utils/model_storage_strategy_config.py @@ -1,13 +1,32 @@ +from dataclasses import dataclass from typing import Optional +# pylint: disable=no-name-in-module +from modyn.selector.internal.grpc.generated.selector_pb2 import StrategyConfig + +@dataclass class ModelStorageStrategyConfig: """ - This class is used to hold the configuration options of a model storage strategy. + This class holds all information of a generic model storage strategy. + It is used to insert a given strategy in the metadata database. """ + name: str + zip: bool = False + zip_algorithm: Optional[str] = None + config: Optional[str] = None + def __init__(self, name: str): self.name = name - self.zip: Optional[bool] = None - self.zip_algorithm: Optional[str] = None - self.config: Optional[str] = None + + @classmethod + def from_config(cls, strategy_config: StrategyConfig):  # type: ignore[no-untyped-def] + strategy = cls(strategy_config.name) + if strategy_config.HasField("zip") and strategy_config.zip is not None: + strategy.zip = strategy_config.zip + if strategy_config.HasField("zip_algorithm") and strategy_config.zip_algorithm is not None: + strategy.zip_algorithm = strategy_config.zip_algorithm + if strategy_config.HasField("config") and strategy_config.config is not None: + strategy.config = strategy_config.config.value + return strategy diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py index 78efaa94c..d939b3684 100644 --- a/modyn/model_storage/internal/model_storage_manager.py +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -1,15 +1,13 @@ import json import 
logging -import os import pathlib -import tempfile from typing import Optional import torch from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection from modyn.metadata_database.models import Pipeline, TrainedModel from modyn.model_storage.internal.utils import ModelStoragePolicy -from modyn.utils import current_time_millis, dynamic_module_import, unzip_file, zip_file +from modyn.utils import current_time_millis, dynamic_module_import logger = logging.getLogger(__name__) @@ -64,11 +62,7 @@ def store_model(self, pipeline_id: int, trigger_id: int, checkpoint_path: pathli # now checkpoint only contains optimizer state and metadata. local_metadata_filename = f"{current_time_millis()}_{pipeline_id}_{trigger_id}.metadata.zip" metadata_path = self._storage_dir / local_metadata_filename - - # zip the metadata file. - with tempfile.NamedTemporaryFile(dir=self._ftp_dir) as temp_file: - torch.save(checkpoint, temp_file) - zip_file(pathlib.Path(temp_file.name), metadata_path) + torch.save(checkpoint, metadata_path) # add the new model to the database. with MetadataDatabaseConnection(self._modyn_config) as database: @@ -107,7 +101,7 @@ def _handle_new_model( parent_model_state = self._get_base_model_state(pipeline_id) # load model state of the parent model. - self._reconstruct_model(parent_model_id, parent_model_state, policy) + parent_model_state = self._reconstruct_model_state(parent_model_id, parent_model_state, policy) # finally store the model delta. policy.incremental_model_strategy.store_model(state_dict, parent_model_state, model_path) @@ -121,7 +115,7 @@ def _handle_new_model( def _get_base_model_state(self, pipeline_id: int) -> dict: """ - Get the base model state associated with a pipeline. + Get a randomly initialized model associated with the pipeline. Args: pipeline_id: the involved pipeline. 
@@ -137,19 +131,19 @@ def _get_base_model_state(self, pipeline_id: int) -> dict: model_handler = getattr(model_module, model_class_name) return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() - def _reconstruct_model(self, model_id: int, model_state: dict, policy: ModelStoragePolicy) -> None: + def _reconstruct_model_state(self, model_id: int, model_state: dict, policy: ModelStoragePolicy) -> dict: """ - Reconstruct the model given the model state and the model storage policy. - The function recursively call itself, if the model is stored as a model delta. - In this case it first loads the (fully stored) parent model into the model state before overwriting it - according to the incremental model storage policy. + Reconstruct a given model according to the model storage policy. + The function recursively calls itself whenever the model is stored as a delta. + Otherwise it is stored according to a full model strategy and the model state can be retrieved. + Finally, the model_state is overwritten by the state of the inquired model. Args: model_id: the identifier of the model to be reconstructed. - model_state: the plain model state (or the loaded parent model state). - policy: the model storage policy containing the strategies. + model_state: a random model state (or the loaded parent model state). + policy: the model storage policy of the pipeline. Returns: - None: the model state is overwritten in order to minimize memory overhead. + dict: the reconstructed model state. Refers to the same object as model_state. """ # we recursively overwrite the model state. @@ -158,14 +152,16 @@ def _reconstruct_model(self, model_id: int, model_state: dict, policy: ModelStor if not model.parent_model: # base case: we can load a fully stored model. policy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path) - return + return model_state # recursive step: we recurse to load the model state of the parent model. 
- self._reconstruct_model(model.parent_model, model_state, policy) + model_state = self._reconstruct_model_state(model.parent_model, model_state, policy) # we apply the incremental strategy to load our model state. policy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + return model_state + def _get_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[int]: """ Get the id of the parent model given the trigger id of a pipeline. @@ -210,15 +206,11 @@ def load_model(self, model_id: int, metadata: bool) -> Optional[dict]: # retrieve the model by loading its state dictionary. model_state = self._get_base_model_state(model.pipeline_id) - self._reconstruct_model(model_id, model_state, policy) - model_dict = {"model": model_state} + model_dict = {"model": self._reconstruct_model_state(model_id, model_state, policy)} # append the metadata to the dictionary if specified. if metadata: - with tempfile.NamedTemporaryFile() as temp_file: - temp_file_path = pathlib.Path(temp_file.name) - unzip_file(self._storage_dir / model.metadata_path, temp_file_path) - metadata_dict = torch.load(temp_file_path) + metadata_dict = torch.load(self._storage_dir / model.metadata_path) model_dict.update(metadata_dict) return model_dict @@ -246,8 +238,8 @@ def delete_model(self, model_id: int) -> bool: logger.info(f"Model {model_id} has depending child models: {', '.join(child_ids)}") return False - os.remove(self._storage_dir / model.model_path) - os.remove(self._storage_dir / model.metadata_path) + (self._storage_dir / model.model_path).unlink() + (self._storage_dir / model.metadata_path).unlink() database.session.delete(model) database.session.commit() diff --git a/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py b/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py index 48b458926..d4f01cc3a 100644 --- 
a/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py +++ b/modyn/model_storage/internal/storage_strategies/abstract_model_storage_strategy.py @@ -1,5 +1,5 @@ import pathlib -from abc import ABC, abstractmethod +from abc import ABC from zipfile import ZIP_DEFLATED from modyn.utils import dynamic_module_import @@ -10,7 +10,7 @@ class AbstractModelStorageStrategy(ABC): Base class for all model storage strategies. """ - def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str): """ Initialize a model storage strategy. @@ -18,25 +18,12 @@ def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm zipping_dir: directory, in which the model is zipped. zip_activated: whether the generated file is zipped. zip_algorithm_name: name of the zip algorithm. - config: configuration options for the strategy. """ self.zipping_dir = zipping_dir self.zip = zip_activated self.zip_algorithm = ZIP_DEFLATED self._validate_zip_config(zip_algorithm_name) - self.validate_config(config) - - @abstractmethod - def validate_config(self, config: dict) -> None: - """ - Validates the strategy-dependent configuration options. - - Args: - config: the configuration options. 
- """ - raise NotImplementedError() - def _validate_zip_config(self, zip_algorithm_name: str) -> None: if self.zip and zip_algorithm_name: zip_module = dynamic_module_import("zipfile") diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py b/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py index 2d4efec4c..343b5d0a0 100644 --- a/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/sub_difference_operator.py @@ -1,6 +1,6 @@ import torch from modyn.model_storage.internal.storage_strategies.abstract_difference_operator import AbstractDifferenceOperator -from modyn.model_storage.internal.utils.data_types import read_tensor_from_bytes +from modyn.utils import reconstruct_tensor_from_bytes class SubDifferenceOperator(AbstractDifferenceOperator): @@ -11,5 +11,5 @@ def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> byt @staticmethod def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: - difference_tensor = read_tensor_from_bytes(tensor_prev, buffer) + difference_tensor = reconstruct_tensor_from_bytes(tensor_prev, buffer) return tensor_prev + difference_tensor diff --git a/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py b/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py index 8272ef2cd..7c0883924 100644 --- a/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py +++ b/modyn/model_storage/internal/storage_strategies/difference_operators/xor_difference_operator.py @@ -1,6 +1,6 @@ import torch from modyn.model_storage.internal.storage_strategies.abstract_difference_operator import AbstractDifferenceOperator -from modyn.model_storage.internal.utils import read_tensor_from_bytes +from 
modyn.utils import reconstruct_tensor_from_bytes class XorDifferenceOperator(AbstractDifferenceOperator): @@ -15,4 +15,4 @@ def calculate_difference(tensor: torch.Tensor, tensor_prev: torch.Tensor) -> byt def restore(tensor_prev: torch.Tensor, buffer: bytes) -> torch.Tensor: prev_model_data = tensor_prev.numpy().tobytes() new_model_data = bytes(a ^ b for (a, b) in zip(prev_model_data, buffer)) - return read_tensor_from_bytes(tensor_prev, new_model_data) + return reconstruct_tensor_from_bytes(tensor_prev, new_model_data) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py index 31585b38f..c185068a9 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/__init__.py @@ -7,7 +7,7 @@ import os from .abstract_full_model_strategy import AbstractFullModelStrategy # noqa: F401 -from .compressed_full_model import CompressedFullModel # noqa: F401 +from .binary_full_model import BinaryFullModel # noqa: F401 from .pytorch_full_model import PyTorchFullModel # noqa: F401 files = os.listdir(os.path.dirname(__file__)) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py similarity index 54% rename from modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py rename to modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py index 9f3a0edd7..499de5b8b 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/compressed_full_model.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py @@ -1,15 +1,18 @@ -import math import pathlib from 
modyn.model_storage.internal.storage_strategies.full_model_strategies import AbstractFullModelStrategy -from modyn.model_storage.internal.utils import read_tensor_from_bytes, torch_dtype_to_byte_size +from modyn.utils import get_tensor_byte_size, reconstruct_tensor_from_bytes -class CompressedFullModel(AbstractFullModelStrategy): +class BinaryFullModel(AbstractFullModelStrategy): """ This full model strategy stores the weights as binary sequence. """ + # pylint: disable-next=unused-argument + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): + super().__init__(zipping_dir, zip_activated, zip_algorithm_name) + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: with open(file_path, "wb") as file: for tensor in model_state.values(): @@ -18,8 +21,5 @@ def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: with open(file_path, "rb") as file: for layer, tensor in base_model_state.items(): - num_bytes = math.prod(tensor.shape) * torch_dtype_to_byte_size[tensor.dtype] - base_model_state[layer] = read_tensor_from_bytes(tensor, file.read(num_bytes)) - - def validate_config(self, config: dict) -> None: - pass + num_bytes = get_tensor_byte_size(tensor) + base_model_state[layer] = reconstruct_tensor_from_bytes(tensor, file.read(num_bytes)) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py index 02e344bb3..58f3bc0cc 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py @@ -1,3 +1,4 @@ +import logging import pathlib import torch @@ -5,17 +6,23 @@ AbstractFullModelStrategy, ) +logger = 
logging.getLogger(__name__) + class PyTorchFullModel(AbstractFullModelStrategy): """ This full model strategy naively stores the whole model on disk (default pytorch implementation). """ + # pylint: disable-next=unused-argument + def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): + super().__init__(zipping_dir, False, zip_algorithm_name) + + if zip_activated: + logger.warning("The zipping option is disabled for this strategy since its already performed natively.") + def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: torch.save(model_state, file_path) def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: base_model_state.update(torch.load(file_path)) - - def validate_config(self, config: dict) -> None: - pass diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py index 70299982a..78e90dfb4 100644 --- a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py @@ -1,5 +1,4 @@ import io -import math import pathlib from typing import BinaryIO, Union @@ -12,7 +11,7 @@ from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import ( AbstractIncrementalModelStrategy, ) -from modyn.model_storage.internal.utils import torch_dtype_to_byte_size +from modyn.utils import get_tensor_byte_size available_difference_operators = {"xor": XorDifferenceOperator, "sub": SubDifferenceOperator} @@ -24,11 +23,19 @@ class WeightsDifference(AbstractIncrementalModelStrategy): """ def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm_name: str, config: dict): - self.difference_operator = SubDifferenceOperator - self.split_exponent = False 
- self.rle = False + super().__init__(zipping_dir, zip_activated, zip_algorithm_name) + + self._validate_config(config) - super().__init__(zipping_dir, zip_activated, zip_algorithm_name, config) + def _validate_config(self, config: dict) -> None: + self.difference_operator = SubDifferenceOperator + if "operator" in config: + difference_operator_name = config["operator"] + if difference_operator_name not in available_difference_operators: + raise ValueError(f"Operator should be one of {available_difference_operators}.") + self.difference_operator = available_difference_operators[difference_operator_name] + self.split_exponent = config["split_exponent"] if "split_exponent" in config else False + self.rle = config["rle"] if "rle" in config else False def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pathlib.Path) -> None: bytestream = io.BytesIO() @@ -37,7 +44,7 @@ def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pat for tensor_model, tensor_prev_model in zip(model_state.values(), prev_model_state.values()): difference = self.difference_operator.calculate_difference(tensor_model, tensor_prev_model) - if exponent_bytestream and tensor_model.dtype == torch.float32: + if exponent_bytestream is not None and tensor_model.dtype == torch.float32: for i in range(0, len(difference), 4): reordered_diff = self.reorder_buffer(difference[i : i + 4]) bytestream.write(reordered_diff[0:3]) @@ -46,10 +53,10 @@ def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pat bytestream.write(difference) with open(file_path, "wb") as file: - if exponent_bytestream: + if exponent_bytestream is not None: exponents = exponent_bytestream.getvalue() if self.rle: - exponents = self.rle_bytes(exponents) + exponents = self.encode_bytes(exponents) file.write(len(exponents).to_bytes(8, byteorder="big")) file.write(exponents) file.write(bytestream.getbuffer().tobytes()) @@ -58,9 +65,7 @@ def _load_model(self, prev_model_state: 
dict, file_path: pathlib.Path) -> None: with open(file_path, "rb") as file: if not self.split_exponent: for layer_name, tensor in prev_model_state.items(): - shape = tensor.shape - num_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor.dtype] - + num_bytes = get_tensor_byte_size(tensor) prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) else: self._load_model_split_exponent(prev_model_state, file) @@ -70,13 +75,12 @@ def _load_model_split_exponent(self, prev_model_state: dict, file: BinaryIO) -> with io.BytesIO() as exponent_bytes: exponent_bytes.write( - self.inv_rle_bytes(file.read(exponent_bytes_amount)) if self.rle else file.read(exponent_bytes_amount) + self.decode_bytes(file.read(exponent_bytes_amount)) if self.rle else file.read(exponent_bytes_amount) ) exponent_bytes.seek(0) for layer_name, tensor in prev_model_state.items(): - shape = tensor.shape - num_bytes = math.prod(shape) * torch_dtype_to_byte_size[tensor.dtype] + num_bytes = get_tensor_byte_size(tensor) if tensor.dtype == torch.float32: buffer = bytearray(num_bytes) @@ -102,7 +106,7 @@ def reorder_buffer(buffer: Union[bytes, bytearray]) -> bytes: return bit_array.bytes @staticmethod - def rle_bytes(buffer: bytes) -> bytes: + def encode_bytes(buffer: bytes) -> bytes: """ Perform byte-wise run-length encoding. @@ -133,7 +137,7 @@ def rle_bytes(buffer: bytes) -> bytes: return bytestream.getvalue() @staticmethod - def inv_rle_bytes(buffer: bytes) -> bytes: + def decode_bytes(buffer: bytes) -> bytes: """ Decode run-length encoded bytes. 
@@ -151,12 +155,3 @@ def inv_rle_bytes(buffer: bytes) -> bytes: bytestream.write(count * buffer[i + 1 : i + 2]) return bytestream.getvalue() - - def validate_config(self, config: dict) -> None: - if "operator" in config: - difference_operator_name = config["operator"] - if difference_operator_name not in available_difference_operators: - raise ValueError(f"Operator should be one of {available_difference_operators}.") - self.difference_operator = available_difference_operators[difference_operator_name] - self.split_exponent = config["split_exponent"] if "split_exponent" in config else False - self.rle = config["rle"] if "rle" in config else False diff --git a/modyn/model_storage/internal/utils/__init__.py b/modyn/model_storage/internal/utils/__init__.py index 0dc282c60..56eeadaac 100644 --- a/modyn/model_storage/internal/utils/__init__.py +++ b/modyn/model_storage/internal/utils/__init__.py @@ -6,7 +6,6 @@ import os -from .data_types import read_tensor_from_bytes, torch_dtype_to_byte_size, torch_dtype_to_numpy_dict # noqa: F401 from .model_storage_policy import ModelStoragePolicy # noqa: F401 files = os.listdir(os.path.dirname(__file__)) diff --git a/modyn/model_storage/internal/utils/data_types.py b/modyn/model_storage/internal/utils/data_types.py deleted file mode 100644 index 701c20df1..000000000 --- a/modyn/model_storage/internal/utils/data_types.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -This class provides useful functionalities for different data types and conversions between them. 
-""" -import numpy as np -import torch - -torch_dtype_to_numpy_dict = { - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.int64, - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.complex64: np.complex64, - torch.complex128: np.complex128, -} - -torch_dtype_to_byte_size = { - torch.uint8: 1, - torch.int8: 1, - torch.int16: 2, - torch.int32: 4, - torch.int64: 8, - torch.float16: 2, - torch.float32: 4, - torch.float64: 8, - torch.complex64: 8, - torch.complex128: 16, -} - - -def read_tensor_from_bytes(tensor: torch.Tensor, buffer: bytes) -> torch.Tensor: - """ - Reconstruct a tensor from bytes. - - Args: - tensor: the template for the reconstructed tensor. - buffer: the serialized tensor information. - - Returns: - Tensor: the reconstructed tensor. - """ - np_dtype = np.dtype(torch_dtype_to_numpy_dict[tensor.dtype]) - np_dtype = np_dtype.newbyteorder("<") - np_array = np.frombuffer(buffer, dtype=np_dtype) - array_tensor = torch.tensor(np.array(np_array)) - return torch.reshape(array_tensor, tensor.shape) diff --git a/modyn/model_storage/model_storage.py b/modyn/model_storage/model_storage.py index 737003829..8a6550e28 100644 --- a/modyn/model_storage/model_storage.py +++ b/modyn/model_storage/model_storage.py @@ -11,34 +11,33 @@ class ModelStorage: def __init__(self, config: dict) -> None: self.config = config self._init_model_storage_directory() - self._setup_ftp_directory() def _init_model_storage_directory(self) -> None: - self.model_storage_directory = pathlib.Path(self.config["model_storage"]["models_directory"]) + model_storage_directory = pathlib.Path(self.config["model_storage"]["models_directory"]) - if not self.model_storage_directory.exists(): + if not model_storage_directory.exists(): raise ValueError( - f"The model storage directory {self.model_storage_directory} does not exist. 
\ + f"The model storage directory {model_storage_directory} does not exist. \ Please create the directory or mount another, existing directory." ) - if not is_directory_writable(self.model_storage_directory): + if not is_directory_writable(model_storage_directory): raise ValueError( - f"The model storage directory {self.model_storage_directory} is not writable. \ + f"The model storage directory {model_storage_directory} is not writable. \ Please check the directory permissions and try again.\n" - + f"Directory info: {os.stat(self.model_storage_directory)}" + + f"Directory info: {os.stat(model_storage_directory)}" ) - def _setup_ftp_directory(self) -> None: - self.ftp_directory = pathlib.Path(os.getcwd()) / "ftp_model_storage" + self.models_directory = model_storage_directory / "models" + self.models_directory.mkdir(exist_ok=True) + self.ftp_directory = model_storage_directory / "ftp" if self.ftp_directory.exists() and self.ftp_directory.is_dir(): shutil.rmtree(self.ftp_directory) - self.ftp_directory.mkdir(exist_ok=False) def run(self) -> None: - with GRPCServer(self.config, self.model_storage_directory, self.ftp_directory) as server: + with GRPCServer(self.config, self.models_directory, self.ftp_directory) as server: with FTPServer(self.config["model_storage"]["ftp_port"], self.ftp_directory): server.wait_for_termination() diff --git a/modyn/protos/selector.proto b/modyn/protos/selector.proto index 7f5658b09..4bbdc157f 100644 --- a/modyn/protos/selector.proto +++ b/modyn/protos/selector.proto @@ -33,6 +33,7 @@ message StrategyConfig { optional JsonString config = 4; } +// TODO(#302): Remove this when reworking pipeline registration message ModelStoragePolicyInfo { StrategyConfig full_model_strategy_config = 1; optional StrategyConfig incremental_model_strategy_config = 2; diff --git a/modyn/selector/internal/grpc/selector_grpc_servicer.py b/modyn/selector/internal/grpc/selector_grpc_servicer.py index f7b2b7904..a2b02a7a1 100644 --- 
a/modyn/selector/internal/grpc/selector_grpc_servicer.py +++ b/modyn/selector/internal/grpc/selector_grpc_servicer.py @@ -3,9 +3,9 @@ from typing import Iterable, Optional import grpc +from modyn.metadata_database.utils import ModelStorageStrategyConfig # pylint: disable=no-name-in-module -from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.grpc.generated.selector_pb2 import ( AvailableLabelsResponse, DataInformRequest, @@ -26,7 +26,6 @@ SeedSelectorResponse, SelectionStrategyResponse, StatusBarScaleResponse, - StrategyConfig, TriggerResponse, UsesWeightsRequest, UsesWeightsResponse, @@ -47,10 +46,11 @@ def __init__(self, selector_manager: SelectorManager, sample_batch_size: int): self.selector_manager = selector_manager self._sample_batch_size = sample_batch_size + # TODO(#302): Remove this when reworking pipeline registration def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.ServicerContext) -> PipelineResponse: logger.info(f"Registering pipeline with request - {str(request)}") - full_model_strategy = self.get_model_storage_strategy_config( + full_model_strategy = ModelStorageStrategyConfig.from_config( request.model_storage_policy.full_model_strategy_config ) @@ -59,14 +59,14 @@ def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.Serv request.model_storage_policy.HasField("incremental_model_strategy_config") and request.model_storage_policy.incremental_model_strategy_config is not None ): - incremental_model_strategy = self.get_model_storage_strategy_config( + incremental_model_strategy = ModelStorageStrategyConfig.from_config( request.model_storage_policy.incremental_model_strategy_config ) full_model_interval: Optional[int] = None if ( request.model_storage_policy.HasField("full_model_interval") - and request.model_storage_strategy.full_model_interval is not None + and request.model_storage_policy.full_model_interval is not None ): full_model_interval = 
request.model_storage_policy.full_model_interval @@ -82,17 +82,6 @@ def register_pipeline(self, request: RegisterPipelineRequest, context: grpc.Serv ) return PipelineResponse(pipeline_id=pipeline_id) - @staticmethod - def get_model_storage_strategy_config(strategy_config: StrategyConfig) -> ModelStorageStrategyConfig: - strategy = ModelStorageStrategyConfig(strategy_config.name) - if strategy_config.HasField("zip") and strategy_config.zip is not None: - strategy.zip = strategy_config.zip - if strategy_config.HasField("zip_algorithm") and strategy_config.zip is not None: - strategy.zip_algorithm = strategy_config.zip_algorithm - if strategy_config.HasField("config") and strategy_config.config is not None: - strategy.config = strategy_config.config.value - return strategy - def get_sample_keys_and_weights( # pylint: disable-next=unused-argument self, request: GetSamplesRequest, context: grpc.ServicerContext ) -> Iterable[SamplesResponse]: diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py index 45c0041d9..2b61bbe3f 100644 --- a/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/utils.py @@ -1,5 +1,5 @@ from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy -from modyn.utils.utils import instantiate_class +from modyn.utils import instantiate_class def instantiate_downsampler(config: dict, maximum_keys_in_memory: int) -> AbstractDownsamplingStrategy: diff --git a/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py b/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py index 7c7a0bc87..8f6e18a90 100644 --- a/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py +++ 
b/modyn/selector/internal/selector_strategies/presampling_strategies/utils.py @@ -1,5 +1,5 @@ from modyn.selector.internal.selector_strategies.presampling_strategies import AbstractPresamplingStrategy -from modyn.utils.utils import instantiate_class +from modyn.utils import instantiate_class def instantiate_presampler(config: dict, modyn_config: dict, pipeline_id: int) -> AbstractPresamplingStrategy: diff --git a/modyn/supervisor/supervisor.py b/modyn/supervisor/supervisor.py index d95c1ead6..1760ffffa 100644 --- a/modyn/supervisor/supervisor.py +++ b/modyn/supervisor/supervisor.py @@ -302,7 +302,7 @@ def wait_for_new_data(self, start_timestamp: int) -> None: self.status_bar.update(demo="Fetching new data") trigger_occured = False largest_keys = set() - for new_data in self.grpc.get_new_data_since(dataset_id, last_timestamp): + for new_data, _ in self.grpc.get_new_data_since(dataset_id, last_timestamp): # Since get_new_data_since is inclusive, we need to filter out the keys # we have already processed in the previous get_new_data_since request new_data = [ diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py similarity index 91% rename from modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py rename to modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py index 6003f0dcb..677d99a8a 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_compressed_full_model.py +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py @@ -2,7 +2,7 @@ import tempfile import torch -from modyn.model_storage.internal.storage_strategies.full_model_strategies import CompressedFullModel +from 
modyn.model_storage.internal.storage_strategies.full_model_strategies import BinaryFullModel class MockModel(torch.nn.Module): @@ -16,7 +16,7 @@ def forward(self, data): def test_store_model(): model = MockModel() - full_model_strategy = CompressedFullModel( + full_model_strategy = BinaryFullModel( zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} ) with tempfile.NamedTemporaryFile() as temporary_file: @@ -30,7 +30,7 @@ def test_store_model(): def test_load_model(): model = MockModel() - full_model_strategy = CompressedFullModel( + full_model_strategy = BinaryFullModel( zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={} ) with tempfile.NamedTemporaryFile() as temporary_file: diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py index ec45bac99..4cc7496f9 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py @@ -1,10 +1,17 @@ import pathlib import tempfile -from zipfile import ZIP_DEFLATED import torch from modyn.model_storage.internal.storage_strategies.full_model_strategies import PyTorchFullModel -from modyn.utils import unzip_file, zip_file + + +class MockModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32)) + + def forward(self, data): + return data def test_store_model(): @@ -14,28 +21,12 @@ def test_store_model(): with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - full_model_strategy.store_model({"conv_1": True}, temp_file_path) + model = MockModel() + full_model_strategy.store_model(model.state_dict(), temp_file_path) 
loaded_state = torch.load(temp_file_path) - assert loaded_state["conv_1"] - - -def test_store_model_zipped(): - full_model_strategy = PyTorchFullModel( - zipping_dir=pathlib.Path(), zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={} - ) - with tempfile.TemporaryDirectory() as temp_directory: - directory_path = pathlib.Path(temp_directory) - - zipped_file_path = directory_path / "zipped.model" - full_model_strategy.store_model({"conv_1": True}, zipped_file_path) - - unzipped_file_path = pathlib.Path(directory_path / "unzipped.model") - unzip_file(zipped_file_path, unzipped_file_path, compression=ZIP_DEFLATED) - - loaded_state = torch.load(unzipped_file_path) - assert loaded_state["conv_1"] + assert loaded_state["_weight"][0] == 1.0 def test_load_model(): @@ -45,42 +36,28 @@ def test_load_model(): with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) - torch.save({"conv_1": True}, temp_file_path) + model = MockModel() + torch.save(model.state_dict(), temp_file_path) - state_dict = {"conv_1": False} + model._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) + state_dict = model.state_dict() full_model_strategy.load_model(state_dict, temp_file_path) - assert state_dict["conv_1"] - - -def test_load_model_zipped(): - with tempfile.TemporaryDirectory() as temp_directory: - directory_path = pathlib.Path(temp_directory) - full_model_strategy = PyTorchFullModel( - zipping_dir=directory_path, zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={} - ) - - model_path = directory_path / "basic.model" - torch.save({"conv_1": True}, model_path) - zipped_model_path = directory_path / "zipped.model" - zip_file(model_path, zipped_model_path, compression=ZIP_DEFLATED) - - state_dict = {"conv_1": False} - full_model_strategy.load_model(state_dict, zipped_model_path) - - assert state_dict["conv_1"] + assert state_dict["_weight"][0] == 1.0 # pylint: disable=unsubscriptable-object def 
test_store_then_load(): with tempfile.NamedTemporaryFile() as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) full_model_strategy = PyTorchFullModel( - zipping_dir=temp_file_path.parent, zip_activated=True, zip_algorithm_name="ZIP_DEFLATED", config={} + zipping_dir=temp_file_path.parent, zip_activated=False, zip_algorithm_name="", config={} ) - model_state = {"conv_1": True} - full_model_strategy.store_model(model_state, temp_file_path) - loaded_state = {"conv_1": False} - full_model_strategy.load_model(loaded_state, temp_file_path) + model = MockModel() + full_model_strategy.store_model(model.state_dict(), temp_file_path) + + model._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) + state_dict = model.state_dict() + full_model_strategy.load_model(state_dict, temp_file_path) - assert loaded_state["conv_1"] + assert state_dict["_weight"][0] == 1.0 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py index 3eaebabc5..aba51383f 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py +++ b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py @@ -107,26 +107,26 @@ def test_load_model(): def test_rle(): - assert WeightsDifference.rle_bytes(b"") == b"" + assert WeightsDifference.encode_bytes(b"") == b"" - encoded = WeightsDifference.rle_bytes(b"\x00\x00\x02\x01\x01\x01\x00") + encoded = WeightsDifference.encode_bytes(b"\x00\x00\x02\x01\x01\x01\x00") assert encoded == b"\x02\x00\x01\x02\x03\x01\x01\x00" - encoded = WeightsDifference.rle_bytes(512 * b"\x00" + b"\x01") + encoded = WeightsDifference.encode_bytes(512 * b"\x00" + b"\x01") assert encoded == b"\xff\x00\xff\x00\x02\x00\x01\x01" def 
test_inv_rle(): - assert WeightsDifference.inv_rle_bytes(b"") == b"" + assert WeightsDifference.decode_bytes(b"") == b"" - encoded = WeightsDifference.inv_rle_bytes(b"\x02\x00\x01\x02\x03\x01\x01\x00") + encoded = WeightsDifference.decode_bytes(b"\x02\x00\x01\x02\x03\x01\x01\x00") assert encoded == b"\x00\x00\x02\x01\x01\x01\x00" - encoded = WeightsDifference.inv_rle_bytes(b"\xff\x00\xff\x00\x02\x00\x01\x01") + encoded = WeightsDifference.decode_bytes(b"\xff\x00\xff\x00\x02\x00\x01\x01") assert encoded == 512 * b"\x00" + b"\x01" with pytest.raises(AssertionError): - WeightsDifference.inv_rle_bytes(b"\x02\x00\x01") + WeightsDifference.decode_bytes(b"\x02\x00\x01") def test_store_then_load_model(): diff --git a/modyn/tests/model_storage/internal/test_model_storage_manager.py b/modyn/tests/model_storage/internal/test_model_storage_manager.py index 9fdf09eda..ece6f377f 100644 --- a/modyn/tests/model_storage/internal/test_model_storage_manager.py +++ b/modyn/tests/model_storage/internal/test_model_storage_manager.py @@ -14,7 +14,6 @@ from modyn.model_storage.internal.storage_strategies.full_model_strategies import PyTorchFullModel from modyn.model_storage.internal.storage_strategies.incremental_model_strategies import WeightsDifference from modyn.models import ResNet18 -from modyn.utils import unzip_file, zip_file DATABASE = pathlib.Path(os.path.abspath(__file__)).parent / "test_model_storage.database" @@ -35,8 +34,7 @@ def get_modyn_config(): def setup(): - if os.path.exists(DATABASE): - os.remove(DATABASE) + DATABASE.unlink(True) with MetadataDatabaseConnection(get_modyn_config()) as database: database.create_tables() @@ -51,7 +49,7 @@ def setup(): def teardown(): - os.remove(DATABASE) + DATABASE.unlink() class MockModel(torch.nn.Module): @@ -122,9 +120,11 @@ def test__reconstruct_model(): 15, 4, difference_model_file_name, "model.metadata", parent_model=prev_model_id ) - manager._reconstruct_model(curr_model_id, model_state, manager.get_model_storage_policy(1)) + 
reconstructed_state = manager._reconstruct_model_state( + curr_model_id, model_state, manager.get_model_storage_policy(1) + ) - assert model_state["_weight"].item() == 3 # pylint: disable=unsubscriptable-object + assert reconstructed_state["_weight"].item() == 3 # pylint: disable=unsubscriptable-object def test__handle_new_model_full(): @@ -147,7 +147,7 @@ def test__handle_new_model_full(): @patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) -@patch.object(ModelStorageManager, "_reconstruct_model") +@patch.object(ModelStorageManager, "_reconstruct_model_state", return_value=MockModel().state_dict()) @patch.object(ModelStorageManager, "_get_parent_model_id", return_value=101) def test__handle_new_model_incremental( previous_model_mock, reconstruct_model_mock: MagicMock, base_model_state_mock: MagicMock @@ -183,10 +183,9 @@ def test_get_model_storage_policy(): ) full_model_strategy = ModelStorageStrategyConfig(name="PyTorchFullModel") - full_model_strategy.zip = True full_model_strategy.zip_algorithm = "ZIP_DEFLATED" inc_model_strategy = ModelStorageStrategyConfig(name="WeightsDifference") - inc_model_strategy.zip = False + inc_model_strategy.zip = True inc_model_strategy.config = json.dumps({"operator": "sub"}) complex_pipeline = database.register_pipeline( 75, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy, inc_model_strategy, 10 @@ -200,10 +199,10 @@ def test_get_model_storage_policy(): assert not policy.full_model_strategy.zip complex_policy = manager.get_model_storage_policy(complex_pipeline) - assert complex_policy.full_model_strategy.zip + assert not complex_policy.full_model_strategy.zip assert complex_policy.full_model_strategy.zip_algorithm == ZIP_DEFLATED assert complex_policy.incremental_model_strategy - assert not complex_policy.incremental_model_strategy.zip + assert complex_policy.incremental_model_strategy.zip 
@patch("modyn.model_storage.internal.model_storage_manager.current_time_millis", return_value=100) @@ -237,8 +236,7 @@ def test_store_model(base_model_mock, current_time_mock): with open(temp_directory_path / model.model_path, "rb") as model_file: assert model_file.read() == b"\x00\x00\x00\x40" - unzip_file(temp_directory_path / model.metadata_path, temp_directory_path / "unzipped.metadata") - assert torch.load(temp_directory_path / "unzipped.metadata")["metadata"] + assert torch.load(temp_directory_path / model.metadata_path)["metadata"] loaded_model = manager.load_model(model_id, True) @@ -247,7 +245,7 @@ def test_store_model(base_model_mock, current_time_mock): def test_store_model_resnet(): - full_model_strategy = ModelStorageStrategyConfig(name="CompressedFullModel") + full_model_strategy = ModelStorageStrategyConfig(name="BinaryFullModel") full_model_strategy.zip = True with MetadataDatabaseConnection(get_modyn_config()) as database: @@ -300,14 +298,13 @@ def test_load_model_metadata(base_model_mock: MagicMock): model_file_name = "mock.model" with MetadataDatabaseConnection(get_modyn_config()) as database: - model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata.zip") + model_id = database.add_trained_model(1, 32, model_file_name, "mock.metadata") policy = manager.get_model_storage_policy(1) policy.full_model_strategy.store_model( get_mock_model_after().state_dict(), temp_directory_path / model_file_name ) torch.save({"metadata": True}, temp_directory_path / "mock.metadata") - zip_file(temp_directory_path / "mock.metadata", temp_directory_path / "mock.metadata.zip") reconstructed_state = manager.load_model(model_id, True) diff --git a/modyn/tests/model_storage/internal/utils/test_data_types.py b/modyn/tests/model_storage/internal/utils/test_data_types.py deleted file mode 100644 index 27234c9d2..000000000 --- a/modyn/tests/model_storage/internal/utils/test_data_types.py +++ /dev/null @@ -1,16 +0,0 @@ -import io - -import torch -from 
modyn.model_storage.internal.utils import read_tensor_from_bytes - - -def test_read_tensor_from_bytes(): - buf = io.BytesIO() - buf.write(b"\x01\x00\x00\x00") - buf.write(b"\x02\x00\x00\x00") - buf.write(b"\x03\x00\x00\x00") - buf.write(b"\x04\x00\x00\x00") - buf.seek(0) - res = read_tensor_from_bytes(torch.ones((2, 2), dtype=torch.int32), buf.getvalue()) - - assert res[0, 0] == 1 and res[0, 1] == 2 and res[1, 0] == 3 and res[1, 1] == 4 diff --git a/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py b/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py index 7594e5910..0f319079a 100644 --- a/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py +++ b/modyn/tests/model_storage/internal/utils/test_model_storage_policy.py @@ -1,6 +1,6 @@ import json import pathlib -from zipfile import ZIP_DEFLATED, ZIP_LZMA +from zipfile import ZIP_LZMA import pytest from modyn.model_storage.internal.storage_strategies.difference_operators import XorDifferenceOperator @@ -20,24 +20,23 @@ def test_extended_model_storage_policy(): zipping_dir=pathlib.Path(), full_model_strategy_name="PyTorchFullModel", full_model_strategy_zip=True, - full_model_strategy_zip_algorithm="ZIP_LZMA", + full_model_strategy_zip_algorithm=None, full_model_strategy_config=None, ) policy.register_incremental_model_strategy( name="WeightsDifference", zip_enabled=True, - zip_algorithm=None, + zip_algorithm="ZIP_LZMA", config=json.dumps({"operator": "xor", "split_exponent": True}), full_model_interval=10, ) assert policy.zipping_dir == pathlib.Path("") - assert policy.full_model_strategy.zip - assert policy.full_model_strategy.zip_algorithm == ZIP_LZMA + assert not policy.full_model_strategy.zip weights_diff_strategy = policy.incremental_model_strategy assert weights_diff_strategy.zip - assert weights_diff_strategy.zip_algorithm == ZIP_DEFLATED + assert weights_diff_strategy.zip_algorithm == ZIP_LZMA assert getattr(weights_diff_strategy, "split_exponent") assert 
isinstance(getattr(weights_diff_strategy, "difference_operator"), XorDifferenceOperator.__class__) diff --git a/modyn/tests/model_storage/test_model_storage.py b/modyn/tests/model_storage/test_model_storage.py index 202fa28ce..ad1df3134 100644 --- a/modyn/tests/model_storage/test_model_storage.py +++ b/modyn/tests/model_storage/test_model_storage.py @@ -39,7 +39,6 @@ def __exit__(self, *args, **kwargs): # pylint: disable=unused-argument @patch.object(ModelStorage, "_init_model_storage_directory", noop_setup_directory) -@patch.object(ModelStorage, "_setup_ftp_directory", noop_setup_directory) def test_model_storage_init(): model_storage = ModelStorage(get_modyn_config()) assert model_storage.config == get_modyn_config() diff --git a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py index 69335f96e..0b7b0f6d5 100644 --- a/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py +++ b/modyn/tests/selector/internal/grpc/test_selector_grpc_servicer.py @@ -74,7 +74,7 @@ def test_register_pipeline(test_register_pipeline: MagicMock): assert arguments[3] == "{}" assert arguments[4] assert arguments[5].name == "PyTorchFullModel" - assert arguments[5].zip is None + assert not arguments[5].zip assert arguments[6] is None assert arguments[7] is None diff --git a/modyn/tests/storage/test_storage.py b/modyn/tests/storage/test_storage.py index 5ba24caa8..8b21f2f8d 100644 --- a/modyn/tests/storage/test_storage.py +++ b/modyn/tests/storage/test_storage.py @@ -54,6 +54,8 @@ def get_minimal_modyn_config() -> dict: }, "selector": {"hostname": "host", "port": "1337"}, "trainer_server": {"hostname": "host", "port": "1337"}, + "evaluator": {"hostname": "host", "port": "1337"}, + "model_storage": {"hostname": "host", "port": "1337", "ftp_port": "1337", "models_directory": "test.dir"}, } diff --git a/modyn/tests/supervisor/test_supervisor.py b/modyn/tests/supervisor/test_supervisor.py index 
8284ab665..cb0ba0348 100644 --- a/modyn/tests/supervisor/test_supervisor.py +++ b/modyn/tests/supervisor/test_supervisor.py @@ -18,6 +18,7 @@ def get_minimal_pipeline_config() -> dict: return { "pipeline": {"name": "Test"}, "model": {"id": "ResNet18"}, + "model_storage": {"full_model_strategy": {"name": "PyTorchFullModel"}}, "training": { "gpus": 1, "device": "cpu", @@ -342,7 +343,7 @@ def test_shutdown_trainer(): pass -@patch.object(GRPCHandler, "get_new_data_since", return_value=[[(10, 42, 0), (11, 43, 1)]]) +@patch.object(GRPCHandler, "get_new_data_since", return_value=[([(10, 42, 0), (11, 43, 1)], {})]) @patch.object(Supervisor, "_handle_new_data", return_value=False, side_effect=KeyboardInterrupt) def test_wait_for_new_data(test__handle_new_data: MagicMock, test_get_new_data_since: MagicMock): # This is a simple test and does not the inclusivity filtering! @@ -353,7 +354,7 @@ def test_wait_for_new_data(test__handle_new_data: MagicMock, test_get_new_data_s test__handle_new_data.assert_called_once_with([(10, 42, 0), (11, 43, 1)]) -@patch.object(GRPCHandler, "get_new_data_since", return_value=[[(10, 42, 0)], [(11, 43, 1)]]) +@patch.object(GRPCHandler, "get_new_data_since", return_value=[([(10, 42, 0)], {}), ([(11, 43, 1)], {})]) @patch.object(Supervisor, "_handle_new_data", return_value=False, side_effect=[None, KeyboardInterrupt]) def test_wait_for_new_data_batched(test__handle_new_data: MagicMock, test_get_new_data_since: MagicMock): # This is a simple test and does not the inclusivity filtering! 
@@ -375,9 +376,9 @@ def test_wait_for_new_data_filtering(): mocked__handle_new_data_return_vals = [True, True, KeyboardInterrupt] mocked_get_new_data_since = [ - [[(10, 42, 0), (11, 43, 0), (12, 43, 1)]], - [[(11, 43, 0), (12, 43, 1), (13, 43, 2), (14, 45, 3)]], - [[]], + [([(10, 42, 0), (11, 43, 0), (12, 43, 1)], {})], + [([(11, 43, 0), (12, 43, 1), (13, 43, 2), (14, 45, 3)], {})], + [([], {})], ValueError, ] @@ -406,9 +407,9 @@ def test_wait_for_new_data_filtering_batched(): mocked__handle_new_data_return_vals = [True, True, True, True, True, KeyboardInterrupt] mocked_get_new_data_since = [ - [[(10, 42, 0), (11, 43, 0)], [(12, 43, 1)]], - [[(11, 43, 0)], [(12, 43, 1), (13, 43, 2)], [(14, 45, 3)]], - [[]], + [([(10, 42, 0), (11, 43, 0)], {}), ([(12, 43, 1)], {})], + [([(11, 43, 0)], {}), ([(12, 43, 1), (13, 43, 2)], {}), ([(14, 45, 3)], {})], + [([], {})], ValueError, ] diff --git a/modyn/tests/utils/test_utils.py b/modyn/tests/utils/test_utils.py index 20bb683ae..5cf171685 100644 --- a/modyn/tests/utils/test_utils.py +++ b/modyn/tests/utils/test_utils.py @@ -1,4 +1,5 @@ # pylint: disable=unused-argument,redefined-outer-name +import io import pathlib import tempfile from unittest.mock import patch @@ -19,9 +20,12 @@ dynamic_module_import, flatten, get_partition_for_worker, + get_tensor_byte_size, grpc_connection_established, + instantiate_class, model_available, package_available_and_can_be_imported, + reconstruct_tensor_from_bytes, seed_everything, trigger_available, unzip_file, @@ -29,7 +33,6 @@ validate_yaml, zip_file, ) -from modyn.utils.utils import instantiate_class @patch.object(GRPCHandler, "init_storage", lambda self: None) @@ -249,3 +252,43 @@ def test_zip_and_unzip_file(): with open(text_file_path, "r", encoding="utf-8") as file: assert file.read() == "This is a testfile!" 
+ + +def test_read_tensor_from_bytes(): + buf = io.BytesIO() + buf.write(b"\x01\x00\x00\x00") + buf.write(b"\x02\x00\x00\x00") + buf.write(b"\x03\x00\x00\x00") + buf.write(b"\x04\x00\x00\x00") + buf.seek(0) + res = reconstruct_tensor_from_bytes(torch.ones((2, 2), dtype=torch.int32), buf.getvalue()) + + assert res[0, 0] == 1 and res[0, 1] == 2 and res[1, 0] == 3 and res[1, 1] == 4 + + buf.seek(0, io.SEEK_END) + buf.write(b"\xff\x00\x00\x00") + buf.write(b"\x0f\x00\x00\x00") + buf.seek(0) + + res = reconstruct_tensor_from_bytes(torch.ones((1, 6), dtype=torch.int32), buf.getvalue()) + assert ( + res[0, 0] == 1 and res[0, 1] == 2 and res[0, 2] == 3 and res[0, 3] == 4 and res[0, 4] == 255 and res[0, 5] == 15 + ) + + buf_floats = io.BytesIO() + buf_floats.write(b"\x00\x00\x00\x3f") + buf_floats.write(b"\x00\x00\x00\x3e") + + res = reconstruct_tensor_from_bytes(torch.ones((2, 1), dtype=torch.float32), buf_floats.getvalue()) + assert res[0, 0] == 1.0 / 2 and res[1, 0] == 1.0 / 8 + + +def test_get_tensor_byte_size(): + tensor = torch.ones((3, 3, 3), dtype=torch.int32) + assert get_tensor_byte_size(tensor) == 3 * 3 * 3 * 4 + + tensor = torch.ones((5, 5), dtype=torch.float64) * 5 + assert get_tensor_byte_size(tensor) == 5 * 5 * 8 + + tensor = torch.ones(10, dtype=torch.float32) + assert get_tensor_byte_size(tensor) == 40 diff --git a/modyn/trainer_server/internal/dataset/online_dataset.py b/modyn/trainer_server/internal/dataset/online_dataset.py index c97becff2..cfcea6632 100644 --- a/modyn/trainer_server/internal/dataset/online_dataset.py +++ b/modyn/trainer_server/internal/dataset/online_dataset.py @@ -13,7 +13,7 @@ ) from modyn.storage.internal.grpc.generated.storage_pb2_grpc import StorageStub from modyn.trainer_server.internal.dataset.key_sources import AbstractKeySource, SelectorKeySource -from modyn.utils.utils import ( +from modyn.utils import ( BYTES_PARSER_FUNC_NAME, MAX_MESSAGE_SIZE, deserialize_function, diff --git 
a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py index a32124e03..d5a87a45f 100644 --- a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py +++ b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py @@ -350,7 +350,7 @@ def store_final_model( logger.error(f"Training with id {training_id} is still running.") return StoreFinalModelResponse(valid_state=False) - final_checkpoint_path = self._prepare_final_model(training_id) + final_checkpoint_path = self._get_final_model_path(training_id) if final_checkpoint_path: prefix_model_path = final_checkpoint_path.relative_to(self._modyn_base_dir) @@ -379,7 +379,7 @@ def store_final_model( logger.error(f"Could not find final checkpoint of training with ID {training_id}.") return StoreFinalModelResponse(valid_state=False) - def _prepare_final_model(self, training_id: int) -> Optional[pathlib.Path]: + def _get_final_model_path(self, training_id: int) -> Optional[pathlib.Path]: final_checkpoint_path = self._training_dict[training_id].final_checkpoint_path / "model_final.modyn" if not final_checkpoint_path.exists(): return None diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index 47c118893..fe1deb234 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -47,10 +47,10 @@ deserialize_function, dynamic_module_import, grpc_connection_established, + instantiate_class, package_available_and_can_be_imported, seed_everything, ) -from modyn.utils.utils import instantiate_class AvailableQueues = Enum("AvailableQueues", ["TRAINING", "DOWNSAMPLING"]) diff --git a/modyn/utils/__init__.py b/modyn/utils/__init__.py index fbd37a71b..efe785d2e 100644 --- a/modyn/utils/__init__.py +++ b/modyn/utils/__init__.py @@ -19,10 +19,13 @@ dynamic_module_import, flatten, 
get_partition_for_worker, + get_tensor_byte_size, grpc_connection_established, + instantiate_class, is_directory_writable, model_available, package_available_and_can_be_imported, + reconstruct_tensor_from_bytes, seed_everything, trigger_available, unzip_file, diff --git a/modyn/utils/utils.py b/modyn/utils/utils.py index 1d34f849c..0eff6e594 100644 --- a/modyn/utils/utils.py +++ b/modyn/utils/utils.py @@ -4,6 +4,7 @@ import importlib.util import inspect import logging +import math import os import pathlib import random @@ -305,3 +306,38 @@ def unzip_file( if remove_file: os.remove(zipped_file_path) + + +def reconstruct_tensor_from_bytes(tensor: torch.Tensor, buffer: bytes) -> torch.Tensor: + """ + Reconstruct a tensor from bytes. + + Args: + tensor: the template for the reconstructed tensor. + buffer: the serialized tensor information. + + Returns: + Tensor: the reconstructed tensor. + """ + reconstructed_tensor = torch.frombuffer(buffer, dtype=tensor.dtype) + return torch.reshape(reconstructed_tensor, tensor.shape) + + +def get_tensor_byte_size(tensor: torch.Tensor) -> int: + """ + Get the amount of bytes needed to represent a tensor in binary format. + + Args: + tensor: the tensor, for which the number of bytes is calculated. + + Returns: + int: the number of bytes needed to represent the tensor. 
+ """ + shape = tensor.shape + if torch.is_floating_point(tensor): + type_size = torch.finfo(tensor.dtype).bits / 8 + else: + type_size = torch.iinfo(tensor.dtype).bits / 8 + num_bytes = int(math.prod(shape) * type_size) + + return num_bytes From e3a4794e6e0108101661481cbe2c5e05ac785413 Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Mon, 9 Oct 2023 14:47:17 +0200 Subject: [PATCH 08/12] Follow-Up * make pylint happy --- modyn/models/articlenet/articlenet.py | 3 +++ .../remote_downsamplers/remote_gradnorm_downsampling.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modyn/models/articlenet/articlenet.py b/modyn/models/articlenet/articlenet.py index 244e5c623..2069253c4 100644 --- a/modyn/models/articlenet/articlenet.py +++ b/modyn/models/articlenet/articlenet.py @@ -43,6 +43,9 @@ def __call__(self, data: torch.Tensor) -> torch.Tensor: # of the entire input sequence return pooled_output + def _reorder_cache(self, past, beam_idx): + pass + class ArticleNetwork(CoresetSupportingModule): def __init__(self, num_classes: int) -> None: diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py index cef9e8aff..af18a690e 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py @@ -33,7 +33,7 @@ def get_scores(self, forward_output: torch.Tensor, target: torch.Tensor) -> torc # softmax to the forward output to obtain the probabilities probs = torch.nn.functional.softmax(forward_output, dim=1) num_classes = forward_output.shape[-1] - one_hot_targets = torch.nn.functional.one_hot(target, num_classes=num_classes) + one_hot_targets = torch.nn.functional.one_hot(target, num_classes=num_classes) # pylint: disable=E1102 scores = torch.norm(probs - one_hot_targets, dim=-1) else: 
sample_losses = self.per_sample_loss_fct(forward_output, target) From ae25ddbc739f8d6e374d04acca9f86e87259e6f0 Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Mon, 9 Oct 2023 15:41:27 +0200 Subject: [PATCH 09/12] Follow-Up * change ArticleNet to implement all abstract methods --- modyn/models/articlenet/articlenet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modyn/models/articlenet/articlenet.py b/modyn/models/articlenet/articlenet.py index 2069253c4..0f4225b41 100644 --- a/modyn/models/articlenet/articlenet.py +++ b/modyn/models/articlenet/articlenet.py @@ -43,7 +43,7 @@ def __call__(self, data: torch.Tensor) -> torch.Tensor: # of the entire input sequence return pooled_output - def _reorder_cache(self, past, beam_idx): + def _reorder_cache(self, past: Any, beam_idx: Any) -> None: pass From 087bb1caf2075f89e96e79262a2182888cbdfa12 Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Wed, 11 Oct 2023 09:18:50 +0200 Subject: [PATCH 10/12] Increase timeout threshold of availability test * from 60 to 90 seconds --- integrationtests/test_docker_compose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrationtests/test_docker_compose.py b/integrationtests/test_docker_compose.py index 81759de72..43d4a6d6c 100644 --- a/integrationtests/test_docker_compose.py +++ b/integrationtests/test_docker_compose.py @@ -7,7 +7,7 @@ from modyn.storage.internal.grpc.generated.storage_pb2_grpc import StorageStub # noqa: F401 from modyn.utils import grpc_connection_established -TIMEOUT = 60 # seconds +TIMEOUT = 90 # seconds def terminate_on_timeout(start_time: int) -> None: From ef25de3e25fc76eb1db915ce21a4eeedbb90dcb9 Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Fri, 20 Oct 2023 08:35:13 +0200 Subject: [PATCH 11/12] Merge Leftovers and Changes to Model Storage Manager - Return model state after loading --- .../integrationtest_model_storage.py | 9 ++- .../internal/model_storage_manager.py | 57 +++++++--------- 
.../abstract_full_model_strategy.py | 12 ++-- .../binary_full_model.py | 3 +- .../pytorch_full_model.py | 3 +- .../abstract_incremental_model_strategy.py | 12 ++-- .../weights_difference.py | 9 +-- modyn/models/articlenet/articlenet.py | 3 - modyn/selector/internal/selector_manager.py | 2 +- .../grpc/test_evaluator_grpc_servicer.py | 6 +- .../test_metadata_database_connection.py | 7 +- .../test_binary_full_model.py | 3 +- .../test_pytorch_full_model.py | 6 +- .../test_weights_difference.py | 11 ++- .../internal/test_model_storage_manager.py | 21 +++--- .../grpc/test_trainer_server_grpc_servicer.py | 6 +- .../grpc/generated/trainer_server_pb2.py | 67 +++++++++---------- .../grpc/generated/trainer_server_pb2.pyi | 14 +--- .../grpc/trainer_server_grpc_servicer.py | 2 +- 19 files changed, 122 insertions(+), 131 deletions(-) diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index cac1bd539..e142788c1 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -86,7 +86,14 @@ def insert_triggers_into_database( ) -> (int, int): with MetadataDatabaseConnection(modyn_config) as database: pipeline_id = database.register_pipeline( - 2, "ResNet18", json.dumps({"num_classes": 10}), False, "{}", full_strategy, inc_strategy, full_model_interval + 2, + "ResNet18", + json.dumps({"num_classes": 10}), + False, + "{}", + full_strategy, + inc_strategy, + full_model_interval, ) trigger_parent = Trigger(trigger_id=0, pipeline_id=pipeline_id) diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py index d939b3684..9d81f8490 100644 --- a/modyn/model_storage/internal/model_storage_manager.py +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -97,11 +97,8 @@ def _handle_new_model( ): parent_model_id: Optional[int] = 
self._get_parent_model_id(pipeline_id, trigger_id) if parent_model_id is not None: - # store the model according to the incremental model strategy. - parent_model_state = self._get_base_model_state(pipeline_id) - # load model state of the parent model. - parent_model_state = self._reconstruct_model_state(parent_model_id, parent_model_state, policy) + parent_model_state = self._reconstruct_model_state(parent_model_id, policy) # finally store the model delta. policy.incremental_model_strategy.store_model(state_dict, parent_model_state, model_path) @@ -113,34 +110,14 @@ def _handle_new_model( policy.full_model_strategy.store_model(state_dict, model_path) return None - def _get_base_model_state(self, pipeline_id: int) -> dict: - """ - Get a randomly initialized model associated with the pipeline. - - Args: - pipeline_id: the involved pipeline. - - Returns: - dict: the plain model state derived from the model architecture of the pipeline's models. - """ - with MetadataDatabaseConnection(self._modyn_config) as database: - model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) - model_module = dynamic_module_import("modyn.models") - assert hasattr(model_module, model_class_name), f"Model {model_class_name} not available." - - model_handler = getattr(model_module, model_class_name) - return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() - - def _reconstruct_model_state(self, model_id: int, model_state: dict, policy: ModelStoragePolicy) -> dict: + def _reconstruct_model_state(self, model_id: int, policy: ModelStoragePolicy) -> dict: """ Reconstruct a given model according to the model storage policy. The function recursively calls itself whenever the model is stored as a delta. Otherwise it is stored according to a full model strategy and the model state can be retrieved. - Finally, the model_state is overwritten by the state of the inquired model. Args: model_id: the identifier of the model to be reconstructed. 
- model_state: a random model state (or the loaded parent model state). policy: the model storage policy of the pipeline. Returns: dict: the reconstructed model state. Refers to the same object as model_state. @@ -151,16 +128,32 @@ def _reconstruct_model_state(self, model_id: int, model_state: dict, policy: Mod model: TrainedModel = database.session.get(TrainedModel, model_id) if not model.parent_model: # base case: we can load a fully stored model. - policy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path) - return model_state + model_state = self._get_base_model_state(model.pipeline_id) + return policy.full_model_strategy.load_model(model_state, self._storage_dir / model.model_path) # recursive step: we recurse to load the model state of the parent model. - model_state = self._reconstruct_model_state(model.parent_model, model_state, policy) + model_state = self._reconstruct_model_state(model.parent_model, policy) # we apply the incremental strategy to load our model state. - policy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + return policy.incremental_model_strategy.load_model(model_state, self._storage_dir / model.model_path) + + def _get_base_model_state(self, pipeline_id: int) -> dict: + """ + Get a randomly initialized model associated with the pipeline. + + Args: + pipeline_id: the involved pipeline. + + Returns: + dict: the plain model state derived from the model architecture of the pipeline's models. + """ + with MetadataDatabaseConnection(self._modyn_config) as database: + model_class_name, model_config, amp = database.get_model_configuration(pipeline_id) + model_module = dynamic_module_import("modyn.models") + assert hasattr(model_module, model_class_name), f"Model {model_class_name} not available." 
- return model_state + model_handler = getattr(model_module, model_class_name) + return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() def _get_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[int]: """ @@ -205,8 +198,8 @@ def load_model(self, model_id: int, metadata: bool) -> Optional[dict]: policy = self.get_model_storage_policy(model.pipeline_id) # retrieve the model by loading its state dictionary. - model_state = self._get_base_model_state(model.pipeline_id) - model_dict = {"model": self._reconstruct_model_state(model_id, model_state, policy)} + model_state = self._reconstruct_model_state(model_id, policy) + model_dict = {"model": model_state} # append the metadata to the dictionary if specified. if metadata: diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py index 794995d04..d13c2c740 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/abstract_full_model_strategy.py @@ -33,21 +33,23 @@ def store_model(self, model_state: dict, file_path: pathlib.Path) -> None: self._store_model(model_state, file_path) @abstractmethod - def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: + def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: """ Load the model state from the given file. Args: base_model_state: the base model state which must be overwritten. file_path: the path to the file that contains the state information. + + Returns: + dict: the state dictionary of the loaded model. 
""" raise NotImplementedError() - def load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: + def load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: if self.zip: with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) unzip_file(file_path, temp_file_path, compression=self.zip_algorithm, remove_file=False) - self._load_model(base_model_state, temp_file_path) - else: - self._load_model(base_model_state, file_path) + return self._load_model(base_model_state, temp_file_path) + return self._load_model(base_model_state, file_path) diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py index 499de5b8b..f8a669ce2 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py +++ b/modyn/model_storage/internal/storage_strategies/full_model_strategies/binary_full_model.py @@ -18,8 +18,9 @@ def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: for tensor in model_state.values(): file.write(tensor.numpy().tobytes()) - def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: + def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: with open(file_path, "rb") as file: for layer, tensor in base_model_state.items(): num_bytes = get_tensor_byte_size(tensor) base_model_state[layer] = reconstruct_tensor_from_bytes(tensor, file.read(num_bytes)) + return base_model_state diff --git a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py index 58f3bc0cc..26c317ab2 100644 --- a/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py +++ 
b/modyn/model_storage/internal/storage_strategies/full_model_strategies/pytorch_full_model.py @@ -24,5 +24,6 @@ def __init__(self, zipping_dir: pathlib.Path, zip_activated: bool, zip_algorithm def _store_model(self, model_state: dict, file_path: pathlib.Path) -> None: torch.save(model_state, file_path) - def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> None: + def _load_model(self, base_model_state: dict, file_path: pathlib.Path) -> dict: base_model_state.update(torch.load(file_path)) + return base_model_state diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py index 8dbcfc42c..de7435b35 100644 --- a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/abstract_incremental_model_strategy.py @@ -34,21 +34,23 @@ def store_model(self, model_state: dict, prev_model_state: dict, file_path: path self._store_model(model_state, prev_model_state, file_path) @abstractmethod - def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: + def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> dict: """ Loads a model state by overwriting the state of the preceding model. Args: prev_model_state: the state of the preceding model. file_path: the path to the file which contains the delta. + + Returns: + dict: the state dictionary of the loaded model. 
""" raise NotImplementedError() - def load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: + def load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> dict: if self.zip: with tempfile.NamedTemporaryFile(dir=self.zipping_dir) as temporary_file: temp_file_path = pathlib.Path(temporary_file.name) unzip_file(file_path, temp_file_path, compression=self.zip_algorithm, remove_file=False) - self._load_model(prev_model_state, temp_file_path) - else: - self._load_model(prev_model_state, file_path) + return self._load_model(prev_model_state, temp_file_path) + return self._load_model(prev_model_state, file_path) diff --git a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py index 78e90dfb4..f86d60277 100644 --- a/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py +++ b/modyn/model_storage/internal/storage_strategies/incremental_model_strategies/weights_difference.py @@ -61,16 +61,16 @@ def _store_model(self, model_state: dict, prev_model_state: dict, file_path: pat file.write(exponents) file.write(bytestream.getbuffer().tobytes()) - def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> None: + def _load_model(self, prev_model_state: dict, file_path: pathlib.Path) -> dict: with open(file_path, "rb") as file: if not self.split_exponent: for layer_name, tensor in prev_model_state.items(): num_bytes = get_tensor_byte_size(tensor) prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) - else: - self._load_model_split_exponent(prev_model_state, file) + return prev_model_state + return self._load_model_split_exponent(prev_model_state, file) - def _load_model_split_exponent(self, prev_model_state: dict, file: BinaryIO) -> None: + def _load_model_split_exponent(self, prev_model_state: dict, file: 
BinaryIO) -> dict: exponent_bytes_amount = int.from_bytes(file.read(8), byteorder="big") with io.BytesIO() as exponent_bytes: @@ -91,6 +91,7 @@ def _load_model_split_exponent(self, prev_model_state: dict, file: BinaryIO) -> prev_model_state[layer_name] = self.difference_operator.restore(tensor, self.reorder_buffer(buffer)) else: prev_model_state[layer_name] = self.difference_operator.restore(tensor, file.read(num_bytes)) + return prev_model_state @staticmethod def reorder_buffer(buffer: Union[bytes, bytearray]) -> bytes: diff --git a/modyn/models/articlenet/articlenet.py b/modyn/models/articlenet/articlenet.py index c9bc3b980..e4f961bdb 100644 --- a/modyn/models/articlenet/articlenet.py +++ b/modyn/models/articlenet/articlenet.py @@ -44,9 +44,6 @@ def __call__(self, data: torch.Tensor) -> torch.Tensor: # of the entire input sequence return pooled_output - def _reorder_cache(self, past: Any, beam_idx: Any) -> None: - pass - class ArticleNetwork(CoresetSupportingModule): def __init__(self, num_classes: int) -> None: diff --git a/modyn/selector/internal/selector_manager.py b/modyn/selector/internal/selector_manager.py index bdcc0c386..7c2df37ad 100644 --- a/modyn/selector/internal/selector_manager.py +++ b/modyn/selector/internal/selector_manager.py @@ -10,8 +10,8 @@ from typing import Any, Optional from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection -from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.metadata_database.models.pipelines import Pipeline +from modyn.metadata_database.utils import ModelStorageStrategyConfig from modyn.selector.internal.selector_strategies.abstract_selection_strategy import AbstractSelectionStrategy from modyn.selector.selector import Selector from modyn.utils.utils import dynamic_module_import, is_directory_writable diff --git a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py index 
42dd36fe6..fb38e07f3 100644 --- a/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py +++ b/modyn/tests/evaluator/internal/grpc/test_evaluator_grpc_servicer.py @@ -48,8 +48,7 @@ def get_modyn_config(): def setup(): - if os.path.exists(DATABASE): - os.remove(DATABASE) + DATABASE.unlink(True) with MetadataDatabaseConnection(get_modyn_config()) as database: database.create_tables() @@ -59,6 +58,7 @@ def setup(): "ResNet18", json.dumps({}), True, + "{}", ModelStorageStrategyConfig(name="PyTorchFullModel"), incremental_model_strategy=None, full_model_interval=None, @@ -68,7 +68,7 @@ def setup(): def teardown(): - os.remove(DATABASE) + DATABASE.unlink() class DummyModelWrapper: diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py b/modyn/tests/metadata_database/test_metadata_database_connection.py index d34a6f91b..a56cab0cf 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -75,7 +75,12 @@ def test_get_model_configuration(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() pipeline_id = database.register_pipeline( - 1, "ResNet18", json.dumps({"num_classes": 10}), True, ModelStorageStrategyConfig(name="PyTorchFullModel") + 1, + "ResNet18", + json.dumps({"num_classes": 10}), + True, + "{}", + ModelStorageStrategyConfig(name="PyTorchFullModel"), ) assert pipeline_id == 1 diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py index 677d99a8a..c5ae70589 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_binary_full_model.py @@ -39,7 +39,6 @@ def test_load_model(): with 
open(temp_file_path, "wb") as stored_model_file: assert stored_model_file.write(b"\x00\x00\x00\x3f\x00\x00\x00\x3f") - state_dict = model.state_dict() - full_model_strategy.load_model(state_dict, temp_file_path) + state_dict = full_model_strategy.load_model(model.state_dict(), temp_file_path) assert state_dict["_weight"][0] == 0.5 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py index 4cc7496f9..530026bc5 100644 --- a/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py +++ b/modyn/tests/model_storage/internal/storage_strategies/full_model_strategies/test_pytorch_full_model.py @@ -40,8 +40,7 @@ def test_load_model(): torch.save(model.state_dict(), temp_file_path) model._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) - state_dict = model.state_dict() - full_model_strategy.load_model(state_dict, temp_file_path) + state_dict = full_model_strategy.load_model(model.state_dict(), temp_file_path) assert state_dict["_weight"][0] == 1.0 # pylint: disable=unsubscriptable-object @@ -57,7 +56,6 @@ def test_store_then_load(): full_model_strategy.store_model(model.state_dict(), temp_file_path) model._weight = torch.nn.Parameter(torch.ones(2, dtype=torch.float32) * 2) - state_dict = model.state_dict() - full_model_strategy.load_model(state_dict, temp_file_path) + state_dict = full_model_strategy.load_model(model.state_dict(), temp_file_path) assert state_dict["_weight"][0] == 1.0 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py index aba51383f..1d5624794 100644 --- 
a/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py +++ b/modyn/tests/model_storage/internal/storage_strategies/incremental_model_strategies/test_weights_difference.py @@ -95,13 +95,12 @@ def test_load_model(): stored_model_file.write(b"\x00\x00\x80\x3f\x00\x00\x80\x3f") for operator in ["xor", "sub"]: - model = MockModel() - model_state = model.state_dict() incremental_strategy = WeightsDifference( zipping_dir=pathlib.Path(), zip_activated=False, zip_algorithm_name="", config={"operator": operator} ) - incremental_strategy.load_model(model_state, temp_file_path) + model = MockModel() + model_state = incremental_strategy.load_model(model.state_dict(), temp_file_path) assert model_state["_weight"][0] == 1 # pylint: disable=unsubscriptable-object @@ -160,7 +159,7 @@ def test_store_then_load_model(): # xor difference of the remaining float32 bytes. assert stored_model_file.read(8) == b"\x00\x00\x00\x00\x00\x00" - incremental_strategy.load_model(before_state, temp_file_path) + state_dict = incremental_strategy.load_model(before_state, temp_file_path) - assert before_state["_bias"][0].item() == 1 # pylint: disable=unsubscriptable-object - assert before_state["_weight"][0].item() == 2 # pylint: disable=unsubscriptable-object + assert state_dict["_bias"][0].item() == 1 # pylint: disable=unsubscriptable-object + assert state_dict["_weight"][0].item() == 2 # pylint: disable=unsubscriptable-object diff --git a/modyn/tests/model_storage/internal/test_model_storage_manager.py b/modyn/tests/model_storage/internal/test_model_storage_manager.py index ece6f377f..7367f9981 100644 --- a/modyn/tests/model_storage/internal/test_model_storage_manager.py +++ b/modyn/tests/model_storage/internal/test_model_storage_manager.py @@ -44,7 +44,7 @@ def setup(): inc_model_strategy.zip = False inc_model_strategy.config = json.dumps({"operator": "sub"}) database.register_pipeline( - 1, "ResNet18", json.dumps({"num_classes": 10}), True, 
full_model_strategy, inc_model_strategy, 5 + 1, "ResNet18", json.dumps({"num_classes": 10}), True, "{}", full_model_strategy, inc_model_strategy, 5 ) @@ -92,7 +92,8 @@ def test__get_base_model_state(): assert len(model_state) == 122 -def test__reconstruct_model(): +@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) +def test__reconstruct_model(base_model_state_mock: MagicMock): mock_model = MockModel() model_state = mock_model.state_dict() full_model_strategy = PyTorchFullModel( @@ -120,9 +121,7 @@ def test__reconstruct_model(): 15, 4, difference_model_file_name, "model.metadata", parent_model=prev_model_id ) - reconstructed_state = manager._reconstruct_model_state( - curr_model_id, model_state, manager.get_model_storage_policy(1) - ) + reconstructed_state = manager._reconstruct_model_state(curr_model_id, manager.get_model_storage_policy(1)) assert reconstructed_state["_weight"].item() == 3 # pylint: disable=unsubscriptable-object @@ -146,12 +145,9 @@ def test__handle_new_model_full(): assert loaded_state["_weight"].item() == 1 -@patch.object(ModelStorageManager, "_get_base_model_state", return_value=MockModel().state_dict()) @patch.object(ModelStorageManager, "_reconstruct_model_state", return_value=MockModel().state_dict()) @patch.object(ModelStorageManager, "_get_parent_model_id", return_value=101) -def test__handle_new_model_incremental( - previous_model_mock, reconstruct_model_mock: MagicMock, base_model_state_mock: MagicMock -): +def test__handle_new_model_incremental(previous_model_mock, reconstruct_model_mock: MagicMock): manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) with tempfile.NamedTemporaryFile() as temporary_file: @@ -166,7 +162,7 @@ def test__handle_new_model_incremental( with open(temp_file_path, "rb") as model_file: assert model_file.read() == b"\x00\x00\x00\x40" - base_model_state_mock.assert_called_once_with(5) + 
reconstruct_model_mock.assert_called_once() previous_model_mock.assert_called_once_with(5, 4) @@ -177,6 +173,7 @@ def test_get_model_storage_policy(): "ResNet18", json.dumps({"num_classes": 10}), True, + "{}", ModelStorageStrategyConfig(name="PyTorchFullModel"), None, None, @@ -188,7 +185,7 @@ def test_get_model_storage_policy(): inc_model_strategy.zip = True inc_model_strategy.config = json.dumps({"operator": "sub"}) complex_pipeline = database.register_pipeline( - 75, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy, inc_model_strategy, 10 + 75, "ResNet18", json.dumps({"num_classes": 10}), True, "{}", full_model_strategy, inc_model_strategy, 10 ) manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) @@ -250,7 +247,7 @@ def test_store_model_resnet(): with MetadataDatabaseConnection(get_modyn_config()) as database: pipeline_id = database.register_pipeline( - 1, "ResNet18", json.dumps({"num_classes": 10}), True, full_model_strategy + 1, "ResNet18", json.dumps({"num_classes": 10}), True, "{}", full_model_strategy ) resnet = ResNet18(model_configuration={"num_classes": 10}, device="cpu", amp=False) diff --git a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py index fc501c4f2..1642bad08 100644 --- a/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py +++ b/modyn/tests/trainer_server/internal/grpc/test_trainer_server_grpc_servicer.py @@ -66,8 +66,7 @@ def setup(): - if os.path.exists(DATABASE): - os.remove(DATABASE) + DATABASE.unlink(True) with MetadataDatabaseConnection(modyn_config) as database: database.create_tables() @@ -77,6 +76,7 @@ def setup(): "model", json.dumps({}), True, + "{}", ModelStorageStrategyConfig(name="PyTorchFullModel"), incremental_model_strategy=None, full_model_interval=None, @@ -84,7 +84,7 @@ def setup(): def teardown(): - os.remove(DATABASE) + 
DATABASE.unlink() class DummyModelStorageStub: diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index d02654560..72135930c 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,42 +14,41 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xb9\x06\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x17 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xaf\x06\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x1c\n\x14use_pretrained_model\x18\x04 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x05 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\x06 \x01(\x05\x12\x12\n\nbatch_size\x18\x07 \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x08 \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\t \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\n \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0b \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0c \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\r \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x0e \x03(\t\x12)\n\x0clr_scheduler\x18\x0f 
\x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x11 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x12 \x01(\x05\x12!\n\x19num_prefetched_partitions\x18\x13 \x01(\x05\x12\"\n\x1aparallel_prefetch_requests\x18\x14 \x01(\x05\x12\x11\n\x04seed\x18\x15 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x16 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 
.trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', _globals) +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'trainer_server_pb2', globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _globals['_JSONSTRING']._serialized_start=33 - _globals['_JSONSTRING']._serialized_end=60 - _globals['_PYTHONSTRING']._serialized_start=62 - _globals['_PYTHONSTRING']._serialized_end=91 - _globals['_DATA']._serialized_start=93 - _globals['_DATA']._serialized_end=144 - _globals['_TRAINERAVAILABLEREQUEST']._serialized_start=146 - _globals['_TRAINERAVAILABLEREQUEST']._serialized_end=171 - _globals['_TRAINERAVAILABLERESPONSE']._serialized_start=173 - _globals['_TRAINERAVAILABLERESPONSE']._serialized_end=218 - _globals['_CHECKPOINTINFO']._serialized_start=220 - _globals['_CHECKPOINTINFO']._serialized_end=290 - _globals['_STARTTRAININGREQUEST']._serialized_start=293 - _globals['_STARTTRAININGREQUEST']._serialized_end=1118 - _globals['_STARTTRAININGRESPONSE']._serialized_start=1120 - _globals['_STARTTRAININGRESPONSE']._serialized_end=1190 - _globals['_TRAININGSTATUSREQUEST']._serialized_start=1192 - _globals['_TRAININGSTATUSREQUEST']._serialized_end=1236 - _globals['_TRAININGSTATUSRESPONSE']._serialized_start=1239 - 
_globals['_TRAININGSTATUSRESPONSE']._serialized_end=1661 - _globals['_STOREFINALMODELREQUEST']._serialized_start=1663 - _globals['_STOREFINALMODELREQUEST']._serialized_end=1708 - _globals['_STOREFINALMODELRESPONSE']._serialized_start=1710 - _globals['_STOREFINALMODELRESPONSE']._serialized_end=1774 - _globals['_GETLATESTMODELREQUEST']._serialized_start=1776 - _globals['_GETLATESTMODELREQUEST']._serialized_end=1820 - _globals['_GETLATESTMODELRESPONSE']._serialized_start=1822 - _globals['_GETLATESTMODELRESPONSE']._serialized_end=1887 - _globals['_TRAINERSERVER']._serialized_start=1890 - _globals['_TRAINERSERVER']._serialized_end=2347 + _JSONSTRING._serialized_start=33 + _JSONSTRING._serialized_end=60 + _PYTHONSTRING._serialized_start=62 + _PYTHONSTRING._serialized_end=91 + _DATA._serialized_start=93 + _DATA._serialized_end=144 + _TRAINERAVAILABLEREQUEST._serialized_start=146 + _TRAINERAVAILABLEREQUEST._serialized_end=171 + _TRAINERAVAILABLERESPONSE._serialized_start=173 + _TRAINERAVAILABLERESPONSE._serialized_end=218 + _CHECKPOINTINFO._serialized_start=220 + _CHECKPOINTINFO._serialized_end=290 + _STARTTRAININGREQUEST._serialized_start=293 + _STARTTRAININGREQUEST._serialized_end=1108 + _STARTTRAININGRESPONSE._serialized_start=1110 + _STARTTRAININGRESPONSE._serialized_end=1180 + _TRAININGSTATUSREQUEST._serialized_start=1182 + _TRAININGSTATUSREQUEST._serialized_end=1226 + _TRAININGSTATUSRESPONSE._serialized_start=1229 + _TRAININGSTATUSRESPONSE._serialized_end=1651 + _STOREFINALMODELREQUEST._serialized_start=1653 + _STOREFINALMODELREQUEST._serialized_end=1698 + _STOREFINALMODELRESPONSE._serialized_start=1700 + _STOREFINALMODELRESPONSE._serialized_end=1764 + _GETLATESTMODELREQUEST._serialized_start=1766 + _GETLATESTMODELREQUEST._serialized_end=1810 + _GETLATESTMODELRESPONSE._serialized_start=1812 + _GETLATESTMODELRESPONSE._serialized_end=1877 + _TRAINERSERVER._serialized_start=1880 + _TRAINERSERVER._serialized_end=2337 # @@protoc_insertion_point(module_scope) diff --git 
a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index 9723ebdb8..b185cc7b1 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -115,9 +115,6 @@ class StartTrainingRequest(google.protobuf.message.Message): PIPELINE_ID_FIELD_NUMBER: builtins.int TRIGGER_ID_FIELD_NUMBER: builtins.int DEVICE_FIELD_NUMBER: builtins.int - AMP_FIELD_NUMBER: builtins.int - MODEL_ID_FIELD_NUMBER: builtins.int - MODEL_CONFIGURATION_FIELD_NUMBER: builtins.int USE_PRETRAINED_MODEL_FIELD_NUMBER: builtins.int LOAD_OPTIMIZER_STATE_FIELD_NUMBER: builtins.int PRETRAINED_MODEL_ID_FIELD_NUMBER: builtins.int @@ -140,10 +137,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int trigger_id: builtins.int device: builtins.str - amp: builtins.bool - model_id: builtins.str - @property - def model_configuration(self) -> global___JsonString: ... use_pretrained_model: builtins.bool load_optimizer_state: builtins.bool pretrained_model_id: builtins.int @@ -179,9 +172,6 @@ class StartTrainingRequest(google.protobuf.message.Message): pipeline_id: builtins.int = ..., trigger_id: builtins.int = ..., device: builtins.str = ..., - amp: builtins.bool = ..., - model_id: builtins.str = ..., - model_configuration: global___JsonString | None = ..., use_pretrained_model: builtins.bool = ..., load_optimizer_state: builtins.bool = ..., pretrained_model_id: builtins.int = ..., @@ -202,8 +192,8 @@ class StartTrainingRequest(google.protobuf.message.Message): seed: builtins.int | None = ..., tokenizer: global___PythonString | None = ..., ) -> None: ... 
- def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "model_id", b"model_id", "num_prefetched_partitions", b"num_prefetched_partitions", "parallel_prefetch_requests", b"parallel_prefetch_requests", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... 
+ def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "num_prefetched_partitions", b"num_prefetched_partitions", "parallel_prefetch_requests", b"parallel_prefetch_requests", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]) -> typing_extensions.Literal["seed"] | None: ... 
@typing.overload diff --git a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py index d5a87a45f..5d8ceb058 100644 --- a/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py +++ b/modyn/trainer_server/internal/grpc/trainer_server_grpc_servicer.py @@ -371,7 +371,7 @@ def store_final_model( if not register_response.success: logger.error(f"Could not store final model from training id {training_id} at model storage.") return StoreFinalModelResponse(valid_state=False) - os.remove(final_checkpoint_path) + final_checkpoint_path.unlink() logger.info(f"Deleted final model on path {final_checkpoint_path}") return StoreFinalModelResponse(valid_state=True, model_id=register_response.model_id) From c19aaf28f6955fd6c904db57144d8a24bf3b27f4 Mon Sep 17 00:00:00 2001 From: Robin Oester Date: Tue, 24 Oct 2023 11:29:22 +0200 Subject: [PATCH 12/12] Minor Renaming - add comments to determine_parent_model_id --- .../model_storage/internal/model_storage_manager.py | 12 ++++++++---- .../internal/test_model_storage_manager.py | 8 ++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/modyn/model_storage/internal/model_storage_manager.py b/modyn/model_storage/internal/model_storage_manager.py index 9d81f8490..33c52a68e 100644 --- a/modyn/model_storage/internal/model_storage_manager.py +++ b/modyn/model_storage/internal/model_storage_manager.py @@ -95,7 +95,7 @@ def _handle_new_model( if policy.incremental_model_strategy and ( policy.full_model_interval is None or trigger_id % policy.full_model_interval != 0 ): - parent_model_id: Optional[int] = self._get_parent_model_id(pipeline_id, trigger_id) + parent_model_id: Optional[int] = self._determine_parent_model_id(pipeline_id, trigger_id) if parent_model_id is not None: # load model state of the parent model. 
parent_model_state = self._reconstruct_model_state(parent_model_id, policy) @@ -155,16 +155,17 @@ def _get_base_model_state(self, pipeline_id: int) -> dict: model_handler = getattr(model_module, model_class_name) return model_handler(json.loads(model_config), "cpu", amp).model.state_dict() - def _get_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[int]: + def _determine_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[int]: """ - Get the id of the parent model given the trigger id of a pipeline. + Determines the id of the parent model given the trigger id of a pipeline. Usually, the last fully stored + model is identified as such. The function returns None whenever no parent model can be found. Args: pipeline_id: the pipeline that generated the model. trigger_id: the trigger associated with the model. Returns: - Optional[int]: the parent model id (if it exists). + Optional[int]: the parent model id (if it can be found). """ with MetadataDatabaseConnection(self._modyn_config) as database: previous_model: TrainedModel = ( @@ -173,10 +174,13 @@ def _get_parent_model_id(self, pipeline_id: int, trigger_id: int) -> Optional[in .first() ) + # whenever the previous model is not present, a parent model cannot be determined. if not previous_model: return None + # return the id of the previous model if its stored in its entirety. if previous_model.parent_model is None: return previous_model.model_id + # otherwise return the parent model of the previous model. 
return previous_model.parent_model def load_model(self, model_id: int, metadata: bool) -> Optional[dict]: diff --git a/modyn/tests/model_storage/internal/test_model_storage_manager.py b/modyn/tests/model_storage/internal/test_model_storage_manager.py index 7367f9981..27334bcdf 100644 --- a/modyn/tests/model_storage/internal/test_model_storage_manager.py +++ b/modyn/tests/model_storage/internal/test_model_storage_manager.py @@ -76,13 +76,13 @@ def test_init(): assert manager._ftp_dir == pathlib.Path("ftp") -def test__get_parent_model(): +def test__determine_parent_model_id(): with MetadataDatabaseConnection(get_modyn_config()) as database: model_id = database.add_trained_model(10, 2, "model.modyn", "model.metadata") manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp")) - assert manager._get_parent_model_id(10, 3) == model_id - assert manager._get_parent_model_id(10, 2) is None + assert manager._determine_parent_model_id(10, 3) == model_id + assert manager._determine_parent_model_id(10, 2) is None def test__get_base_model_state(): @@ -146,7 +146,7 @@ def test__handle_new_model_full(): @patch.object(ModelStorageManager, "_reconstruct_model_state", return_value=MockModel().state_dict()) -@patch.object(ModelStorageManager, "_get_parent_model_id", return_value=101) +@patch.object(ModelStorageManager, "_determine_parent_model_id", return_value=101) def test__handle_new_model_incremental(previous_model_mock, reconstruct_model_mock: MagicMock): manager = ModelStorageManager(get_modyn_config(), pathlib.Path("storage"), pathlib.Path("ftp"))