
Commit

Merge branch 'main' into robinholzi/fix/dynamic-threshold-direction-lower
robinholzi authored Sep 12, 2024
2 parents 4dddd24 + 9c42e0f commit b3cfa56
Showing 12 changed files with 230 additions and 43 deletions.
18 changes: 9 additions & 9 deletions .github/workflows/workflow.yaml
@@ -56,7 +56,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup mamba
uses: ./.github/actions/mamba
@@ -72,7 +72,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup mamba
uses: ./.github/actions/mamba
@@ -220,7 +220,7 @@ jobs:

- name: Upload HTML coverage report
if: ${{ matrix.compiler.coverage && matrix.build-type == 'Debug' }}
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: coverage-results
path: ${{github.workspace}}/build/modyn/tests/coverage
@@ -236,7 +236,7 @@ jobs:
line-coverage: ${{steps.run_main_test_with_coverage.outputs.LINE_COVERAGE}}
branch-coverage: ${{steps.run_main_test_with_coverage.outputs.BRANCH_COVERAGE}}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
ref: main

@@ -315,7 +315,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Setup base container
uses: ./.github/actions/base
@@ -335,7 +335,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Start docker compose and exit when tests run through
run: bash scripts/run_integrationtests.sh Debug
@@ -351,7 +351,7 @@ jobs:
- cpp_build_and_test
steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Start docker compose and exit when tests run through
run: bash scripts/run_integrationtests.sh Asan
@@ -368,7 +368,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Start docker compose and exit when tests run through
run: bash scripts/run_integrationtests.sh Tsan
@@ -385,7 +385,7 @@ jobs:

steps:
- name: Check out code
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Start docker compose and exit when tests run through
run: bash scripts/run_integrationtests.sh Release
@@ -56,15 +56,15 @@ def test_collect_embeddings(dummy_system_config: ModynConfig):

first_embedding = torch.randn((4, 5))
second_embedding = torch.randn((3, 5))
amds.inform_samples([1, 2, 3, 4], None, None, None, first_embedding)
amds.inform_samples([21, 31, 41], None, None, None, second_embedding)
amds.inform_samples([1, 2, 3, 4], None, torch.randn((4, 2)), None, first_embedding)
amds.inform_samples([21, 31, 41], None, torch.randn((3, 2)), None, second_embedding)

assert np.concatenate(amds.matrix_elements).shape == (7, 5)
assert all(torch.equal(el1, el2) for el1, el2 in zip(amds.matrix_elements, [first_embedding, second_embedding]))
assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]

third_embedding = torch.randn((23, 5))
amds.inform_samples(list(range(1000, 1023)), None, None, None, third_embedding)
amds.inform_samples(list(range(1000, 1023)), None, torch.randn((23, 2)), None, third_embedding)

assert np.concatenate(amds.matrix_elements).shape == (30, 5)
assert all(
@@ -88,8 +88,8 @@ def test_collect_embedding_balance(test_amds, dummy_system_config: ModynConfig):

first_embedding = torch.randn((4, 5))
second_embedding = torch.randn((3, 5))
amds.inform_samples([1, 2, 3, 4], None, None, None, first_embedding)
amds.inform_samples([21, 31, 41], None, None, None, second_embedding)
amds.inform_samples([1, 2, 3, 4], None, torch.randn((4, 2)), None, first_embedding)
amds.inform_samples([21, 31, 41], None, torch.randn((3, 2)), None, second_embedding)

assert np.concatenate(amds.matrix_elements).shape == (7, 5)
assert all(torch.equal(el1, el2) for el1, el2 in zip(amds.matrix_elements, [first_embedding, second_embedding]))
@@ -99,7 +99,7 @@ def test_collect_embedding_balance(test_amds, dummy_system_config: ModynConfig):

third_embedding = torch.randn((23, 5))
assert len(amds.matrix_elements) == 0
amds.inform_samples(list(range(1000, 1023)), None, None, None, third_embedding)
amds.inform_samples(list(range(1000, 1023)), None, torch.randn((23, 2)), None, third_embedding)

assert np.concatenate(amds.matrix_elements).shape == (23, 5)
assert all(torch.equal(el1, el2) for el1, el2 in zip(amds.matrix_elements, [third_embedding]))
@@ -142,3 +142,42 @@ def test_collect_gradients(matrix_content, dummy_system_config: ModynConfig):
assert np.concatenate(amds.matrix_elements).shape == (7, gradient_shape)

assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]


@pytest.mark.parametrize(
"matrix_content", [MatrixContent.LAST_LAYER_GRADIENTS, MatrixContent.LAST_TWO_LAYERS_GRADIENTS]
)
@patch.multiple(AbstractMatrixDownsamplingStrategy, __abstractmethods__=set())
def test_collect_gradients_binary(matrix_content, dummy_system_config: ModynConfig):
per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none")
sampler_config = list(get_sampler_config(dummy_system_config, matrix_content=matrix_content))
sampler_config[5] = per_sample_loss_fct
sampler_config = tuple(sampler_config)
amds = AbstractMatrixDownsamplingStrategy(*sampler_config)
with torch.inference_mode(mode=(not amds.requires_grad)):
forward_input = torch.randn((4, 5))
first_output = torch.randn((4,))
first_output.requires_grad = True
first_target = torch.tensor([1, 1, 1, 0], dtype=torch.float32)
first_embedding = torch.randn((4, 5))
amds.inform_samples([1, 2, 3, 4], forward_input, first_output, first_target, first_embedding)

second_output = torch.randn((3,))
second_output.requires_grad = True
second_target = torch.tensor([0, 1, 0], dtype=torch.float32)
second_embedding = torch.randn((3, 5))
amds.inform_samples([21, 31, 41], forward_input, second_output, second_target, second_embedding)

assert len(amds.matrix_elements) == 2

# expected shape = (a, gradient_shape)
# a = 7 (4 samples in the first batch and 3 samples in the second batch)
if matrix_content == MatrixContent.LAST_LAYER_GRADIENTS:
# shape same as the last dimension of output
gradient_shape = 1
else:
# 5 is the input dimension of the last layer and 1 is the output one
gradient_shape = 5 * 1 + 1
assert np.concatenate(amds.matrix_elements).shape == (7, gradient_shape)

assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]
@@ -205,6 +205,39 @@ def test_bts(grad_approx: str, dummy_system_config: ModynConfig):
assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points)


@pytest.mark.parametrize("grad_approx", ["LastLayer", "LastLayerWithEmbedding"])
def test_bts_binary(grad_approx: str, dummy_system_config: ModynConfig):
sampler_config = get_sampler_config(dummy_system_config, grad_approx=grad_approx)
per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none")
sampler_config = (0, 0, 0, sampler_config[3], sampler_config[4], per_sample_loss_fct, "cpu")
sampler = RemoteCraigDownsamplingStrategy(*sampler_config)

with torch.inference_mode(mode=(not sampler.requires_grad)):
sample_ids = [1, 2, 3, 10, 11, 12, 13]
forward_input = torch.randn(7, 5) # 7 samples, 5 input features
forward_output = torch.randn(
7,
)
forward_output.requires_grad = True
target = torch.tensor([1, 1, 1, 0, 0, 0, 1], dtype=torch.float32) # 7 target labels
embedding = torch.randn(7, 10) # 7 samples, embedding dimension 10

assert sampler.distance_matrix.shape == (0, 0)
sampler.inform_samples(sample_ids, forward_input, forward_output, target, embedding)
sampler.inform_end_of_current_label()
assert sampler.distance_matrix.shape == (7, 7)
assert len(sampler.current_class_gradients) == 0

assert sampler.index_sampleid_map == [10, 11, 12, 1, 2, 3, 13]

selected_points, selected_weights = sampler.select_points()

assert len(selected_points) == 3
assert len(selected_weights) == 3
assert all(weight > 0 for weight in selected_weights)
assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points)


@pytest.mark.parametrize("grad_approx", ["LastLayerWithEmbedding", "LastLayer"])
def test_bts_equals_stb(grad_approx: str, dummy_system_config: ModynConfig):
# data
@@ -40,7 +40,7 @@ def test_sample_shape_ce(dummy_system_config: ModynConfig):
assert set(downsampled_indexes) <= set(range(8))


def test_sample_shape_other_losses(dummy_system_config: ModynConfig):
def test_sample_shape_binary(dummy_system_config: ModynConfig):
model = torch.nn.Linear(10, 1)
downsampling_ratio = 50
per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none")
@@ -51,11 +51,10 @@ def test_sample_shape_other_losses(dummy_system_config: ModynConfig):
)
with torch.inference_mode(mode=(not sampler.requires_grad)):
data = torch.randn(8, 10)
target = torch.randint(2, size=(8,), dtype=torch.float32).unsqueeze(1)
forward_outputs = model(data).squeeze(1)
target = torch.randint(2, size=(8,), dtype=torch.float32)
ids = list(range(8))

forward_outputs = model(data)

sampler.inform_samples(ids, data, forward_outputs, target)
downsampled_indexes, weights = sampler.select_points()

@@ -35,6 +35,36 @@ def test_sample_shape(dummy_system_config: ModynConfig):
assert len(indexes) == 4


def test_sample_shape_binary(dummy_system_config: ModynConfig):
model = torch.nn.Linear(10, 1)
downsampling_ratio = 50
per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none")

params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False, "ratio_max": 100}
sampler = RemoteLossDownsampling(
0, 0, 0, params_from_selector, dummy_system_config.model_dump(by_alias=True), per_sample_loss_fct, "cpu"
)
with torch.inference_mode(mode=(not sampler.requires_grad)):
data = torch.randn(8, 10)
forward_outputs = model(data).squeeze(1)
target = torch.randint(2, size=(8,), dtype=torch.float32)
ids = list(range(8))

sampler.inform_samples(ids, data, forward_outputs, target)
downsampled_indexes, weights = sampler.select_points()

assert len(downsampled_indexes) == 4
assert weights.shape[0] == 4

sampled_data, sampled_target = get_tensors_subset(downsampled_indexes, data, target, ids)

assert weights.shape[0] == sampled_target.shape[0]
assert sampled_data.shape[0] == 4
assert sampled_data.shape[1] == data.shape[1]
assert weights.shape[0] == 4
assert sampled_target.shape[0] == 4


def test_sample_weights(dummy_system_config: ModynConfig):
model = torch.nn.Linear(10, 2)
downsampling_ratio = 50
@@ -74,6 +74,31 @@ def test_compute_score(sampler_config):
assert np.allclose(scores, expected_scores, atol=1e-4)


binary_test_data = {
"LeastConfidence": {
"outputs": torch.tensor([[-0.8], [0.5], [0.3]]),
"expected_scores": np.array([0.8, 0.5, 0.3]), # confidence just picks the highest probability
},
"Entropy": {
"outputs": torch.tensor([[0.8], [0.5], [0.3]]),
"expected_scores": np.array([-0.5004, -0.6931, -0.6109]),
},
"Margin": {
"outputs": torch.tensor([[0.8], [0.5], [0.3]]),
"expected_scores": np.array([0.6, 0.0, 0.4]), # margin between top two classes
},
}


def test_compute_score_binary(sampler_config):
metric = sampler_config[3]["score_metric"]
amds = RemoteUncertaintyDownsamplingStrategy(*sampler_config)
outputs = binary_test_data[metric]["outputs"]
expected_scores = binary_test_data[metric]["expected_scores"]
scores = amds._compute_score(outputs, disable_softmax=True)
assert np.allclose(scores, expected_scores, atol=1e-4)
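
Editorial aside, not part of the diff: the Entropy and Margin rows above can be reproduced by hand. With disable_softmax=True the single-column outputs are read directly as the probability p of the positive class, so a minimal sketch of the arithmetic (assuming natural-log entropy and a top-two margin; this is the editor's reading of the expected numbers, not necessarily the project's exact scoring code) looks like this:

import numpy as np

p = np.array([0.8, 0.5, 0.3])            # single-column outputs taken as P(class 1)
probs = np.stack([1 - p, p], axis=1)     # implied two-class distribution

# Negative entropy: sum_i p_i * log(p_i)  ->  [-0.5004, -0.6931, -0.6109]
entropy_scores = np.sum(probs * np.log(probs), axis=1)

# Margin between the top two classes: |p - (1 - p)|  ->  [0.6, 0.0, 0.4]
margin_scores = np.abs(probs[:, 1] - probs[:, 0])
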


def test_select_points(balance_config):
amds = RemoteUncertaintyDownsamplingStrategy(*balance_config)
with torch.inference_mode():
15 changes: 13 additions & 2 deletions modyn/trainer_server/internal/trainer/pytorch_trainer.py
@@ -1,6 +1,7 @@
# pylint: disable=no-name-in-module
from __future__ import annotations

import contextlib
import glob
import io
import itertools
@@ -23,6 +24,7 @@

from modyn.common.benchmark.stopwatch import Stopwatch
from modyn.models.coreset_methods_support import CoresetSupportingModule
from modyn.models.dlrm.dlrm import DLRM
from modyn.selector.internal.grpc.generated.selector_pb2 import (
AvailableLabelsResponse,
GetAvailableLabelsRequest,
@@ -560,7 +562,14 @@ def downsample_batch(
self._downsampler.init_downsampler()
self.start_embedding_recording_if_needed()

with torch.inference_mode(mode=not self._downsampler.requires_grad):
# DLRM does not support inference_mode(), as it will complain during training that
# "that inference tensors cannot be saved for backward".
# It could be that some DLRM parameters are lazily created during the
# first forward pass and hence they are created as inference tensors if inference mode is used here.
# If this becomes a problem for more models, we might want to make it a field on the model class instead.
no_grad_mgr = torch.no_grad() if isinstance(self._model, DLRM) else torch.inference_mode()
context_manager = contextlib.nullcontext() if self._downsampler.requires_grad else no_grad_mgr
with context_manager:
big_batch_output = self._model.model(data) if self._downsampler.forward_required else torch.Tensor()
embeddings = self.get_embeddings_if_recorded()
self._downsampler.inform_samples(sample_ids, data, big_batch_output, target, embeddings)
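
Editorial aside, not part of the commit: a minimal sketch of the failure mode the comment above describes, assuming standard PyTorch inference-mode semantics. Tensors created inside torch.inference_mode() become inference tensors and cannot later take part in autograd, whereas tensors created under torch.no_grad() can:

import torch

with torch.inference_mode():
    w = torch.randn(5, 1)  # stands in for a lazily created parameter -> an inference tensor

x = torch.randn(3, 5, requires_grad=True)
try:
    y = (x @ w).sum()
    y.backward()
except RuntimeError as err:
    print(err)  # "Inference tensors cannot be saved for backward ..."

Swapping the first context manager for torch.no_grad() lets the snippet run, which is why the trainer falls back to no_grad for DLRM.
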
@@ -904,7 +913,9 @@ def _iterate_dataloader_and_compute_scores(
sample_ids, target, data = self.preprocess_batch(batch)
number_of_samples += len(sample_ids)

with torch.inference_mode(mode=not self._downsampler.requires_grad):
no_grad_mgr = torch.no_grad() if isinstance(self._model, DLRM) else torch.inference_mode()
context_manager = contextlib.nullcontext() if self._downsampler.requires_grad else no_grad_mgr
with context_manager:
with torch.autocast(self._device_type, enabled=self._amp):
# compute the scores and accumulate them
model_output = self._model.model(data) if self._downsampler.forward_required else torch.Tensor()
@@ -8,6 +8,9 @@
from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_per_label_remote_downsample_strategy import (
AbstractPerLabelRemoteDownsamplingStrategy,
)
from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_remote_downsampling_strategy import (
unsqueeze_dimensions_if_necessary,
)
from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.shuffling import _shuffle_list_and_tensor


@@ -71,6 +74,7 @@ def inform_samples(
) -> None:
batch_size = len(sample_ids)
assert self.matrix_content is not None
forward_output, target = unsqueeze_dimensions_if_necessary(forward_output, target)
if self.matrix_content == MatrixContent.LAST_LAYER_GRADIENTS:
grads_wrt_loss_sum = self._compute_last_layer_gradient_wrt_loss_sum(self.criterion, forward_output, target)
grads_wrt_loss_mean = grads_wrt_loss_sum / batch_size
@@ -36,6 +36,23 @@ def get_tensors_subset(
return sub_data, sub_target


def unsqueeze_dimensions_if_necessary(
forward_output: torch.Tensor, target: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
"""For binary classification, the forward output is a 1D tensor of length
batch_size. We need to unsqueeze it to have a 2D tensor of shape
(batch_size, 1).
For binary classification we use BCEWithLogitsLoss, which requires
the same dimensionality between the forward output and the target,
so we also need to unsqueeze the target tensor.
"""
if forward_output.dim() == 1:
forward_output = forward_output.unsqueeze(1)
target = target.unsqueeze(1)
return forward_output, target
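
Editorial aside, not part of the commit: a quick usage sketch of the helper above. A binary head produces logits of shape (batch_size,); after the call both logits and targets have the (batch_size, 1) shape that BCEWithLogitsLoss expects, while already-2D multi-class outputs pass through unchanged:

import torch

logits = torch.randn(8)                               # binary head: shape (8,)
labels = torch.randint(2, (8,), dtype=torch.float32)  # shape (8,)
logits, labels = unsqueeze_dimensions_if_necessary(logits, labels)
assert logits.shape == (8, 1) and labels.shape == (8, 1)
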


class AbstractRemoteDownsamplingStrategy(ABC):
def __init__(
self,
@@ -9,6 +9,7 @@
)
from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_remote_downsampling_strategy import (
FULL_GRAD_APPROXIMATION,
unsqueeze_dimensions_if_necessary,
)
from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils import submodular_optimizer
from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.euclidean import euclidean_dist_pair_np
@@ -110,6 +111,7 @@ def _inform_samples_single_class(
target: torch.Tensor,
embedding: torch.Tensor | None,
) -> None:
forward_output, target = unsqueeze_dimensions_if_necessary(forward_output, target)
if self.full_grad_approximation == "LastLayerWithEmbedding":
assert embedding is not None
grads_wrt_loss_sum = self._compute_last_two_layers_gradient_wrt_loss_sum(