diff --git a/.github/workflows/workflow.yaml b/.github/workflows/workflow.yaml index 9435eb3f3..daf9b15b8 100644 --- a/.github/workflows/workflow.yaml +++ b/.github/workflows/workflow.yaml @@ -335,8 +335,6 @@ jobs: run: docker run modynbase mamba run -n modyn bash -c "pip install -r dev-requirements.txt && echo Running pytest && pytest" -# Tests whether docker-compose up starts all components successfully and integration tests run through -# Only one job to reduce Github CI usage integrationtests: timeout-minutes: 60 runs-on: ubuntu-latest @@ -347,7 +345,6 @@ jobs: - unittests - isort - black - - dockerized-unittests steps: - name: Check out code diff --git a/.gitignore b/.gitignore index 4dfe551ef..fac632618 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ cmake-build-debug environment.yml.original docker-compose.yml.original Dockerfile.original + +# Experimental things +plots/ diff --git a/.pylintrc b/.pylintrc index c01dad950..a3ce414f2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -61,6 +61,7 @@ ignore-paths=^modyn/trainer_server/internal/grpc/generated/.*$, ^modyn/models/dlrm/cuda_src/.*$, ^modyn/models/dlrm/utils/.*$, ^modyn/models/dlrm/nn/.*$, + ^modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/.*$, # Files or directories matching the regex patterns are skipped. The regex # matches against base names, not paths. The default value ignores Emacs file diff --git a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml index 0df038fb6..c8d0a1275 100644 --- a/benchmark/criteo_1TB/pipelines/exp0_finetune.yml +++ b/benchmark/criteo_1TB/pipelines/exp0_finetune.yml @@ -44,7 +44,9 @@ training: gpus: 1 device: "cuda:0" amp: True - dataloader_workers: 8 + dataloader_workers: 16 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 use_previous_model: True initial_model: random initial_pass: diff --git a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml index 62495ba37..e6957ce5b 100644 --- a/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml +++ b/benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml @@ -44,7 +44,9 @@ training: gpus: 1 device: "cuda:0" amp: True - dataloader_workers: 8 + dataloader_workers: 16 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 use_previous_model: True initial_model: random initial_pass: diff --git a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml index 780656bd4..c5697972c 100644 --- a/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml +++ b/benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml @@ -44,7 +44,9 @@ training: gpus: 1 device: "cuda:0" amp: True - dataloader_workers: 8 + dataloader_workers: 16 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 use_previous_model: True initial_model: random initial_pass: diff --git a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml index 1646e561b..e4a51eff4 100644 --- a/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml +++ b/benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml @@ -44,7 +44,9 @@ training: gpus: 1 device: "cuda:0" amp: True - dataloader_workers: 8 + dataloader_workers: 16 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 use_previous_model: False # Same amount of computation (retraining on all data), but on different starting weights 
initial_model: random initial_pass: diff --git a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml index 1a4ec65a1..eadfb1341 100644 --- a/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml +++ b/benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml @@ -44,7 +44,9 @@ training: gpus: 1 device: "cuda:0" amp: True - dataloader_workers: 8 + dataloader_workers: 16 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 use_previous_model: False initial_model: random initial_pass: diff --git a/benchmark/wildtime_benchmarks/benchmark_utils.py b/benchmark/wildtime_benchmarks/benchmark_utils.py index 10cfebd63..553a25908 100644 --- a/benchmark/wildtime_benchmarks/benchmark_utils.py +++ b/benchmark/wildtime_benchmarks/benchmark_utils.py @@ -31,6 +31,19 @@ def setup_argparser_wildtime(dataset: str) -> argparse.ArgumentParser: "--dir", type=pathlib.Path, action="store", help="Path to data directory" ) + parser_.add_argument( + "--all", action="store_true", help="Store all the available data, including the validation and test sets." + ) + parser_.add_argument( + "--dummyyear", action="store_true", help="Add a final dummy year to train also on the last trigger in Modyn" + ) + + if dataset == "fMoW": + parser_.add_argument( + "--daily", action="store_true", help="If specified, data is stored with real timestamps (dd/mm/yy)." + "Otherwise, only the year is considered (as done in the other " + "datasets).") + return parser_ diff --git a/benchmark/wildtime_benchmarks/data_generation_arxiv.py b/benchmark/wildtime_benchmarks/data_generation_arxiv.py index 7191d035d..250bf455e 100644 --- a/benchmark/wildtime_benchmarks/data_generation_arxiv.py +++ b/benchmark/wildtime_benchmarks/data_generation_arxiv.py @@ -14,7 +14,7 @@ def main(): args = parser.parse_args() logger.info(f"Downloading data to {args.dir}") - ArXivDownloader(args.dir).store_data() + ArXivDownloader(args.dir).store_data(args.all, args.dummyyear) class ArXivDownloader(Dataset): @@ -43,16 +43,19 @@ def __init__(self, data_dir): self._dataset = datasets self.path = data_dir - def store_data(self): + def store_data(self, store_all_data: bool, add_final_dummy_year: bool): for year in tqdm(self._dataset): # for simplicity, instead of using years we map each day to a year from 1970 year_timestamp = create_timestamp(year=1970, month=1, day=year-2006) year_rows = [] - for i in range(len(self._dataset[year][0]["title"])): - text = self._dataset[year][0]["title"][i].replace("\n", " ") - label = self._dataset[year][0]["category"][i] - csv_row = f"{text}\t{label}" - year_rows.append(csv_row) + + splits = [0, 1] if store_all_data else [0] + for split in splits: + for i in range(len(self._dataset[year][split]["title"])): + text = self._dataset[year][split]["title"][i].replace("\n", " ") + label = self._dataset[year][split]["category"][i] + csv_row = f"{text}\t{label}" + year_rows.append(csv_row) # store the year file text_file = os.path.join(self.path, f"{year}.csv") @@ -62,6 +65,16 @@ def store_data(self): # set timestamp os.utime(text_file, (year_timestamp, year_timestamp)) + if add_final_dummy_year: + dummy_year = year + 1 + year_timestamp = create_timestamp(year=1970, month=1, day= dummy_year - 2006) + text_file = os.path.join(self.path, f"{dummy_year}.csv") + with open(text_file, "w", encoding="utf-8") as f: + f.write("\n".join(["dummy\t0"])) + + # set timestamp + os.utime(text_file, (year_timestamp, year_timestamp)) + os.remove(os.path.join(self.path, "arxiv.pkl")) diff 
--git a/benchmark/wildtime_benchmarks/data_generation_fmow.py b/benchmark/wildtime_benchmarks/data_generation_fmow.py index cfa98c40a..17b7754ec 100644 --- a/benchmark/wildtime_benchmarks/data_generation_fmow.py +++ b/benchmark/wildtime_benchmarks/data_generation_fmow.py @@ -4,7 +4,7 @@ import shutil from datetime import datetime -from benchmark_utils import download_if_not_exists, setup_argparser_wildtime, setup_logger +from benchmark_utils import create_timestamp, download_if_not_exists, setup_argparser_wildtime, setup_logger from torch.utils.data import Dataset from tqdm import tqdm from wilds import get_dataset @@ -13,13 +13,13 @@ def main() -> None: - parser = setup_argparser_wildtime("FMoW") + parser = setup_argparser_wildtime("fMoW") args = parser.parse_args() logger.info(f"Downloading data to {args.dir}") downloader = FMOWDownloader(args.dir) - downloader.store_data() + downloader.store_data(args.daily, args.all, args.dummyyear) downloader.clean_folder() @@ -59,30 +59,52 @@ def move_file_and_rename(self, index: int) -> None: new_name = os.path.join(self.data_dir, f"{index}.png") os.rename(dest_file, new_name) - def store_data(self) -> None: + def store_data(self, store_daily: bool, store_all_data: bool, add_final_dummy_year: bool) -> None: for year in tqdm(self._dataset): - split = 0 # just use training split for now - for i in range(len(self._dataset[year][split]["image_idxs"])): - index = self._dataset[year][split]["image_idxs"][i] - label = self._dataset[year][split]["labels"][i] - raw_timestamp = self.metadata[index]["timestamp"] - - if len(raw_timestamp) == 24: - timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%S.%fZ') - else: - timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%SZ') - - # save label - label_file = os.path.join(self.data_dir, f"{index}.label") - with open(label_file, "w", encoding="utf-8") as f: - f.write(str(int(label))) - os.utime(label_file, (timestamp.timestamp(), timestamp.timestamp())) - - # set image timestamp - self.move_file_and_rename(index) - image_file = os.path.join(self.data_dir, f"{index}.png") - os.utime(image_file, (timestamp.timestamp(), timestamp.timestamp())) + splits = [0, 1] if store_all_data else [0] + for split in splits: + for i in range(len(self._dataset[year][split]["image_idxs"])): + index = self._dataset[year][split]["image_idxs"][i] + label = self._dataset[year][split]["labels"][i] + + if store_daily: + raw_timestamp = self.metadata[index]["timestamp"] + + if len(raw_timestamp) == 24: + timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp() + else: + timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%SZ').timestamp() + else: + timestamp = create_timestamp(year=1970, month=1, day=year+1) + + # save label + label_file = os.path.join(self.data_dir, f"{index}.label") + with open(label_file, "w", encoding="utf-8") as f: + f.write(str(int(label))) + os.utime(label_file, (timestamp, timestamp)) + + # set image timestamp + self.move_file_and_rename(index) + image_file = os.path.join(self.data_dir, f"{index}.png") + os.utime(image_file, (timestamp, timestamp)) + + if add_final_dummy_year: + dummy_year = year + 1 + timestamp = create_timestamp(year=1970, month=1, day=dummy_year+1) + dummy_index = 1000000 #not used by any real sample (last: 99999) + + to_copy_image_file = os.path.join(self.data_dir, f"{index}.png") + dummy_image_file = os.path.join(self.data_dir, f"{dummy_index}.png") + shutil.copy(to_copy_image_file, dummy_image_file) + os.utime(dummy_image_file, (timestamp, 
timestamp)) + + to_copy_label_file = os.path.join(self.data_dir, f"{index}.label") + dummy_label_file = os.path.join(self.data_dir, f"{dummy_index}.label") + shutil.copy(to_copy_label_file, dummy_label_file) + os.utime(dummy_label_file, (timestamp, timestamp)) + + @staticmethod def parse_metadata(data_dir: str) -> list: diff --git a/benchmark/wildtime_benchmarks/data_generation_huffpost.py b/benchmark/wildtime_benchmarks/data_generation_huffpost.py index 497829e89..5befa7d34 100644 --- a/benchmark/wildtime_benchmarks/data_generation_huffpost.py +++ b/benchmark/wildtime_benchmarks/data_generation_huffpost.py @@ -15,7 +15,7 @@ def main(): args = parser.parse_args() logger.info(f"Downloading data to {args.dir}") - HuffpostDownloader(args.dir).store_data() + HuffpostDownloader(args.dir).store_data(args.all, args.dummyyear) class HuffpostDownloader(Dataset): @@ -44,15 +44,17 @@ def __init__(self, data_dir: str): self._dataset = datasets self.path = data_dir - def store_data(self) -> None: + def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None: for year in tqdm(self._dataset): year_timestamp = create_timestamp(year=1970, month=1, day=year-2011) year_rows = [] - for i in range(len(self._dataset[year][0]["headline"])): - text = self._dataset[year][0]["headline"][i] - label = self._dataset[year][0]["category"][i] - csv_row = f"{text}\t{label}" - year_rows.append(csv_row) + splits = [0, 1] if store_all_data else [0] + for split in splits: + for i in range(len(self._dataset[year][split]["headline"])): + text = self._dataset[year][split]["headline"][i] + label = self._dataset[year][split]["category"][i] + csv_row = f"{text}\t{label}" + year_rows.append(csv_row) # store the sentences text_file = os.path.join(self.path, f"{year}.csv") @@ -62,6 +64,17 @@ def store_data(self) -> None: # set timestamp os.utime(text_file, (year_timestamp, year_timestamp)) + if add_final_dummy_year: + dummy_year = year + 1 + year_timestamp = create_timestamp(year=1970, month=1, day= dummy_year - 2011) + text_file = os.path.join(self.path, f"{dummy_year}.csv") + with open(text_file, "w", encoding="utf-8") as f: + f.write("\n".join(["dummy\t0"])) + + # set timestamp + os.utime(text_file, (year_timestamp, year_timestamp)) + + os.remove(os.path.join(self.path, "huffpost.pkl")) diff --git a/benchmark/wildtime_benchmarks/data_generation_yearbook.py b/benchmark/wildtime_benchmarks/data_generation_yearbook.py index 01fe9e6f1..4a6e675a5 100644 --- a/benchmark/wildtime_benchmarks/data_generation_yearbook.py +++ b/benchmark/wildtime_benchmarks/data_generation_yearbook.py @@ -17,7 +17,7 @@ def main(): logger.info(f"Downloading data to {args.dir}") downloader = YearbookDownloader(args.dir) - downloader.store_data() + downloader.store_data(args.all, args.dummyyear) class YearbookDownloader(Dataset): @@ -38,35 +38,44 @@ def __init__(self, data_dir: str): self._dataset = datasets self.data_dir = data_dir - def _get_year_data(self, year: int) -> list[Tuple]: + def _get_year_data(self, year: int, store_all_data: bool) -> list[Tuple]: + splits = [0, 1] if store_all_data else [0] images = torch.FloatTensor( np.array( [ # transpose to transform from HWC to CHW (H=height, W=width, C=channels). 
# Pytorch requires CHW format - img.transpose(2, 0, 1)[0].reshape(*self.input_dim) - # _dataset has 3 dimensions [years][train=0,valid=1]["images"/"labels"] - for img in self._dataset[year][0]["images"] + img.transpose(2, 0, 1)[0].reshape(*self.input_dim) + # _dataset has 3 dimensions [years][train=0,valid=1,test=2]["images"/"labels"] + for split in splits # train split only; with --all, the validation split is included as well + for img in self._dataset[year][split]["images"] ] ) ) - labels = torch.LongTensor(self._dataset[year][0]["labels"]) + labels = torch.cat([torch.LongTensor(self._dataset[year][split]["labels"]) for split in splits]) return [(images[i], labels[i]) for i in range(len(images))] def __len__(self) -> int: return len(self._dataset["labels"]) - def store_data(self) -> None: + def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None: # create directories if not os.path.exists(self.data_dir): os.mkdir(self.data_dir) for year in self.time_steps: print(f"Saving data for year {year}") - ds = self._get_year_data(year) + ds = self._get_year_data(year, store_all_data) self.create_binary_file(ds, os.path.join(self.data_dir, f"{year}.bin"), create_fake_timestamp(year, base_year=1930)) + if add_final_dummy_year: + dummy_year = year + 1 + dummy_data = [ds[0]] # get one sample from the previous year + self.create_binary_file(dummy_data, + os.path.join(self.data_dir, f"{dummy_year}.bin"), + create_fake_timestamp(dummy_year, base_year=1930)) + os.remove(os.path.join(self.data_dir, "yearbook.pkl")) @staticmethod diff --git a/black.toml b/black.toml index b42befc20..0be9cced6 100644 --- a/black.toml +++ b/black.toml @@ -7,4 +7,5 @@ extend-exclude = """\ .*/*\\_pb2.py|\ .*/generated/.*\ .*/benchmark/.*\ + .*/plotting/.*\ """ diff --git a/environment.yml b/environment.yml index 4d5f24566..58a94ed99 100644 --- a/environment.yml +++ b/environment.yml @@ -19,7 +19,8 @@ dependencies: - tqdm - conda-forge::enlighten - protobuf - - grpcio + - pip: + - grpcio - jsonschema - psycopg2 - sqlalchemy>=2.0 @@ -27,6 +28,7 @@ dependencies: - numpy - pandas - tensorboard + - scipy - pyftpdlib - types-protobuf - types-psycopg2 diff --git a/experiments/criteo_online_dataset/README.md b/experiments/criteo_online_dataset/README.md new file mode 100644 index 000000000..fa8e785cc --- /dev/null +++ b/experiments/criteo_online_dataset/README.md @@ -0,0 +1 @@ +This is an experiment to evaluate the performance of the OnlineDataset with the Criteo dataset. If you are just a user and not a developer of Modyn, you can safely ignore this. \ No newline at end of file diff --git a/experiments/criteo_online_dataset/pipelines/16workers_4prefetch_2parallel.yml b/experiments/criteo_online_dataset/pipelines/16workers_4prefetch_2parallel.yml new file mode 100644 index 000000000..1084d0b5d --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/16workers_4prefetch_2parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 16workers_4prefetch_2parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time.
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 16 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/4workers_8prefetch_8parallel.yml b/experiments/criteo_online_dataset/pipelines/4workers_8prefetch_8parallel.yml new file mode 100644 index 000000000..477d7d3f3 --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/4workers_8prefetch_8parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 4workers_8prefetch_8parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 4 + parallel_prefetch_requests: 8 + num_prefetched_partitions: 8 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_0prefetch_0parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_0prefetch_0parallel.yml new file mode 100644 index 000000000..fb46f03a0 --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_0prefetch_0parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_0prefetch_0parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 0 + parallel_prefetch_requests: 1 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_16prefetch_4parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_16prefetch_4parallel.yml new file mode 100644 index 000000000..520d63458 --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_16prefetch_4parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_16prefetch_4parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time.
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 16 + parallel_prefetch_requests: 4 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_1prefetch_1parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_1prefetch_1parallel.yml new file mode 100644 index 000000000..2b67e940d --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_1prefetch_1parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_1prefetch_1parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 1 + parallel_prefetch_requests: 1 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_2prefetch_2parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_2prefetch_2parallel.yml new file mode 100644 index 000000000..6be587029 --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_2prefetch_2parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_2prefetch_2parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 2 + parallel_prefetch_requests: 2 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_4prefetch_2parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_4prefetch_2parallel.yml new file mode 100644 index 000000000..e2a4eecae --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_4prefetch_2parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_4prefetch_2parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 2 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_4prefetch_4parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_4prefetch_4parallel.yml new file mode 100644 index 000000000..5a0a1bb5b --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_4prefetch_4parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_4prefetch_4parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 4 + parallel_prefetch_requests: 4 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_8prefetch_4parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_8prefetch_4parallel.yml new file mode 100644 index 000000000..8f94cebe0 --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_8prefetch_4parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_8prefetch_4parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 8 + parallel_prefetch_requests: 4 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/pipelines/8workers_8prefetch_8parallel.yml b/experiments/criteo_online_dataset/pipelines/8workers_8prefetch_8parallel.yml new file mode 100644 index 000000000..68149e4f1 --- /dev/null +++ b/experiments/criteo_online_dataset/pipelines/8workers_8prefetch_8parallel.yml @@ -0,0 +1,121 @@ +pipeline: + name: 8workers_8prefetch_8parallel + description: DLRM/Criteo Training. Finetuning, i.e., updating model over time. 
+ version: 1.0.0 +model: + id: DLRM + config: # these parameters are consistent with the parameters used for the experiments shown in the NVIDIA repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/DLRM + embedding_dim: 128 + interaction_op: "cuda_dot" + hash_indices: False + bottom_mlp_sizes: [512, 256, 128] + top_mlp_sizes: [1024, 1024, 512, 256, 1] + embedding_type: "joint_fused" + num_numerical_features: 13 + use_cpp_mlp: True + categorical_features_info: + cat_0: 7912889 + cat_1: 33823 + cat_2: 17139 + cat_3: 7339 + cat_4: 20046 + cat_5: 4 + cat_6: 7105 + cat_7: 1382 + cat_8: 63 + cat_9: 5554114 + cat_10: 582469 + cat_11: 245828 + cat_12: 11 + cat_13: 2209 + cat_14: 10667 + cat_15: 104 + cat_16: 4 + cat_17: 968 + cat_18: 15 + cat_19: 8165896 + cat_20: 2675940 + cat_21: 7156453 + cat_22: 302516 + cat_23: 12022 + cat_24: 97 + cat_25: 35 +training: + gpus: 1 + device: "cuda:0" + amp: True + dataloader_workers: 8 + num_prefetched_partitions: 8 + parallel_prefetch_requests: 8 + use_previous_model: True + initial_model: random + initial_pass: + activated: False + batch_size: 65536 + optimizers: + - name: "mlp" + algorithm: "FusedSGD" + source: "APEX" + param_groups: + - module: "model.top_model" + config: + lr: 24 + - module: "model.bottom_model.mlp" + config: + lr: 24 + - name: "opt_1" + algorithm: "SGD" + source: "PyTorch" + param_groups: + - module: "model.bottom_model.embeddings" + config: + lr: 24 + lr_scheduler: + name: "DLRMScheduler" + source: "Custom" + optimizers: ["mlp", "opt_1"] + config: + base_lrs: [[24, 24], [24]] + warmup_steps: 8000 + warmup_factor: 0 + decay_steps: 24000 + decay_start_step: 48000 + decay_power: 2 + end_lr_factor: 0 + optimization_criterion: + name: "BCEWithLogitsLoss" + grad_scaler_config: + growth_interval: 1000000000 + checkpointing: + activated: False + selection_strategy: + name: NewDataStrategy + maximum_keys_in_memory: 2000000 + config: + limit: -1 + reset_after_trigger: True +data: + dataset_id: criteo + bytes_parser_function: | + import torch + import numpy as np + def bytes_parser_function(x: bytes) -> dict: + num_features = x[:52] + cat_features = x[52:] + num_features_array = np.frombuffer(num_features, dtype=np.float32) + cat_features_array = np.frombuffer(cat_features, dtype=np.int32) + return { + "numerical_input": torch.asarray(num_features_array, copy=True, dtype=torch.float32), + "categorical_input": torch.asarray(cat_features_array, copy=True, dtype=torch.long) + } + label_transformer_function: | + import torch + # we need to convert our integer-type labels to floats, + # since the BCEWithLogitsLoss function does not work with integers. + def label_transformer_function(x: torch.Tensor) -> torch.Tensor: + return x.to(torch.float32) +trigger: + id: DataAmountTrigger + trigger_config: + data_points_for_trigger: 20000000 + diff --git a/experiments/criteo_online_dataset/run_prefetch_exp.sh b/experiments/criteo_online_dataset/run_prefetch_exp.sh new file mode 100644 index 000000000..b26443310 --- /dev/null +++ b/experiments/criteo_online_dataset/run_prefetch_exp.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +BASEDIR="/modyn_host/eval/criteo_dataset_$(date +%s)" + + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +MODYN_CONFIG_PATH="$SCRIPT_DIR/../../modyn/config/examples/modyn_config.yaml" + +for filename in $SCRIPT_DIR/pipelines/*.yml; do + BASE=$(basename "$filename" | cut -d. 
-f1) + EVAL_DIR="$BASEDIR/$BASE" + mkdir -p $EVAL_DIR + modyn-supervisor --start-replay-at 0 --maximum-triggers 1 $filename $MODYN_CONFIG_PATH $EVAL_DIR +done diff --git a/integrationtests/model_storage/integrationtest_model_storage.py b/integrationtests/model_storage/integrationtest_model_storage.py index 70299dbb0..f1e7e64d1 100644 --- a/integrationtests/model_storage/integrationtest_model_storage.py +++ b/integrationtests/model_storage/integrationtest_model_storage.py @@ -68,7 +68,7 @@ def delete_dummy_file_from_trainer(config: dict): def insert_trigger_into_database(config: dict) -> (int, int): with MetadataDatabaseConnection(config) as database: - pipeline_id = database.register_pipeline(2) + pipeline_id = database.register_pipeline(2, "{}") trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) database.session.add(trigger) diff --git a/integrationtests/online_dataset/test_online_dataset.py b/integrationtests/online_dataset/test_online_dataset.py new file mode 100644 index 000000000..646e1e7f6 --- /dev/null +++ b/integrationtests/online_dataset/test_online_dataset.py @@ -0,0 +1,386 @@ +import gc +import json +import math +import os +import pathlib +import random +import shutil +import time +from typing import Iterable, Tuple + +import grpc +import modyn.storage.internal.grpc.generated.storage_pb2 as storage_pb2 +import torch +import yaml +from modyn.selector.internal.grpc.generated.selector_pb2 import DataInformRequest, JsonString, RegisterPipelineRequest +from modyn.selector.internal.grpc.generated.selector_pb2_grpc import SelectorStub +from modyn.storage.internal.grpc.generated.storage_pb2 import ( + DatasetAvailableRequest, + GetDatasetSizeRequest, + GetDatasetSizeResponse, + GetNewDataSinceRequest, + GetNewDataSinceResponse, + RegisterNewDatasetRequest, +) +from modyn.storage.internal.grpc.generated.storage_pb2_grpc import StorageStub +from modyn.trainer_server.internal.dataset.data_utils import prepare_dataloaders +from modyn.utils import grpc_connection_established +from PIL import Image +from torchvision import transforms + +SCRIPT_PATH = pathlib.Path(os.path.realpath(__file__)) + +TIMEOUT = 120 # seconds +CONFIG_FILE = SCRIPT_PATH.parent.parent.parent / "modyn" / "config" / "examples" / "modyn_config.yaml" +# The following path leads to a directory that is mounted into the docker container and shared with the +# storage container. +DATASET_PATH = pathlib.Path("/app") / "storage" / "datasets" / "test_dataset" + +# Because we have no mapping of file to key (happens in the storage service), we have to keep +# track of the images we added to the dataset ourselves and compare them to the images we get +# from the storage service. 
+FIRST_ADDED_IMAGES = [] +SECOND_ADDED_IMAGES = [] +IMAGE_UPDATED_TIME_STAMPS = [] + + +def get_modyn_config() -> dict: + with open(CONFIG_FILE, "r", encoding="utf-8") as config_file: + config = yaml.safe_load(config_file) + + return config + + +def connect_to_selector_servicer() -> grpc.Channel: + selector_address = get_selector_address() + selector_channel = grpc.insecure_channel(selector_address) + + if not grpc_connection_established(selector_channel): + raise ConnectionError(f"Could not establish gRPC connection to selector at {selector_address}.") + + return selector_channel + + +def get_storage_address() -> str: + config = get_modyn_config() + return f"{config['storage']['hostname']}:{config['storage']['port']}" + + +def get_selector_address() -> str: + config = get_modyn_config() + return f"{config['selector']['hostname']}:{config['selector']['port']}" + + +def connect_to_storage() -> grpc.Channel: + storage_address = get_storage_address() + storage_channel = grpc.insecure_channel(storage_address) + + if not grpc_connection_established(storage_channel) or storage_channel is None: + raise ConnectionError(f"Could not establish gRPC connection to storage at {storage_address}.") + + return storage_channel + + +def register_new_dataset() -> None: + storage_channel = connect_to_storage() + + storage = StorageStub(storage_channel) + + request = RegisterNewDatasetRequest( + base_path=str(DATASET_PATH), + dataset_id="test_dataset", + description="Test dataset for integration tests.", + file_wrapper_config=json.dumps({"file_extension": ".png", "label_file_extension": ".txt"}), + file_wrapper_type="SingleSampleFileWrapper", + filesystem_wrapper_type="LocalFilesystemWrapper", + version="0.1.0", + ) + + response = storage.RegisterNewDataset(request) + + assert response.success, "Could not register new dataset." + + +def check_dataset_availability() -> None: + storage_channel = connect_to_storage() + + storage = StorageStub(storage_channel) + + request = DatasetAvailableRequest(dataset_id="test_dataset") + response = storage.CheckAvailability(request) + + assert response.available, "Dataset is not available." + + +def check_dataset_size(expected_size: int) -> None: + storage_channel = connect_to_storage() + + storage = StorageStub(storage_channel) + request = GetDatasetSizeRequest(dataset_id="test_dataset") + response: GetDatasetSizeResponse = storage.GetDatasetSize(request) + + assert response.success, "Dataset is not available." + assert response.num_keys == expected_size + + +def check_dataset_size_invalid() -> None: + storage_channel = connect_to_storage() + + storage = StorageStub(storage_channel) + request = GetDatasetSizeRequest(dataset_id="unknown_dataset") + response: GetDatasetSizeResponse = storage.GetDatasetSize(request) + + assert not response.success, "Dataset is available (even though it should not be)." + + +def check_get_current_timestamp() -> None: + storage_channel = connect_to_storage() + storage = StorageStub(storage_channel) + empty = storage_pb2.google_dot_protobuf_dot_empty__pb2.Empty() + response = storage.GetCurrentTimestamp(empty) + + assert response.timestamp > 0, "Timestamp is not valid." 
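Throughout this diff, a sample's timestamp is encoded as its file's modification time: the wildtime generators call os.utime() after writing each file, and this test records the values it will query against via os.path.getmtime() (see IMAGE_UPDATED_TIME_STAMPS above). Below is a minimal sketch of that convention; _create_timestamp is a hypothetical stand-in for the create_timestamp helper in benchmark_utils, whose implementation is not part of this diff, and the UTC assumption is mine:

    import os
    from datetime import datetime, timezone

    def _create_timestamp(year: int, month: int, day: int) -> int:
        # Assumption: maps a calendar date to Unix seconds. The generators call
        # it with small day offsets (e.g. day=year-2006), so day stays in 1..31.
        return int(datetime(year, month, day, tzinfo=timezone.utc).timestamp())

    def _stamp_file(path: str, day_offset: int) -> None:
        # Encode "year number day_offset" as the file's atime/mtime; the storage
        # service later reads this mtime back as the sample timestamp.
        ts = _create_timestamp(1970, 1, day_offset)
        os.utime(path, (ts, ts))
        assert int(os.path.getmtime(path)) == ts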
+ + +def create_dataset_dir() -> None: + pathlib.Path(DATASET_PATH).mkdir(parents=True, exist_ok=True) + + +def cleanup_dataset_dir() -> None: + shutil.rmtree(DATASET_PATH) + + +def cleanup_storage_database() -> None: + storage_channel = connect_to_storage() + storage = StorageStub(storage_channel) + request = DatasetAvailableRequest(dataset_id="test_dataset") + response = storage.DeleteDataset(request) + + assert response.success, "Could not cleanup storage database." + + +def add_image_to_dataset(image: Image, name: str) -> None: + image.save(DATASET_PATH / name) + IMAGE_UPDATED_TIME_STAMPS.append(int(round(os.path.getmtime(DATASET_PATH / name) * 1000))) + + +def create_random_image() -> Image: + image = Image.new("RGB", (100, 100)) + random_x = random.randint(0, 99) + random_y = random.randint(0, 99) + + random_r = random.randint(0, 254) + random_g = random.randint(0, 254) + random_b = random.randint(0, 254) + + image.putpixel((random_x, random_y), (random_r, random_g, random_b)) + + return image + + +def add_images_to_dataset(start_number: int, end_number: int, images_added: list[bytes]) -> None: + create_dataset_dir() + + for i in range(start_number, end_number): + image = create_random_image() + add_image_to_dataset(image, f"image_{i}.png") + images_added.append(image.tobytes()) + with open(DATASET_PATH / f"image_{i}.txt", "w") as label_file: + label_file.write(f"{i}") + + +def prepare_selector(num_dataworkers: int, keys: list[int]) -> Tuple[int, int]: + selector_channel = connect_to_selector_servicer() + selector = SelectorStub(selector_channel) + # We test the NewData strategy for finetuning on the new data, i.e., we reset without limit + # We also enforce high partitioning (maximum_keys_in_memory == 2) to ensure that works + + strategy_config = { + "name": "NewDataStrategy", + "maximum_keys_in_memory": 2, + "config": {"limit": -1, "reset_after_trigger": True}, + } + + pipeline_id = selector.register_pipeline( + RegisterPipelineRequest( + num_workers=max(num_dataworkers, 1), selection_strategy=JsonString(value=json.dumps(strategy_config)) + ) + ).pipeline_id + + trigger_id = selector.inform_data_and_trigger( + DataInformRequest( + pipeline_id=pipeline_id, + keys=keys, + timestamps=[2 for _ in range(len(keys))], + labels=[3 for _ in range(len(keys))], + ) + ).trigger_id + + return pipeline_id, trigger_id + + +def get_new_data_since(timestamp: int) -> Iterable[GetNewDataSinceResponse]: + storage_channel = connect_to_storage() + + storage = StorageStub(storage_channel) + + request = GetNewDataSinceRequest( + dataset_id="test_dataset", + timestamp=timestamp, + ) + + responses = storage.GetNewDataSince(request) + return responses + + +def get_data_keys() -> list[int]: + response = None + keys = [] + for i in range(60): + responses = list(get_new_data_since(0)) + assert len(responses) < 2, f"Received batched response, shouldn't happen: {responses}" + if len(responses) == 1: + response = responses[0] + keys = list(response.keys) + if len(keys) == 10: + break + time.sleep(1) + + assert response is not None, "Did not get any response from Storage" + assert len(keys) == 10, f"Not all images were returned. 
Images returned: {response.keys}" + + return keys + + +def get_bytes_parser() -> str: + return """ +from PIL import Image +import io +def bytes_parser_function(data: bytes) -> Image: + return Image.open(io.BytesIO(data)).convert("RGB")""" + + +def tensor_in_list(tensor: torch.Tensor, tensor_list: list[torch.Tensor]) -> bool: + return any((tensor == c_).all() for c_ in tensor_list) + + +def test_dataset_impl( + num_dataworkers: int, + batch_size: int, + prefetched_partitions: int, + parallel_prefetch_requests: int, + pipeline_id: int, + trigger_id: int, + items: list[int], +) -> None: + dataloader, _ = prepare_dataloaders( + pipeline_id, + trigger_id, + "test_dataset", + num_dataworkers, + batch_size, + get_bytes_parser(), + ["transforms.ToTensor()"], + get_storage_address(), + get_selector_address(), + 42, + prefetched_partitions, + parallel_prefetch_requests, + None, + None, + ) + + expected_min_batches = math.floor(len(items) / batch_size) + # max one excess batch per worker + expected_max_batches = expected_min_batches if num_dataworkers <= 1 else expected_min_batches + num_dataworkers + + all_samples = [] + all_data = [] + all_labels = [] + + for batch_number, batch in enumerate(dataloader): + sample_ids = batch[0] + if isinstance(sample_ids, torch.Tensor): + sample_ids = sample_ids.tolist() + elif isinstance(sample_ids, tuple): + sample_ids = list(sample_ids) + + assert isinstance(sample_ids, list), "Cannot parse result from DataLoader" + assert isinstance(batch[1], torch.Tensor) and isinstance(batch[2], torch.Tensor) + + all_samples.extend(sample_ids) + for sample in batch[1]: + all_data.append(sample) # iterate over batch dimension to extract samples + all_labels.extend(batch[2].tolist()) + + assert len(all_samples) == len(items) + assert len(all_labels) == len(items) + assert len(all_data) == len(items) + + assert expected_min_batches <= batch_number + 1 <= expected_max_batches, ( + f"[{num_dataworkers}][{batch_size}][{prefetched_partitions}] " + + f"Wrong number of batches: {batch_number + 1}. num_items = {len(items)}. " + + f"expected_min = {expected_min_batches}, expected_max = {expected_max_batches}" + ) + + assert set(all_samples) == set(items) + assert set(all_labels) == set(range(len(items))) + + trans = transforms.Compose([transforms.ToPILImage()]) + + assert len(FIRST_ADDED_IMAGES) == len(all_data) + + for idx, image_tensor in enumerate(all_data): + pil_image = trans(image_tensor).convert("RGB") + image_bytes = pil_image.tobytes() + if image_bytes not in FIRST_ADDED_IMAGES: + raise ValueError(f"Could not find image {idx} in created images, all_samples = {all_samples}") + + +def test_dataset() -> None: + NUM_IMAGES = 10 + + check_get_current_timestamp() # Check if the storage service is available. + create_dataset_dir() + add_images_to_dataset(0, NUM_IMAGES, FIRST_ADDED_IMAGES) # Add images to the dataset. + register_new_dataset() + check_dataset_availability() # Check if the dataset is available. 
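+ # The size query for a dataset that was never registered must fail.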
+ check_dataset_size_invalid() + + keys = get_data_keys() + + for num_dataworkers in [0, 1, 2, 4, 8, 16]: + pipeline_id, trigger_id = prepare_selector(num_dataworkers, keys) + for prefetched_partitions in [0, 1, 2, 3, 4, 5, 999]: + ppr_list = [999] + if prefetched_partitions == 5: + ppr_list = [1, 2, 5, 999] + + for parallel_prefetch_requests in ppr_list: + for batch_size in [1, 2, 10]: + print( + f"Testing num_workers = {num_dataworkers}, partitions = {prefetched_partitions}, " + + f"batch_size = {batch_size}, parallel_prefetch_requests={parallel_prefetch_requests}" + ) + test_dataset_impl( + num_dataworkers, + batch_size, + prefetched_partitions, + parallel_prefetch_requests, + pipeline_id, + trigger_id, + keys, + ) + gc.collect() + + +def main() -> None: + try: + test_dataset() + finally: + cleanup_dataset_dir() + cleanup_storage_database() + + +if __name__ == "__main__": + main() diff --git a/integrationtests/run.sh b/integrationtests/run.sh index bc25824f3..2f8c363f5 100755 --- a/integrationtests/run.sh +++ b/integrationtests/run.sh @@ -7,6 +7,7 @@ echo "Running as user $USER" echo "Running basic availability tests" python $SCRIPT_DIR/test_docker_compose.py +echo "Running FTP availability tests" python $SCRIPT_DIR/test_ftp_connections.py echo "Running storage integration tests" python $SCRIPT_DIR/storage/integrationtest_storage.py @@ -14,6 +15,8 @@ python $SCRIPT_DIR/storage/integrationtest_storage_csv.py python $SCRIPT_DIR/storage/integrationtest_storage_binary.py echo "Running selector integration tests" python $SCRIPT_DIR/selector/integrationtest_selector.py +echo "Running online datasets integration tests" +python $SCRIPT_DIR/online_dataset/test_online_dataset.py echo "Running model storage integration tests" python $SCRIPT_DIR/model_storage/integrationtest_model_storage.py echo "Successfully ran all integration tests." diff --git a/integrationtests/storage/integrationtest_storage.py b/integrationtests/storage/integrationtest_storage.py index edadc7699..86693bbbc 100644 --- a/integrationtests/storage/integrationtest_storage.py +++ b/integrationtests/storage/integrationtest_storage.py @@ -271,7 +271,7 @@ def test_storage() -> None: add_images_to_dataset(10, 20, SECOND_ADDED_IMAGES) # Add more images to the dataset. - for i in range(20): + for i in range(60): responses = list(get_new_data_since(IMAGE_UPDATED_TIME_STAMPS[9] + 1)) assert len(responses) < 2, f"Received batched response, shouldn't happen: {responses}" if len(responses) == 1: diff --git a/integrationtests/test_docker_compose.py b/integrationtests/test_docker_compose.py index 81759de72..1879d0437 100644 --- a/integrationtests/test_docker_compose.py +++ b/integrationtests/test_docker_compose.py @@ -7,7 +7,7 @@ from modyn.storage.internal.grpc.generated.storage_pb2_grpc import StorageStub # noqa: F401 from modyn.utils import grpc_connection_established -TIMEOUT = 60 # seconds +TIMEOUT = 180 # seconds def terminate_on_timeout(start_time: int) -> None: @@ -29,6 +29,8 @@ def storage_running() -> bool: print(f"Could not establish gRPC connection to storage at {storage_address}. Retrying.") return False + print("Successfully connected to storage!") + return True @@ -42,6 +44,8 @@ def model_storage_running() -> bool: print(f"Could not establish gRPC connection to model storage at {model_storage_address}. Retrying.") return False + print("Successfully connected to model storage!") + return True @@ -55,6 +59,8 @@ def evaluator_running() -> bool: print(f"Could not establish gRPC connection to evaluator at {evaluator_address}. 
Retrying.") return False + print("Successfully connected to evaluator!") + return True @@ -68,6 +74,8 @@ def trainer_server_running() -> bool: print(f"Could not establish gRPC connection to trainer server at {trainer_server_address}. Retrying.") return False + print("Successfully connected to trainer server!") + return True @@ -83,6 +91,8 @@ def storage_db_running() -> bool: connect_timeout=5, ) + print("Successfully connected to storage database!") + return True except (Exception, psycopg2.DatabaseError) as error: print("Error while connecting to the database: " + str(error)) @@ -101,6 +111,8 @@ def metadata_db_running() -> bool: connect_timeout=5, ) + print("Successfully connected to metadata database!") + return True except (Exception, psycopg2.DatabaseError) as error: print("Error while connecting to the database: " + str(error)) @@ -117,6 +129,8 @@ def selector_running() -> bool: print(f"Could not establish gRPC connection to selector at {selector_address}. Retrying.") return False + print("Successfully connected to selector!") + return True diff --git a/modyn/common/grpc/__init__.py b/modyn/common/grpc/__init__.py new file mode 100644 index 000000000..6040a0a16 --- /dev/null +++ b/modyn/common/grpc/__init__.py @@ -0,0 +1,10 @@ +""" +This submodule implements functions to run gRPC servers using multiprocessing. +""" +import os + +from .grpc_helpers import GenericGRPCServer # noqa: F401 + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/common/grpc/grpc_helpers.py b/modyn/common/grpc/grpc_helpers.py new file mode 100644 index 000000000..e85527d37 --- /dev/null +++ b/modyn/common/grpc/grpc_helpers.py @@ -0,0 +1,118 @@ +import contextlib +import datetime +import logging +import multiprocessing as mp +import os +import socket +import time +from concurrent import futures +from typing import Any, Callable, Generator, Optional + +import grpc +from modyn.utils import MAX_MESSAGE_SIZE + +logger = logging.getLogger(__name__) + +# Minimum 2 processes and 4 threads per process, currently max 64 processes +CPU_CORES = os.cpu_count() +if CPU_CORES is None: # cannot do that in single expression due to mypy... 
+ CPU_CORES = 64 +NUM_GRPC_PROCESSES = max(2, min(64, CPU_CORES)) +PROCESS_THREAD_WORKERS = max(4, int(NUM_GRPC_PROCESSES / 4)) + + +@contextlib.contextmanager +def reserve_port(port: str) -> Generator: + """Find and reserve a port for all subprocesses to use.""" + sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + if sock.getsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT) == 0: + raise RuntimeError("Failed to set SO_REUSEPORT.") + sock.bind(("", int(port))) + try: + assert sock.getsockname()[1] == int(port) + yield port + finally: + sock.close() + + +def _wait_forever(server: Any) -> None: + try: + while True: + time.sleep(datetime.timedelta(days=1).total_seconds()) + except KeyboardInterrupt: + server.stop(None) + + +def _run_server_worker( + bind_address: str, add_servicer_callback: Callable, modyn_config: dict, callback_kwargs: dict +) -> None: + """Start a server in a subprocess.""" + logging.info(f"[{os.getpid()}] Starting new gRPC server process.") + + server = grpc.server( + futures.ThreadPoolExecutor( + max_workers=PROCESS_THREAD_WORKERS, + ), + options=[ + ("grpc.max_receive_message_length", MAX_MESSAGE_SIZE), + ("grpc.max_send_message_length", MAX_MESSAGE_SIZE), + ("grpc.so_reuseport", 1), + ], + ) + + add_servicer_callback(modyn_config, server, **callback_kwargs) + server.add_insecure_port(bind_address) + server.start() + _wait_forever(server) + + +class GenericGRPCServer: + def __init__( + self, modyn_config: dict, port: str, add_servicer_callback: Callable, callback_kwargs: Optional[dict] = None + ) -> None: + """Initialize the GRPC server.""" + self.port = port + self.modyn_config = modyn_config + self.add_servicer_callback = add_servicer_callback + self.callback_kwargs = callback_kwargs if callback_kwargs is not None else {} + self.workers: list[mp.Process] = [] + + def __enter__(self) -> Any: + """Enter the context manager. + + Returns: + grpc.Server: GRPC server + """ + logger.info(f"[{os.getpid()}] Starting server. Listening on port {self.port}") + with reserve_port(self.port) as port: + bind_address = "[::]:" + port + for _ in range(NUM_GRPC_PROCESSES): + worker = mp.Process( + target=_run_server_worker, + args=(bind_address, self.add_servicer_callback, self.modyn_config, self.callback_kwargs), + ) + worker.start() + self.workers.append(worker) + + return self + + def __getstate__(self) -> dict: + state = self.__dict__.copy() + del state["add_servicer_callback"] + return state + + def wait_for_termination(self) -> None: + for worker in self.workers: + worker.join() + + def __exit__(self, exc_type: type, exc_val: Exception, exc_tb: Exception) -> None: + """Exit the context manager. + + Args: + exc_type (type): exception type + exc_val (Exception): exception value + exc_tb (Exception): exception traceback + """ + self.wait_for_termination() + del self.workers diff --git a/modyn/common/trigger_sample/trigger_sample_storage.py b/modyn/common/trigger_sample/trigger_sample_storage.py index ae671d925..79fa52ec9 100644 --- a/modyn/common/trigger_sample/trigger_sample_storage.py +++ b/modyn/common/trigger_sample/trigger_sample_storage.py @@ -232,6 +232,13 @@ def _parse_file(self, file_path: Path) -> np.ndarray: Returns: list[tuple[int, float]]: List of trigger samples. """ + + # When there are few samples, some workers may have no samples to store. + # In that case they do not write a file, so trying to read it would raise an error. 
+ # Hence, return an empty array if the worker's file does not exist. + if not os.path.isfile(file_path): + return np.ndarray(0, dtype="i8,f8") + return np.load(file_path, allow_pickle=False, fix_imports=False) def _parse_file_subset(self, file_path: Path, start_index: int, end_index: int) -> np.memmap: diff --git a/modyn/config/schema/pipeline-schema.yaml b/modyn/config/schema/pipeline-schema.yaml index 7984fb27e..8bbdcf792 100644 --- a/modyn/config/schema/pipeline-schema.yaml +++ b/modyn/config/schema/pipeline-schema.yaml @@ -48,6 +48,14 @@ properties: type: number description: | The number of epochs per trigger. Defaults to 1, if not given. + num_prefetched_partitions: + type: number + description: | + The number of partitions that are prefetched per DataLoader worker. Defaults to 1, if not given. + parallel_prefetch_requests: + type: number + description: | + The number of parallel prefetch requests per DataLoader worker. Defaults to 1, if not given. Values larger than num_prefetched_partitions are capped at num_prefetched_partitions. device: type: string description: | diff --git a/modyn/metadata_database/metadata_database_connection.py b/modyn/metadata_database/metadata_database_connection.py index ac337bc1f..311a8c3f0 100644 --- a/modyn/metadata_database/metadata_database_connection.py +++ b/modyn/metadata_database/metadata_database_connection.py @@ -67,16 +67,17 @@ def create_tables(self) -> None: """ MetadataBase.metadata.create_all(self.engine) - def register_pipeline(self, num_workers: int) -> int: + def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: """Register a new pipeline in the database. Args: num_workers (int): Number of workers in the pipeline. + selection_strategy (str): The selection strategy to use. Returns: int: Id of the newly created pipeline. 
""" - pipeline = Pipeline(num_workers=num_workers) + pipeline = Pipeline(num_workers=num_workers, selection_strategy=selection_strategy) self.session.add(pipeline) self.session.commit() pipeline_id = pipeline.pipeline_id diff --git a/modyn/metadata_database/models/pipelines.py b/modyn/metadata_database/models/pipelines.py index 4094b3f95..cd8370c7e 100644 --- a/modyn/metadata_database/models/pipelines.py +++ b/modyn/metadata_database/models/pipelines.py @@ -1,7 +1,7 @@ """Pipeline model.""" from modyn.metadata_database.metadata_base import MetadataBase -from sqlalchemy import Column, Integer +from sqlalchemy import Column, Integer, Text class Pipeline(MetadataBase): @@ -12,6 +12,7 @@ class Pipeline(MetadataBase): __table_args__ = {"extend_existing": True} pipeline_id = Column("pipeline_id", Integer, primary_key=True) num_workers = Column("num_workers", Integer, nullable=False) + selection_strategy = Column("selection_strategy", Text, nullable=False) def __repr__(self) -> str: """Return string representation.""" diff --git a/modyn/models/articlenet/articlenet.py b/modyn/models/articlenet/articlenet.py index 244e5c623..e4f961bdb 100644 --- a/modyn/models/articlenet/articlenet.py +++ b/modyn/models/articlenet/articlenet.py @@ -1,3 +1,4 @@ +# pylint: disable=W0223 from typing import Any import torch @@ -19,7 +20,7 @@ def __init__(self, model_configuration: dict[str, Any], device: str, amp: bool) self.model.to(device) -class DistilBertFeaturizer(DistilBertModel): +class DistilBertFeaturizer(DistilBertModel): # pylint: disable=abstract-method def __init__(self, config: Any) -> None: super().__init__(config) self.d_out = config.hidden_size diff --git a/modyn/protos/trainer_server.proto b/modyn/protos/trainer_server.proto index 9f52f7f6d..cf31c2850 100644 --- a/modyn/protos/trainer_server.proto +++ b/modyn/protos/trainer_server.proto @@ -53,8 +53,10 @@ message StartTrainingRequest { PythonString label_transformer = 19; JsonString grad_scaler_configuration = 20; int32 epochs_per_trigger = 21; - optional int32 seed = 22; - optional PythonString tokenizer = 23; + int32 num_prefetched_partitions = 22; + int32 parallel_prefetch_requests = 23; + optional int32 seed = 24; + optional PythonString tokenizer = 25; } message StartTrainingResponse { diff --git a/modyn/selector/internal/grpc/selector_grpc_servicer.py b/modyn/selector/internal/grpc/selector_grpc_servicer.py index a1bea06ec..0db6cf8f6 100644 --- a/modyn/selector/internal/grpc/selector_grpc_servicer.py +++ b/modyn/selector/internal/grpc/selector_grpc_servicer.py @@ -1,5 +1,7 @@ import json import logging +import os +import threading from typing import Iterable import grpc @@ -59,8 +61,11 @@ def get_sample_keys_and_weights( # pylint: disable-next=unused-argument request.worker_id, request.partition_id, ) + tid = threading.get_native_id() + pid = os.getpid() + logger.info( - f"[Pipeline {pipeline_id}]: Fetching samples for trigger id {trigger_id}" + f"[{pid}][{tid}][Pipeline {pipeline_id}]: Fetching samples for trigger id {trigger_id}" + f" and worker id {worker_id} and partition id {partition_id}" ) diff --git a/modyn/selector/internal/grpc/selector_server.py b/modyn/selector/internal/grpc/selector_server.py index 0d4345be0..80debf1ec 100644 --- a/modyn/selector/internal/grpc/selector_server.py +++ b/modyn/selector/internal/grpc/selector_server.py @@ -1,41 +1,44 @@ import logging -from concurrent import futures +from typing import Any -import grpc +from modyn.common.grpc import GenericGRPCServer from 
modyn.selector.internal.grpc.generated.selector_pb2_grpc import add_SelectorServicer_to_server # noqa: E402, E501 from modyn.selector.internal.grpc.selector_grpc_servicer import SelectorGRPCServicer from modyn.selector.internal.selector_manager import SelectorManager -from modyn.utils import MAX_MESSAGE_SIZE logger = logging.getLogger(__name__) -class SelectorServer: +class SelectorGRPCServer(GenericGRPCServer): + @staticmethod + def callback(modyn_config: dict, server: Any, selector_manager: SelectorManager) -> None: + add_SelectorServicer_to_server( + SelectorGRPCServicer(selector_manager, modyn_config["selector"]["sample_batch_size"]), server + ) + def __init__(self, modyn_config: dict) -> None: self.modyn_config = modyn_config self.selector_manager = SelectorManager(modyn_config) - self.grpc_servicer = SelectorGRPCServicer( - self.selector_manager, self.modyn_config["selector"]["sample_batch_size"] - ) - self._add_servicer_to_server_func = add_SelectorServicer_to_server - - def prepare_server(self) -> grpc.server: - server = grpc.server( - futures.ThreadPoolExecutor(max_workers=10), - options=[ - ("grpc.max_receive_message_length", MAX_MESSAGE_SIZE), - ("grpc.max_send_message_length", MAX_MESSAGE_SIZE), - ], - ) - self._add_servicer_to_server_func(self.grpc_servicer, server) - return server - - def run(self) -> None: - server = self.prepare_server() - logger.info(f"Starting server. Listening on port {self.modyn_config['selector']['port']}.") - server.add_insecure_port("[::]:" + self.modyn_config["selector"]["port"]) - server.start() - server.wait_for_termination() + + callback_kwargs = {"selector_manager": self.selector_manager} + super().__init__(modyn_config, modyn_config["selector"]["port"], SelectorGRPCServer.callback, callback_kwargs) + + def __getstate__(self) -> dict[str, Any]: + state = self.__dict__.copy() + if "add_servicer_callback" in state: + del state["add_servicer_callback"] + + return state + + def __exit__(self, exc_type: type, exc_val: Exception, exc_tb: Exception) -> None: + """Exit the context manager. 
+ + Args: + exc_type (type): exception type + exc_val (Exception): exception value + exc_tb (Exception): exception traceback + """ + super().__exit__(exc_type, exc_val, exc_tb) if ( "cleanup_trigger_samples_after_shutdown" in self.modyn_config["selector"] and self.modyn_config["selector"]["cleanup_trigger_samples_after_shutdown"] diff --git a/modyn/selector/internal/selector_manager.py b/modyn/selector/internal/selector_manager.py index 51fa6bf89..c3f2fed9f 100644 --- a/modyn/selector/internal/selector_manager.py +++ b/modyn/selector/internal/selector_manager.py @@ -4,10 +4,13 @@ import logging import os import shutil +from multiprocessing import Manager +from multiprocessing.managers import DictProxy from pathlib import Path -from threading import Lock +from typing import Any, Optional from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models.pipelines import Pipeline from modyn.selector.internal.selector_strategies.abstract_selection_strategy import AbstractSelectionStrategy from modyn.selector.selector import Selector from modyn.utils.utils import dynamic_module_import, is_directory_writable @@ -18,14 +21,25 @@ class SelectorManager: def __init__(self, modyn_config: dict) -> None: self._modyn_config = modyn_config + self._manager = Manager() self._selectors: dict[int, Selector] = {} - self._selector_locks: dict[int, Lock] = {} - self._next_pipeline_lock = Lock() + self._selector_locks: DictProxy[int, Any] = self._manager.dict() + self._next_pipeline_lock = self._manager.Lock() self._selector_cache_size = self._modyn_config["selector"]["keys_in_selector_cache"] + # TODO(309): currently we have to prepare N locks and then share. + # This is because we cannot share the manager with subprocesses. + # For now not a big problem since we mostly run one pipeline but we might want to redesign this. + self._prepared_locks = [self._manager.Lock() for _ in range(64)] + self.init_metadata_db() self._init_trigger_sample_directory() + def __getstate__(self) -> dict: + state = self.__dict__.copy() + del state["_manager"] + return state + def init_metadata_db(self) -> None: with MetadataDatabaseConnection(self._modyn_config) as database: database.create_tables() @@ -57,6 +71,30 @@ def _init_trigger_sample_directory(self) -> None: + f"Directory info: {os.stat(trigger_sample_directory)}" ) + def _populate_pipeline_if_exists(self, pipeline_id: int) -> None: + if pipeline_id in self._selectors: + return + + with MetadataDatabaseConnection(self._modyn_config) as database: + pipeline: Optional[Pipeline] = database.session.get(Pipeline, pipeline_id) + if pipeline is None: + return + logging.info( + "[%d] Instantiating new selector for pipeline %d" + + " that was in the DB but previously unknown to this process", + os.getpid(), + pipeline_id, + ) + self._selector_locks[pipeline_id] = self._prepared_locks[pipeline_id % len(self._prepared_locks)] + + self._instantiate_selector(pipeline_id, pipeline.num_workers, pipeline.selection_strategy) + + def _instantiate_selector(self, pipeline_id: int, num_workers: int, selection_strategy: str) -> None: + assert pipeline_id in self._selector_locks, f"Trying to register pipeline {pipeline_id} without existing lock!" 
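+ # The strategy is persisted as a JSON string in the pipelines table, so it is parsed before instantiation.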
+ selection_strategy = self._instantiate_strategy(json.loads(selection_strategy), pipeline_id) + selector = Selector(selection_strategy, pipeline_id, num_workers, self._modyn_config, self._selector_cache_size) + self._selectors[pipeline_id] = selector + def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: """ Registers a new pipeline at the Selector. @@ -70,12 +108,11 @@ def register_pipeline(self, num_workers: int, selection_strategy: str) -> int: with self._next_pipeline_lock: with MetadataDatabaseConnection(self._modyn_config) as database: - pipeline_id = database.register_pipeline(num_workers) + pipeline_id = database.register_pipeline(num_workers, selection_strategy) + + self._selector_locks[pipeline_id] = self._prepared_locks[pipeline_id % len(self._prepared_locks)] + self._instantiate_selector(pipeline_id, num_workers, selection_strategy) - selection_strategy = self._instantiate_strategy(json.loads(selection_strategy), pipeline_id) - selector = Selector(selection_strategy, pipeline_id, num_workers, self._selector_cache_size) - self._selectors[pipeline_id] = selector - self._selector_locks[pipeline_id] = Lock() return pipeline_id def get_sample_keys_and_weights( @@ -92,6 +129,8 @@ def get_sample_keys_and_weights( List of tuples for the samples to be returned to that particular worker. The first index of the tuple will be the key, and the second index will be that sample's weight. """ + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested keys from pipeline {pipeline_id} which does not exist!") @@ -104,6 +143,8 @@ def get_sample_keys_and_weights( def inform_data( self, pipeline_id: int, keys: list[int], timestamps: list[int], labels: list[int] ) -> dict[str, object]: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Informing pipeline {pipeline_id} of data. Pipeline does not exist!") @@ -113,6 +154,8 @@ def inform_data( def inform_data_and_trigger( self, pipeline_id: int, keys: list[int], timestamps: list[int], labels: list[int] ) -> tuple[int, dict[str, object]]: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Informing pipeline {pipeline_id} of data and triggering. 
Pipeline does not exist!") @@ -120,30 +163,40 @@ def inform_data_and_trigger( return self._selectors[pipeline_id].inform_data_and_trigger(keys, timestamps, labels) def get_number_of_samples(self, pipeline_id: int, trigger_id: int) -> int: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested number of samples from pipeline {pipeline_id} which does not exist!") return self._selectors[pipeline_id].get_number_of_samples(trigger_id) def get_status_bar_scale(self, pipeline_id: int) -> int: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested status bar scale from pipeline {pipeline_id} which does not exist!") return self._selectors[pipeline_id].get_status_bar_scale() def get_number_of_partitions(self, pipeline_id: int, trigger_id: int) -> int: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested number of partitions from pipeline {pipeline_id} which does not exist!") return self._selectors[pipeline_id].get_number_of_partitions(trigger_id) def get_available_labels(self, pipeline_id: int) -> list[int]: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested available labels from pipeline {pipeline_id} which does not exist!") return self._selectors[pipeline_id].get_available_labels() def uses_weights(self, pipeline_id: int) -> bool: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested whether the pipeline {pipeline_id} uses weights but it does not exist!") @@ -169,6 +222,8 @@ def _instantiate_strategy(self, selection_strategy: dict, pipeline_id: int) -> A return strategy_handler(config, self._modyn_config, pipeline_id, maximum_keys_in_memory) def get_selection_strategy_remote(self, pipeline_id: int) -> tuple[bool, str, dict]: + self._populate_pipeline_if_exists(pipeline_id) + if pipeline_id not in self._selectors: raise ValueError(f"Requested selection strategy for pipeline {pipeline_id} which does not exist!") diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/__init__.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/__init__.py index 617a8733c..86cc6d3e3 100644 --- a/modyn/selector/internal/selector_strategies/downsampling_strategies/__init__.py +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/__init__.py @@ -1,10 +1,15 @@ import os from .abstract_downsampling_strategy import AbstractDownsamplingStrategy # noqa: F401 +from .craig_downsampling_strategy import CraigDownsamplingStrategy # noqa: F401 from .downsampling_scheduler import DownsamplingScheduler, instantiate_scheduler # noqa: F401 +from .gradmatch_downsampling_strategy import GradMatchDownsamplingStrategy # noqa: F401 from .gradnorm_downsampling_strategy import GradNormDownsamplingStrategy # noqa: F401 +from .kcentergreedy_downsampling_strategy import KcenterGreedyDownsamplingStrategy # noqa: F401 from .loss_downsampling_strategy import LossDownsamplingStrategy # noqa: F401 from .no_downsampling_strategy import NoDownsamplingStrategy # noqa: F401 +from .submodular_downsampling_strategy import SubmodularDownsamplingStrategy # noqa: F401 +from .uncertainty_downsampling_strategy import UncertaintyDownsamplingStrategy # noqa: F401 from .utils import instantiate_downsampler # noqa: F401 files = os.listdir(os.path.dirname(__file__)) diff --git 
a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py index f08cfc319..91f2bdfc4 100644 --- a/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/abstract_downsampling_strategy.py @@ -49,6 +49,7 @@ def __init__(self, downsampling_config: dict, maximum_keys_in_memory: int) -> No self.requires_remote_computation = True self.maximum_keys_in_memory = maximum_keys_in_memory + self.downsampling_config = downsampling_config self.downsampling_params = self._build_downsampling_params() self.status_bar_scale = self._compute_status_bar_scale() diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/craig_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/craig_downsampling_strategy.py new file mode 100644 index 000000000..3bea03b13 --- /dev/null +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/craig_downsampling_strategy.py @@ -0,0 +1,18 @@ +from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy +from modyn.utils import DownsamplingMode + + +class CraigDownsamplingStrategy(AbstractDownsamplingStrategy): + def __init__(self, downsampling_config: dict, maximum_keys_in_memory: int): + super().__init__(downsampling_config, maximum_keys_in_memory) + + self.remote_downsampling_strategy_name = "RemoteCraigDownsamplingStrategy" + + def _build_downsampling_params(self) -> dict: + config = super()._build_downsampling_params() + config["selection_batch"] = self.downsampling_config.get("selection_batch", 64) + config["balance"] = self.downsampling_config.get("balance", False) + config["greedy"] = self.downsampling_config.get("greedy", "NaiveGreedy") + if config["balance"] and self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: + raise ValueError("Balanced sampling (balance=True) can be used only in Sample then Batch mode.") + return config diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/gradmatch_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/gradmatch_downsampling_strategy.py new file mode 100644 index 000000000..6b4bd4454 --- /dev/null +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/gradmatch_downsampling_strategy.py @@ -0,0 +1,16 @@ +from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy +from modyn.utils import DownsamplingMode + + +class GradMatchDownsamplingStrategy(AbstractDownsamplingStrategy): + def __init__(self, downsampling_config: dict, maximum_keys_in_memory: int): + super().__init__(downsampling_config, maximum_keys_in_memory) + + self.remote_downsampling_strategy_name = "RemoteGradMatchDownsamplingStrategy" + + def _build_downsampling_params(self) -> dict: + config = super()._build_downsampling_params() + config["balance"] = self.downsampling_config.get("balance", False) + if config["balance"] and self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: + raise ValueError("Balanced sampling (balance=True) can be used only in Sample then Batch mode.") + return config diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/kcentergreedy_downsampling_strategy.py 
b/modyn/selector/internal/selector_strategies/downsampling_strategies/kcentergreedy_downsampling_strategy.py new file mode 100644 index 000000000..7b4dfe871 --- /dev/null +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/kcentergreedy_downsampling_strategy.py @@ -0,0 +1,16 @@ +from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy +from modyn.utils import DownsamplingMode + + +class KcenterGreedyDownsamplingStrategy(AbstractDownsamplingStrategy): + def __init__(self, downsampling_config: dict, maximum_keys_in_memory: int): + super().__init__(downsampling_config, maximum_keys_in_memory) + + self.remote_downsampling_strategy_name = "RemoteKcenterGreedyDownsamplingStrategy" + + def _build_downsampling_params(self) -> dict: + config = super()._build_downsampling_params() + config["balance"] = self.downsampling_config.get("balance", False) + if config["balance"] and self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: + raise ValueError("Balanced sampling (balance=True) can be used only in Sample then Batch mode.") + return config diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/submodular_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/submodular_downsampling_strategy.py new file mode 100644 index 000000000..df41c2f47 --- /dev/null +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/submodular_downsampling_strategy.py @@ -0,0 +1,33 @@ +from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.submodular_function import ( + SUBMODULAR_FUNCTIONS, +) +from modyn.utils import DownsamplingMode + + +class SubmodularDownsamplingStrategy(AbstractDownsamplingStrategy): + def __init__(self, downsampling_config: dict, maximum_keys_in_memory: int): + super().__init__(downsampling_config, maximum_keys_in_memory) + + self.remote_downsampling_strategy_name = "RemoteSubmodularDownsamplingStrategy" + + def _build_downsampling_params(self) -> dict: + config = super()._build_downsampling_params() + + if "submodular_function" not in self.downsampling_config: + raise ValueError( + f"Please specify the submodular function used to select the datapoints. 
" + f"Available functions: {SUBMODULAR_FUNCTIONS}, param submodular_function" + ) + config["submodular_function"] = self.downsampling_config["submodular_function"] + + if "submodular_optimizer" in self.downsampling_config: + config["submodular_optimizer"] = self.downsampling_config["submodular_optimizer"] + + config["selection_batch"] = self.downsampling_config.get("selection_batch", 64) + + config["balance"] = self.downsampling_config.get("balance", False) + if config["balance"] and self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: + raise ValueError("Balanced sampling (balance=True) can be used only in Sample then Batch mode.") + + return config diff --git a/modyn/selector/internal/selector_strategies/downsampling_strategies/uncertainty_downsampling_strategy.py b/modyn/selector/internal/selector_strategies/downsampling_strategies/uncertainty_downsampling_strategy.py new file mode 100644 index 000000000..4f709d099 --- /dev/null +++ b/modyn/selector/internal/selector_strategies/downsampling_strategies/uncertainty_downsampling_strategy.py @@ -0,0 +1,26 @@ +from modyn.selector.internal.selector_strategies.downsampling_strategies import AbstractDownsamplingStrategy +from modyn.utils import DownsamplingMode + + +class UncertaintyDownsamplingStrategy(AbstractDownsamplingStrategy): + def __init__(self, downsampling_config: dict, maximum_keys_in_memory: int): + super().__init__(downsampling_config, maximum_keys_in_memory) + + self.remote_downsampling_strategy_name = "RemoteUncertaintyDownsamplingStrategy" + + def _build_downsampling_params(self) -> dict: + config = super()._build_downsampling_params() + + if "score_metric" not in self.downsampling_config: + raise ValueError( + "Please specify the metric used to score uncertainty for the datapoints. 
" + "Available metrics : LeastConfidence, Entropy, Margin" + "Use the pipeline parameter score_metric" + ) + config["score_metric"] = self.downsampling_config["score_metric"] + + config["balance"] = self.downsampling_config.get("balance", False) + if config["balance"] and self.downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE: + raise ValueError("Balanced sampling (balance=True) can be used only in Sample then Batch mode.") + + return config diff --git a/modyn/selector/selector.py b/modyn/selector/selector.py index f2ee1ea9a..0fcc9a71d 100644 --- a/modyn/selector/selector.py +++ b/modyn/selector/selector.py @@ -1,7 +1,9 @@ from __future__ import annotations -from typing import Any, Dict +from typing import Any, Dict, Optional +from modyn.metadata_database.metadata_database_connection import MetadataDatabaseConnection +from modyn.metadata_database.models.triggers import Trigger from modyn.selector.internal.selector_strategies import CoresetStrategy from modyn.selector.internal.selector_strategies.abstract_selection_strategy import AbstractSelectionStrategy from modyn.utils.utils import flatten, get_partition_for_worker @@ -13,12 +15,19 @@ class Selector: """ def __init__( - self, strategy: AbstractSelectionStrategy, pipeline_id: int, num_workers: int, cache_size: int = 100000 + self, + strategy: AbstractSelectionStrategy, + pipeline_id: int, + num_workers: int, + modyn_config: dict, + cache_size: int = 100000, ) -> None: self._strategy = strategy self._pipeline_id = pipeline_id self._num_workers = num_workers + self._modyn_config = modyn_config + # TODO(#308): Share partition cache between selector instances self._trigger_cache: Dict[int, list[list[tuple[int, float]]]] = {} self._maximum_keys_in_cache = cache_size self._current_keys_in_cache = 0 @@ -26,6 +35,22 @@ def __init__( self._trigger_size_cache: Dict[int, int] = {} self._trigger_partition_cache: Dict[int, int] = {} + def _populate_trigger_if_exists(self, trigger_id: int) -> None: + if trigger_id in self._trigger_size_cache: + assert trigger_id in self._trigger_partition_cache, "Inconsistent state" + return + + if "metadata_database" not in self._modyn_config: # Can happen in tests + return + + with MetadataDatabaseConnection(self._modyn_config) as database: + trigger: Optional[Trigger] = database.session.get(Trigger, (trigger_id, self._pipeline_id)) + if trigger is None: + return + + self._trigger_size_cache[trigger_id] = trigger.num_keys + self._trigger_partition_cache[trigger_id] = trigger.num_partitions + def get_sample_keys_and_weights( self, trigger_id: int, worker_id: int, partition_id: int ) -> list[tuple[int, float]]: @@ -40,6 +65,8 @@ def get_sample_keys_and_weights( List of tuples for the samples to be returned to that particular worker. The first index of the tuple will be the key, and the second index will be that sample's weight. 
""" + self._populate_trigger_if_exists(trigger_id) + if trigger_id not in self._trigger_partition_cache or partition_id >= self._trigger_partition_cache[trigger_id]: raise ValueError(f"Invalid request: Trigger {trigger_id}, partition {partition_id}") if worker_id < 0 or worker_id >= self._num_workers: @@ -95,6 +122,8 @@ def inform_data_and_trigger( return trigger_id, log def get_number_of_samples(self, trigger_id: int) -> int: + self._populate_trigger_if_exists(trigger_id) + if trigger_id not in self._trigger_size_cache: raise ValueError(f"Trigger ID {trigger_id} does not exist!") @@ -108,6 +137,8 @@ def get_status_bar_scale(self) -> int: return self._strategy.training_status_bar_scale def get_number_of_partitions(self, trigger_id: int) -> int: + self._populate_trigger_if_exists(trigger_id) + if trigger_id not in self._trigger_partition_cache: raise ValueError(f"Trigger ID {trigger_id} does not exist!") diff --git a/modyn/selector/selector_entrypoint.py b/modyn/selector/selector_entrypoint.py index 152b4c125..0795819c1 100644 --- a/modyn/selector/selector_entrypoint.py +++ b/modyn/selector/selector_entrypoint.py @@ -1,9 +1,11 @@ import argparse import logging +import multiprocessing as mp +import os import pathlib import yaml -from modyn.selector.internal.grpc.selector_server import SelectorServer +from modyn.selector.internal.grpc.selector_server import SelectorGRPCServer logging.basicConfig( level=logging.NOTSET, @@ -12,6 +14,14 @@ ) logger = logging.getLogger(__name__) +# We need to do this at the top because other dependencies otherwise set fork. +try: + mp.set_start_method("spawn") +except RuntimeError as error: + if mp.get_start_method() != "spawn" and "PYTEST_CURRENT_TEST" not in os.environ: + logger.error("Start method is already set to {}", mp.get_start_method()) + raise error + def setup_argparser() -> argparse.ArgumentParser: parser_ = argparse.ArgumentParser(description="Modyn Selector") @@ -35,9 +45,9 @@ def main() -> None: modyn_config = yaml.safe_load(config_file) logger.info("Initializing selector server.") - selector = SelectorServer(modyn_config) - logger.info("Starting selector server.") - selector.run() + + with SelectorGRPCServer(modyn_config): + pass logger.info("Selector server returned, exiting.") diff --git a/modyn/supervisor/internal/grpc_handler.py b/modyn/supervisor/internal/grpc_handler.py index c1328461b..8d2136ad8 100644 --- a/modyn/supervisor/internal/grpc_handler.py +++ b/modyn/supervisor/internal/grpc_handler.py @@ -300,6 +300,25 @@ def start_training( else: epochs_per_trigger = 1 + if "num_prefetched_partitions" in pipeline_config["training"]: + num_prefetched_partitions = pipeline_config["training"]["num_prefetched_partitions"] + else: + if "prefetched_partitions" in pipeline_config["training"]: + raise ValueError( + "Found `prefetched_partitions` instead of `num_prefetched_partitions`in training configuration." + + " Please rename/remove that configuration" + ) + logger.warning("Number of prefetched partitions not explicitly given in training config - defaulting to 1.") + num_prefetched_partitions = 1 + + if "parallel_prefetch_requests" in pipeline_config["training"]: + parallel_prefetch_requests = pipeline_config["training"]["parallel_prefetch_requests"] + else: + logger.warning( + "Number of parallel prefetch requests not explicitly given in training config - defaulting to 1." 
+ ) + parallel_prefetch_requests = 1 + if "seed" in pipeline_config["training"]: seed = pipeline_config["training"]["seed"] else: @@ -366,6 +385,8 @@ def start_training( "lr_scheduler": TrainerServerJsonString(value=json.dumps(lr_scheduler_configs)), "grad_scaler_configuration": TrainerServerJsonString(value=json.dumps(grad_scaler_config)), "epochs_per_trigger": epochs_per_trigger, + "num_prefetched_partitions": num_prefetched_partitions, + "parallel_prefetch_requests": parallel_prefetch_requests, "seed": seed, "tokenizer": PythonString(value=tokenizer) if tokenizer is not None else None, } diff --git a/modyn/tests/common/grpc/test_grpc_helpers.py b/modyn/tests/common/grpc/test_grpc_helpers.py new file mode 100644 index 000000000..9c6d4014b --- /dev/null +++ b/modyn/tests/common/grpc/test_grpc_helpers.py @@ -0,0 +1,7 @@ +from modyn.common.grpc import GenericGRPCServer + +# TODO(310): add more meaningful tests + + +def test_init(): + GenericGRPCServer({}, "1234", lambda x: None) diff --git a/modyn/tests/metadata_database/models/test_pipelines.py b/modyn/tests/metadata_database/models/test_pipelines.py index ba618fec2..cd78125e3 100644 --- a/modyn/tests/metadata_database/models/test_pipelines.py +++ b/modyn/tests/metadata_database/models/test_pipelines.py @@ -19,9 +19,7 @@ def session(): def test_add_pipeline(session): - pipeline = Pipeline( - num_workers=10, - ) + pipeline = Pipeline(num_workers=10, selection_strategy="{}") session.add(pipeline) session.commit() @@ -30,9 +28,7 @@ def test_add_pipeline(session): def test_update_pipeline(session): - pipeline = Pipeline( - num_workers=10, - ) + pipeline = Pipeline(num_workers=10, selection_strategy="{}") session.add(pipeline) session.commit() @@ -41,12 +37,11 @@ def test_update_pipeline(session): assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first() is not None assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().num_workers == 20 + assert session.query(Pipeline).filter(Pipeline.pipeline_id == 1).first().selection_strategy == "{}" def test_delete_pipeline(session): - pipeline = Pipeline( - num_workers=10, - ) + pipeline = Pipeline(num_workers=10, selection_strategy="{}") session.add(pipeline) session.commit() diff --git a/modyn/tests/metadata_database/test_metadata_database_connection.py b/modyn/tests/metadata_database/test_metadata_database_connection.py index eb96e579d..51accd949 100644 --- a/modyn/tests/metadata_database/test_metadata_database_connection.py +++ b/modyn/tests/metadata_database/test_metadata_database_connection.py @@ -24,16 +24,16 @@ def test_database_connection(): def test_register_pipeline(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "{}") assert pipeline_id == 1 - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "{}") assert pipeline_id == 2 def test_add_trained_model(): with MetadataDatabaseConnection(get_minimal_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "{}") trigger = Trigger(pipeline_id=pipeline_id, trigger_id=5) database.session.add(trigger) diff --git a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py index 8e13a90f5..7a84c0533 100644 --- 
a/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py +++ b/modyn/tests/model_storage/internal/grpc/test_model_storage_grpc_servicer.py @@ -43,13 +43,13 @@ def setup(): with MetadataDatabaseConnection(get_modyn_config()) as database: database.create_tables() - pipeline_id = database.register_pipeline(1) + pipeline_id = database.register_pipeline(1, "{}") trigger = Trigger(trigger_id=10, pipeline_id=pipeline_id) database.session.add(trigger) database.session.commit() - pipeline2 = database.register_pipeline(4) + pipeline2 = database.register_pipeline(4, "{}") trigger2 = Trigger(trigger_id=50, pipeline_id=pipeline2) database.session.add(trigger2) diff --git a/modyn/tests/selector/internal/grpc/test_selector_server.py b/modyn/tests/selector/internal/grpc/test_selector_server.py index 47c3e73e2..1f91013e0 100644 --- a/modyn/tests/selector/internal/grpc/test_selector_server.py +++ b/modyn/tests/selector/internal/grpc/test_selector_server.py @@ -1,9 +1,8 @@ # pylint: disable=unused-argument,redefined-outer-name import tempfile -from unittest import mock -from unittest.mock import MagicMock, patch +from unittest.mock import patch -from modyn.selector.internal.grpc.selector_server import SelectorServer +from modyn.selector.internal.grpc.selector_server import SelectorGRPCServer from modyn.selector.internal.selector_manager import SelectorManager @@ -27,42 +26,6 @@ def test_init(): with tempfile.TemporaryDirectory() as tmp_dir: config = get_modyn_config() config["selector"]["trigger_sample_directory"] = tmp_dir - grpc_server = SelectorServer(config) + grpc_server = SelectorGRPCServer(config) assert grpc_server.modyn_config == config - - -@patch.object(SelectorManager, "init_metadata_db", noop_init_metadata_db) -def test_prepare_server(): - with tempfile.TemporaryDirectory() as tmp_dir: - config = get_modyn_config() - config["selector"]["trigger_sample_directory"] = tmp_dir - grpc_server = SelectorServer(config) - mock_add = mock.Mock() - grpc_server._add_servicer_to_server_func = mock_add - - assert grpc_server.prepare_server() is not None - - mock_add.assert_called_once() - - -@patch.object(SelectorManager, "init_metadata_db", noop_init_metadata_db) -@patch.object(SelectorServer, "prepare_server") -def test_run(test_prepare_server: MagicMock): - with tempfile.TemporaryDirectory() as tmp_dir: - config = get_modyn_config() - config["selector"]["trigger_sample_directory"] = tmp_dir - grpc_server = SelectorServer(config) - - mock_start = mock.Mock() - mock_wait = mock.Mock() - - server = grpc_server.prepare_server() - server.start = mock_start - server.wait_for_termination = mock_wait - - test_prepare_server.return_value = server - - grpc_server.run() - - mock_start.assert_called_once() - mock_wait.assert_called_once() + assert isinstance(grpc_server.selector_manager, SelectorManager) diff --git a/modyn/tests/selector/internal/test_selector_manager.py b/modyn/tests/selector/internal/test_selector_manager.py index 045acbdcf..7936dd00a 100644 --- a/modyn/tests/selector/internal/test_selector_manager.py +++ b/modyn/tests/selector/internal/test_selector_manager.py @@ -43,8 +43,11 @@ def _reset_state(self) -> None: # pylint: disable=unused-argument class MockDatabaseConnection: def __init__(self, modyn_config: dict): # pylint: disable=super-init-not-called,unused-argument self.current_pipeline_id = 0 + self.session = MockSession() - def register_pipeline(self, number_of_workers: int) -> Optional[int]: # pylint: disable=unused-argument + def register_pipeline( + self, 
number_of_workers: int, selection_strategy: str # pylint: disable=unused-argument + ) -> Optional[int]: pid = self.current_pipeline_id self.current_pipeline_id += 1 return pid @@ -56,6 +59,11 @@ def __exit__(self, exc_type: type, exc_val: Exception, exc_tb: Exception): pass +class MockSession: + def get(self, some_type, pipeline_id): # pylint: disable=unused-argument + return None + + def noop_init_metadata_db(self): # pylint: disable=unused-argument pass diff --git a/modyn/tests/selector/test_selector.py b/modyn/tests/selector/test_selector.py index 0760b9888..78ab06d31 100644 --- a/modyn/tests/selector/test_selector.py +++ b/modyn/tests/selector/test_selector.py @@ -21,15 +21,16 @@ def _reset_state(self) -> None: def test_init(): - selec = Selector(MockStrategy(), 42, 2) + selec = Selector(MockStrategy(), 42, 2, {}) assert selec._pipeline_id == 42 assert selec._num_workers == 2 def test_get_sample_keys_and_weight_cached(): - selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) selector._trigger_cache[42] = [[(10, 1.0), (11, 1.0)], [(12, 1.0), (13, 1.0)]] selector._trigger_partition_cache[42] = 2 + selector._trigger_size_cache[42] = 4 result = selector.get_sample_keys_and_weights(42, 0, 0) assert result == [(10, 1.0)] @@ -46,9 +47,10 @@ def test_get_sample_keys_and_weight_cached(): @patch.object(MockStrategy, "get_trigger_partition_keys") def test_get_sample_keys_and_weight_no_cache(test_get_trigger_partition_keys: MagicMock): - selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) selector._trigger_partition_cache[42] = 2 test_get_trigger_partition_keys.return_value = [(10, 1.0), (11, 1.0)] + selector._trigger_size_cache[42] = 2 result = selector.get_sample_keys_and_weights(42, 2, 0) assert result == [(10, 1.0), (11, 1.0)] @@ -59,7 +61,7 @@ def test_get_sample_keys_and_weight_no_cache(test_get_trigger_partition_keys: Ma @patch.object(MockStrategy, "inform_data") def test_inform_data(test_inform_data: MagicMock): - selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) selector.inform_data([10, 11, 12], [0, 1, 2], ["cat", "dog", "cat"]) test_inform_data.assert_called_once_with([10, 11, 12], [0, 1, 2], ["cat", "dog", "cat"]) @@ -71,7 +73,7 @@ def test_inform_data(test_inform_data: MagicMock): def test_inform_data_and_trigger_caching( test_get_trigger_partition_keys: MagicMock, test_trigger: MagicMock, test_inform_data: MagicMock ): - selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) assert selector._current_keys_in_cache == 0 test_trigger.return_value = (42, 2, 2, {}) # 2 keys in trigger, 2 partitions @@ -88,6 +90,7 @@ def test_inform_data_and_trigger_caching( # This test configures the selector to store the partitions in memory assert selector._trigger_cache[42] == [[(10, 1.0)], [(10, 1.0)]] assert selector._trigger_partition_cache[42] == 2 + assert selector._trigger_size_cache[42] == 2 @patch.object(MockStrategy, "inform_data") @@ -96,7 +99,7 @@ def test_inform_data_and_trigger_caching( def test_inform_data_and_trigger_nocaching( test_get_trigger_partition_keys: MagicMock, test_trigger: MagicMock, test_inform_data: MagicMock ): - selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) assert selector._current_keys_in_cache == 0 test_trigger.return_value = (42, 2, 2, {}) # 2 keys in trigger, 2 partitions @@ -116,8 +119,9 @@ def test_inform_data_and_trigger_nocaching( def test_get_number_of_samples(): 
- selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) selector._trigger_size_cache[42] = 2 + selector._trigger_partition_cache[42] = 1 assert selector.get_number_of_samples(42) == 2 @@ -126,8 +130,9 @@ def test_get_number_of_samples(): def test_get_number_of_partitions(): - selector = Selector(MockStrategy(), 42, 3) + selector = Selector(MockStrategy(), 42, 3, {}) selector._trigger_partition_cache[42] = 2 + selector._trigger_size_cache[42] = 2 assert selector.get_number_of_partitions(42) == 2 diff --git a/modyn/tests/selector/test_selector_entrypoint.py b/modyn/tests/selector/test_selector_entrypoint.py index 1b7083efe..53c70fba8 100644 --- a/modyn/tests/selector/test_selector_entrypoint.py +++ b/modyn/tests/selector/test_selector_entrypoint.py @@ -6,7 +6,7 @@ import pathlib from unittest.mock import patch -from modyn.selector.internal.grpc.selector_server import SelectorServer +from modyn.selector.internal.grpc.selector_server import SelectorGRPCServer SCRIPT_PATH = pathlib.Path(os.path.realpath(__file__)) @@ -19,19 +19,25 @@ def noop_constructor_mock(self, modyn_config: dict) -> None: pass -def noop_run(self) -> None: +def noop_enter(self) -> None: pass -@patch.object(SelectorServer, "__init__", noop_constructor_mock) -@patch.object(SelectorServer, "run", noop_run) +def noop_exit(self, exc_type, exc_val, exc_tb) -> None: + pass + + +@patch.object(SelectorGRPCServer, "__init__", noop_constructor_mock) +@patch.object(SelectorGRPCServer, "__enter__", noop_enter) +@patch.object(SelectorGRPCServer, "__exit__", noop_exit) def test_trainer_server_script_runs(script_runner): ret = script_runner.run("_modyn_selector", str(EXAMPLE_SYSTEM_CONFIG)) assert ret.success -@patch.object(SelectorServer, "__init__", noop_constructor_mock) -@patch.object(SelectorServer, "run", noop_run) +@patch.object(SelectorGRPCServer, "__init__", noop_constructor_mock) +@patch.object(SelectorGRPCServer, "__enter__", noop_enter) +@patch.object(SelectorGRPCServer, "__exit__", noop_exit) def test_trainer_server_fails_on_non_existing_system_config(script_runner): ret = script_runner.run("_modyn_selector", str(NO_FILE)) assert not ret.success diff --git a/modyn/tests/trainer_server/internal/data/key_sources/test_local_key_source.py b/modyn/tests/trainer_server/internal/data/key_sources/test_local_key_source.py index 9f953b58d..7d6931aa1 100644 --- a/modyn/tests/trainer_server/internal/data/key_sources/test_local_key_source.py +++ b/modyn/tests/trainer_server/internal/data/key_sources/test_local_key_source.py @@ -1,6 +1,5 @@ import math -import pytest import torch from modyn.trainer_server.internal.dataset.key_sources import LocalKeySource from modyn.trainer_server.internal.dataset.local_dataset_writer import LocalDatasetWriter @@ -75,9 +74,9 @@ def test_read_dirty_directory(): write_directory(other_pipeline, 1, TMP_PATH_TEST, number_of_files=10, maximum_keys_in_memory=maximum_keys_in_memory) keysource = LocalKeySource(pipeline_id=current_pipeline, trigger_id=1, offline_dataset_path=TMP_PATH_TEST) + assert keysource.get_num_data_partitions() == 0 - with pytest.raises(FileNotFoundError): - keysource.get_keys_and_weights(0, 0) + assert keysource.get_keys_and_weights(0, 0) == ([], []) write_directory( current_pipeline, 1, TMP_PATH_TEST, number_of_files=4, maximum_keys_in_memory=maximum_keys_in_memory diff --git a/modyn/tests/trainer_server/internal/data/test_data_utils.py b/modyn/tests/trainer_server/internal/data/test_data_utils.py index 8a2729d67..ad7fea1d1 100644 --- 
a/modyn/tests/trainer_server/internal/data/test_data_utils.py +++ b/modyn/tests/trainer_server/internal/data/test_data_utils.py @@ -30,7 +30,7 @@ def test_prepare_dataloaders( test_weights, test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector ): train_dataloader, _ = prepare_dataloaders( - 1, 1, "MNIST", 4, 128, get_mock_bytes_parser(), [], "", "", 42, None, None + 1, 1, "MNIST", 4, 128, get_mock_bytes_parser(), [], "", "", 42, 5, 5, None, None ) assert train_dataloader.num_workers == 4 diff --git a/modyn/tests/trainer_server/internal/data/test_online_dataset.py b/modyn/tests/trainer_server/internal/data/test_online_dataset.py index 7a391290a..6b8d935e7 100644 --- a/modyn/tests/trainer_server/internal/data/test_online_dataset.py +++ b/modyn/tests/trainer_server/internal/data/test_online_dataset.py @@ -1,4 +1,4 @@ -# pylint: disable=unused-argument, no-name-in-module +# pylint: disable=unused-argument, no-name-in-module, too-many-locals import platform from unittest.mock import patch @@ -67,6 +67,8 @@ def test_invalid_bytes_parser(test_weights, test_grpc_connection_established): training_id=42, tokenizer=None, log_path=None, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, )._init_transforms() with pytest.raises(ValueError): @@ -81,6 +83,8 @@ def test_invalid_bytes_parser(test_weights, test_grpc_connection_established): training_id=42, tokenizer="", log_path=None, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, )._init_transforms() @@ -104,6 +108,8 @@ def test_init(test_insecure_channel, test_grpc_connection_established, test_grpc training_id=42, tokenizer=None, log_path=None, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, ) assert online_dataset._pipeline_id == 1 assert online_dataset._trigger_id == 1 @@ -136,6 +142,8 @@ def test_get_keys_and_weights_from_selector( "training_id": 42, "tokenizer": None, "log_path": None, + "num_prefetched_partitions": 1, + "parallel_prefetch_requests": 1, } online_dataset = OnlineDataset(**kwargs) @@ -170,18 +178,38 @@ def test_get_data_from_storage( training_id=42, tokenizer=None, log_path=None, + num_prefetched_partitions=0, + parallel_prefetch_requests=1, ) online_dataset._init_grpc() - assert online_dataset._get_data_from_storage(list(range(10))) == ( + keys = [] + data = [] + labels = [] + + for key_list, data_list, label_list, _ in online_dataset._get_data_from_storage(list(range(10))): + keys.extend(key_list) + data.extend(data_list) + labels.extend(label_list) + + assert (keys, data, labels) == ( + list(range(10)), [bytes(f"sample{x}", "utf-8") for x in range(10)], list(range(10)), ) + result_keys = [] + result_samples = [] + result_labels = [] + permuted_list = [0, 9, 6, 5, 4, 3] - assert online_dataset._get_data_from_storage(permuted_list) == ( - [b"sample0", b"sample9", b"sample6", b"sample5", b"sample4", b"sample3"], - [0, 9, 6, 5, 4, 3], - ) + for rkey, rsam, rlbl, _ in online_dataset._get_data_from_storage(permuted_list): + result_keys.extend(rkey) + result_samples.extend(rsam) + result_labels.extend(rlbl) + + assert set(result_keys) == set(keys) + assert set(result_samples) == set(data) + assert set(result_labels) == set(labels) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @@ -229,6 +257,8 @@ def test_deserialize_torchvision_transforms( training_id=42, tokenizer=None, log_path=None, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, ) online_dataset._bytes_parser_function = 
bytes_parser_function online_dataset._setup_composed_transform() @@ -238,6 +268,8 @@ def test_deserialize_torchvision_transforms( assert transform1.__dict__ == transform2.__dict__ +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -247,7 +279,9 @@ def test_deserialize_torchvision_transforms( @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) @patch.object( - OnlineDataset, "_get_data_from_storage", return_value=([bytes(f"sample{x}", "utf-8") for x in range(10)], [1] * 10) + OnlineDataset, + "_get_data_from_storage", + return_value=[(list(range(10)), [bytes(f"sample{x}", "utf-8") for x in range(10)], [1] * 10, 0)], ) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(10)), None)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) @@ -258,6 +292,8 @@ def test_dataset_iter( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -268,6 +304,8 @@ def test_dataset_iter( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) @@ -278,6 +316,8 @@ def test_dataset_iter( assert [x[2] for x in all_data] == [1] * 10 +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -287,7 +327,9 @@ def test_dataset_iter( @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) @patch.object( - OnlineDataset, "_get_data_from_storage", return_value=([bytes(f"sample{x}", "utf-8") for x in range(10)], [1] * 10) + OnlineDataset, + "_get_data_from_storage", + return_value=[(list(range(10)), [bytes(f"sample{x}", "utf-8") for x in range(10)], [1] * 10, 0)], ) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(10)), None)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) @@ -298,6 +340,8 @@ def test_dataset_iter_with_parsing( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -308,6 +352,8 @@ def test_dataset_iter_with_parsing( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) @@ -318,6 +364,8 @@ def test_dataset_iter_with_parsing( assert [x[2] for x in all_data] == [1] * 10 
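+# The prefetching parametrizations below sweep num_prefetched_partitions and
+# parallel_prefetch_requests across degenerate (0), typical, and very large
+# (999999) values: the iteration results must not depend on the knob combination.
+# A minimal sketch of the clamping these combinations rely on (an assumption for
+# illustration only; this helper is hypothetical and not part of OnlineDataset's API):
+def _assumed_effective_parallel_requests(num_prefetched_partitions: int, parallel_prefetch_requests: int) -> int:
+    if num_prefetched_partitions <= 0:
+        # Prefetching disabled: partitions are fetched sequentially, no parallel requests.
+        return 0
+    # Otherwise at most one in-flight request per partition that may be prefetched.
+    return min(parallel_prefetch_requests, num_prefetched_partitions)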
+@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -327,7 +375,9 @@ def test_dataset_iter_with_parsing( @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) @patch.object( - OnlineDataset, "_get_data_from_storage", return_value=([x.to_bytes(2, "big") for x in range(16)], [1] * 16) + OnlineDataset, + "_get_data_from_storage", + return_value=[(list(range(16)), [x.to_bytes(2, "big") for x in range(16)], [1] * 16, 0)], ) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(16)), None)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) @@ -338,6 +388,8 @@ def test_dataloader_dataset( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -348,6 +400,8 @@ def test_dataloader_dataset( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) @@ -359,6 +413,8 @@ def test_dataloader_dataset( assert torch.equal(batch[2], torch.ones(4, dtype=torch.float64)) +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", WeightedMockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -368,7 +424,9 @@ def test_dataloader_dataset( @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) @patch.object( - OnlineDataset, "_get_data_from_storage", return_value=([x.to_bytes(2, "big") for x in range(16)], [1] * 16) + OnlineDataset, + "_get_data_from_storage", + return_value=[(list(range(16)), [x.to_bytes(2, "big") for x in range(16)], [1] * 16, 0)], ) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(16)), [2.0] * 16)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) @@ -379,6 +437,8 @@ def test_dataloader_dataset_weighted( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -389,6 +449,8 @@ def test_dataloader_dataset_weighted( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) @@ -401,6 +463,9 @@ def test_dataloader_dataset_weighted( assert torch.equal(batch[3], 2 * torch.ones(4, dtype=torch.float64)) +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("num_workers", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -409,7 +474,11 @@ def test_dataloader_dataset_weighted( ) @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) -@patch.object(OnlineDataset, "_get_data_from_storage", return_value=([x.to_bytes(2, "big") for x in range(4)], [1] * 4)) +@patch.object( + OnlineDataset, + "_get_data_from_storage", + return_value=[(list(range(4)), [x.to_bytes(2, "big") for x in range(4)], [1] * 4, 0)], +) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(4)), None)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) def test_dataloader_dataset_multi_worker( @@ -419,6 +488,9 @@ def test_dataloader_dataset_multi_worker( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + num_workers, + parallel_prefetch_requests, ): if platform.system() == "Darwin": # On macOS, spawn is the default, which loses the mocks @@ -434,10 +506,12 @@ def test_dataloader_dataset_multi_worker( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) - dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=4, num_workers=4) + dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=4, num_workers=num_workers) for batch in dataloader: assert len(batch) == 3 assert torch.equal(batch[0], torch.Tensor([0, 1, 2, 3])) @@ -463,6 +537,8 @@ def test_init_grpc(test_insecure_channel, test_grpc_connection_established, test storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, tokenizer=None, log_path=None, ) @@ -485,7 +561,9 @@ def test_init_grpc(test_insecure_channel, test_grpc_connection_established, test @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) def test_init_transforms( - test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector + test_insecure_channel, + test_grpc_connection_established, + test_grpc_connection_established_selector, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -496,6 +574,8 @@ def test_init_transforms( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=1, + parallel_prefetch_requests=1, tokenizer=None, log_path=None, ) @@ -513,6 +593,12 @@ def test_init_transforms( tv_ds.assert_called_once() +def iter_multi_partition_data_side_effect(keys): + yield (list(keys), [x.to_bytes(2, "big") for x in keys], [1] * len(keys), 0) + + +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) 
@patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -521,24 +607,15 @@ def test_init_transforms( ) @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) -@patch.object( - OnlineDataset, - "_get_data_from_storage", - side_effect=[ - ([x.to_bytes(2, "big") for x in range(16)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(16, 32)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(32, 48)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(48, 64)], [1] * 16), - ], -) +@patch.object(OnlineDataset, "_get_data_from_storage", side_effect=iter_multi_partition_data_side_effect) @patch.object( SelectorKeySource, "get_keys_and_weights", side_effect=[ - ([str(i) for i in range(16)], None), - ([str(i) for i in range(16, 32)], None), - ([str(i) for i in range(32, 48)], None), - ([str(i) for i in range(48, 64)], None), + (list(range(16)), None), + (list(range(16, 32)), None), + (list(range(32, 48)), None), + (list(range(48, 64)), None), ], ) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=4) @@ -549,6 +626,8 @@ def test_iter_multi_partition( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -559,20 +638,23 @@ def test_iter_multi_partition( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=4) - idx = 0 for idx, batch in enumerate(dataloader): assert len(batch) == 3 - assert batch[0] == (str(4 * idx), str(4 * idx + 1), str(4 * idx + 2), str(4 * idx + 3)) + assert torch.equal(batch[0], torch.Tensor([4 * idx, 4 * idx + 1, 4 * idx + 2, 4 * idx + 3])) assert torch.equal(batch[1], torch.Tensor([4 * idx, 4 * idx + 1, 4 * idx + 2, 4 * idx + 3])) assert torch.equal(batch[2], torch.ones(4, dtype=torch.float64)) assert idx == 15 +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", WeightedMockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -581,24 +663,15 @@ def test_iter_multi_partition( ) @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) -@patch.object( - OnlineDataset, - "_get_data_from_storage", - side_effect=[ - ([x.to_bytes(2, "big") for x in range(16)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(16, 32)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(32, 48)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(48, 64)], [1] * 16), - ], -) +@patch.object(OnlineDataset, "_get_data_from_storage", side_effect=iter_multi_partition_data_side_effect) @patch.object( SelectorKeySource, "get_keys_and_weights", side_effect=[ - ([str(i) for i in range(16)], [0.9] * 16), - ([str(i) for i in range(16, 32)], [0.9] * 16), - ([str(i) for i in range(32, 48)], [0.9] * 16), - ([str(i) for i in range(48, 64)], [0.9] * 16), + 
(list(range(16)), [0.9] * 16), + (list(range(16, 32)), [0.9] * 16), + (list(range(32, 48)), [0.9] * 16), + (list(range(48, 64)), [0.9] * 16), ], ) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=4) @@ -609,6 +682,8 @@ def test_iter_multi_partition_weighted( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -619,6 +694,8 @@ def test_iter_multi_partition_weighted( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) @@ -628,13 +705,15 @@ def test_iter_multi_partition_weighted( idx = 0 for idx, batch in enumerate(dataloader): assert len(batch) == 4 - assert batch[0] == (str(4 * idx), str(4 * idx + 1), str(4 * idx + 2), str(4 * idx + 3)) + assert torch.equal(batch[0], torch.Tensor([4 * idx, 4 * idx + 1, 4 * idx + 2, 4 * idx + 3])) assert torch.equal(batch[1], torch.Tensor([4 * idx, 4 * idx + 1, 4 * idx + 2, 4 * idx + 3])) assert torch.equal(batch[2], torch.ones(4, dtype=torch.float64)) assert torch.equal(batch[3], 0.9 * torch.ones(4, dtype=torch.float64)) assert idx == 15 +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -643,24 +722,15 @@ def test_iter_multi_partition_weighted( ) @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) -@patch.object( - OnlineDataset, - "_get_data_from_storage", - side_effect=[ - ([x.to_bytes(2, "big") for x in range(16)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(16, 32)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(32, 48)], [1] * 16), - ([x.to_bytes(2, "big") for x in range(48, 64)], [1] * 16), - ], -) +@patch.object(OnlineDataset, "_get_data_from_storage", side_effect=iter_multi_partition_data_side_effect) @patch.object( SelectorKeySource, "get_keys_and_weights", side_effect=[ - ([str(i) for i in range(16)], None), - ([str(i) for i in range(16, 32)], None), - ([str(i) for i in range(32, 48)], None), - ([str(i) for i in range(48, 64)], None), + (list(range(16)), None), + (list(range(16, 32)), None), + (list(range(32, 48)), None), + (list(range(48, 64)), None), ], ) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=4) @@ -671,6 +741,8 @@ def test_iter_multi_partition_cross( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -681,34 +753,35 @@ def test_iter_multi_partition_cross( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) + # Note batch size 6 instead of 4 here dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=6) idx = 0 for idx, batch in enumerate(dataloader): assert len(batch) 
== 3 if idx < 10: - assert batch[0] == ( - str(6 * idx), - str(6 * idx + 1), - str(6 * idx + 2), - str(6 * idx + 3), - str(6 * idx + 4), - str(6 * idx + 5), + assert torch.equal( + batch[0], torch.Tensor([6 * idx, 6 * idx + 1, 6 * idx + 2, 6 * idx + 3, 6 * idx + 4, 6 * idx + 5]) ) assert torch.equal( batch[1], torch.Tensor([6 * idx, 6 * idx + 1, 6 * idx + 2, 6 * idx + 3, 6 * idx + 4, 6 * idx + 5]) ) assert torch.equal(batch[2], torch.ones(6, dtype=torch.float64)) else: - assert batch[0] == ("60", "61", "62", "63") + assert torch.equal(batch[0], torch.Tensor([60, 61, 62, 63])) assert torch.equal(batch[1], torch.Tensor([60, 61, 62, 63])) assert torch.equal(batch[2], torch.ones(4, dtype=torch.float64)) assert idx == 10 +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("num_workers", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -720,10 +793,7 @@ def test_iter_multi_partition_cross( @patch.object( OnlineDataset, "_get_data_from_storage", - side_effect=[ - ([x.to_bytes(2, "big") for x in range(4)], [1] * 4), - ([x.to_bytes(2, "big") for x in range(4)], [1] * 4), - ], + side_effect=iter_multi_partition_data_side_effect, ) @patch.object( SelectorKeySource, @@ -738,6 +808,9 @@ def test_iter_multi_partition_multi_workers( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + num_workers, + parallel_prefetch_requests, ): if platform.system() == "Darwin": # On macOS, spawn is the default, which loses the mocks @@ -753,19 +826,27 @@ def test_iter_multi_partition_multi_workers( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) - dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=4, num_workers=4) + dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=4, num_workers=num_workers) idx = 0 for idx, batch in enumerate(dataloader): assert len(batch) == 3 assert torch.equal(batch[0], torch.Tensor([0, 1, 2, 3])) assert torch.equal(batch[1], torch.Tensor([0, 1, 2, 3])) assert torch.equal(batch[2], torch.ones(4, dtype=int)) - assert idx == 7 + if num_workers % 2 == 0: + # only test this for even number of workers to avoid fractions + # each worker gets 8 items from get_keys_and_weights; batch size 4; minus one for zero indexing + assert idx == ((max(num_workers, 1) * 8) / 4) - 1 + +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 5, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -775,7 +856,9 @@ def test_iter_multi_partition_multi_workers( @patch("modyn.trainer_server.internal.dataset.online_dataset.grpc_connection_established", return_value=True) @patch.object(grpc, "insecure_channel", return_value=None) @patch.object( - OnlineDataset, "_get_data_from_storage", return_value=([x.to_bytes(2, "big") for x in 
range(100)], [1] * 100) + OnlineDataset, + "_get_data_from_storage", + return_value=iter([(list(range(100)), [x.to_bytes(2, "big") for x in range(100)], [1] * 100, 0)]), ) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(100)), None)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) @@ -786,6 +869,8 @@ def test_multi_epoch_dataloader_dataset( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selecotr, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = OnlineDataset( pipeline_id=1, @@ -796,6 +881,8 @@ def test_multi_epoch_dataloader_dataset( storage_address="localhost:1234", selector_address="localhost:1234", training_id=42, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, log_path=None, ) diff --git a/modyn/tests/trainer_server/internal/data/test_per_class_online_dataset.py b/modyn/tests/trainer_server/internal/data/test_per_class_online_dataset.py index 5f554fdd3..764a7f85e 100644 --- a/modyn/tests/trainer_server/internal/data/test_per_class_online_dataset.py +++ b/modyn/tests/trainer_server/internal/data/test_per_class_online_dataset.py @@ -3,6 +3,7 @@ from unittest.mock import patch import grpc +import pytest import torch from modyn.selector.internal.grpc.generated.selector_pb2 import SamplesResponse, UsesWeightsResponse from modyn.storage.internal.grpc.generated.storage_pb2 import GetResponse @@ -34,6 +35,8 @@ def Get(self, request): # pylint: disable=invalid-name ) +@pytest.mark.parametrize("parallel_prefetch_requests", [1, 2, 5, 7, 8, 9, 10, 100, 999999]) +@pytest.mark.parametrize("prefetched_partitions", [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999999]) @patch("modyn.trainer_server.internal.dataset.key_sources.selector_key_source.SelectorStub", MockSelectorStub) @patch("modyn.trainer_server.internal.dataset.online_dataset.StorageStub", MockStorageStub) @patch( @@ -45,7 +48,7 @@ def Get(self, request): # pylint: disable=invalid-name @patch.object( PerClassOnlineDataset, "_get_data_from_storage", - return_value=([x.to_bytes(2, "big") for x in range(16)], [0, 1, 2, 3, 0, 0, 0, 1] * 2), + return_value=[(list(range(16)), [x.to_bytes(2, "big") for x in range(16)], [0, 1, 2, 3, 0, 0, 0, 1] * 2, 0)], ) @patch.object(SelectorKeySource, "get_keys_and_weights", return_value=(list(range(16)), None)) @patch.object(SelectorKeySource, "get_num_data_partitions", return_value=1) @@ -56,6 +59,8 @@ def test_dataloader_dataset( test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector, + prefetched_partitions, + parallel_prefetch_requests, ): online_dataset = PerClassOnlineDataset( pipeline_id=1, @@ -67,6 +72,8 @@ def test_dataloader_dataset( selector_address="localhost:1234", training_id=42, initial_filtered_label=0, + num_prefetched_partitions=prefetched_partitions, + parallel_prefetch_requests=parallel_prefetch_requests, tokenizer=None, ) dataloader = torch.utils.data.DataLoader(online_dataset, batch_size=4) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/deepcore_comparison_tests_utils.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/deepcore_comparison_tests_utils.py new file mode 100644 index 000000000..fe905d5f0 --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/deepcore_comparison_tests_utils.py @@ -0,0 +1,27 @@ +import numpy as np +import torch +from 
modyn.models.coreset_methods_support import CoresetSupportingModule +from torch import nn + + +class DummyModel(CoresetSupportingModule): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hidden_layer = nn.Linear(in_features=1, out_features=10) + self.output_layer = nn.Linear(in_features=10, out_features=1) + + def forward(self, input_tensor): + input_tensor = torch.relu(self.hidden_layer(input_tensor)) + input_tensor = self.embedding_recorder(input_tensor) + outputs = self.output_layer(input_tensor) + return outputs + + def get_last_layer(self): + return self.output_layer + + +def assert_close_matrices(matrix1, matrix2): + for row1, row2 in zip(matrix1, matrix2): + assert len(row1) == len(row2) + for el1, el2 in zip(row1, row2): + assert np.isclose(el1, el2, 1e-2) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py new file mode 100644 index 000000000..e2007974b --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_matrix_downsampling_strategy.py @@ -0,0 +1,123 @@ +# pylint: disable=abstract-class-instantiated,unused-argument +from unittest.mock import patch + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_matrix_downsampling_strategy import ( + AbstractMatrixDownsamplingStrategy, + MatrixContent, +) + + +def get_sampler_config(balance=False): + downsampling_ratio = 50 + per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + params_from_selector = { + "downsampling_ratio": downsampling_ratio, + "sample_then_batch": False, + "args": {}, + "balance": balance, + } + return 0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu" + + +@patch.multiple(AbstractMatrixDownsamplingStrategy, __abstractmethods__=set()) +def test_init(): + amds = AbstractMatrixDownsamplingStrategy(*get_sampler_config()) + + assert amds.requires_coreset_supporting_module + assert not amds.matrix_elements + assert amds.matrix_content is None + + +@patch.multiple(AbstractMatrixDownsamplingStrategy, __abstractmethods__=set()) +def test_collect_embeddings(): + amds = AbstractMatrixDownsamplingStrategy(*get_sampler_config()) + + amds.matrix_content = MatrixContent.EMBEDDINGS + + assert amds.requires_coreset_supporting_module + assert not amds.matrix_elements # thank you pylint! 
i.e. amds.matrix_elements == []
+
+    first_embedding = torch.randn((4, 5))
+    second_embedding = torch.randn((3, 5))
+    amds.inform_samples([1, 2, 3, 4], None, None, first_embedding)
+    amds.inform_samples([21, 31, 41], None, None, second_embedding)
+
+    assert np.concatenate(amds.matrix_elements).shape == (7, 5)
+    assert all(torch.equal(el1, el2) for el1, el2 in zip(amds.matrix_elements, [first_embedding, second_embedding]))
+    assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]
+
+    third_embedding = torch.randn((23, 5))
+    amds.inform_samples(list(range(1000, 1023)), None, None, third_embedding)
+
+    assert np.concatenate(amds.matrix_elements).shape == (30, 5)
+    assert all(
+        torch.equal(el1, el2)
+        for el1, el2 in zip(amds.matrix_elements, [first_embedding, second_embedding, third_embedding])
+    )
+    assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41] + list(range(1000, 1023))
+
+
+@patch.multiple(AbstractMatrixDownsamplingStrategy, __abstractmethods__=set())
+@patch.object(
+    AbstractMatrixDownsamplingStrategy, "_select_indexes_from_matrix", return_value=([0, 2], torch.Tensor([1.0, 3.0]))
+)
+def test_collect_embedding_balance(test_amds):
+    amds = AbstractMatrixDownsamplingStrategy(*get_sampler_config(True))
+
+    amds.matrix_content = MatrixContent.EMBEDDINGS
+
+    assert amds.requires_coreset_supporting_module
+    assert amds.requires_data_label_by_label
+    assert not amds.matrix_elements  # thank you pylint! i.e. amds.matrix_elements == []
+
+    first_embedding = torch.randn((4, 5))
+    second_embedding = torch.randn((3, 5))
+    amds.inform_samples([1, 2, 3, 4], None, None, first_embedding)
+    amds.inform_samples([21, 31, 41], None, None, second_embedding)
+
+    assert np.concatenate(amds.matrix_elements).shape == (7, 5)
+    assert all(torch.equal(el1, el2) for el1, el2 in zip(amds.matrix_elements, [first_embedding, second_embedding]))
+    assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]
+
+    amds.inform_end_of_current_label()
+
+    third_embedding = torch.randn((23, 5))
+    assert len(amds.matrix_elements) == 0
+    amds.inform_samples(list(range(1000, 1023)), None, None, third_embedding)
+
+    assert np.concatenate(amds.matrix_elements).shape == (23, 5)
+    assert all(torch.equal(el1, el2) for el1, el2 in zip(amds.matrix_elements, [third_embedding]))
+    assert amds.index_sampleid_map == list(range(1000, 1023))
+    assert amds.already_selected_samples == [1, 3]
+    amds.inform_end_of_current_label()
+    assert amds.already_selected_samples == [1, 3, 1000, 1002]
+
+
+@patch.multiple(AbstractMatrixDownsamplingStrategy, __abstractmethods__=set())
+def test_collect_gradients():
+    amds = AbstractMatrixDownsamplingStrategy(*get_sampler_config())
+    amds.matrix_content = MatrixContent.GRADIENTS
+
+    first_output = torch.randn((4, 2))
+    first_output.requires_grad = True
+    first_target = torch.tensor([1, 1, 1, 0])
+    first_embedding = torch.randn((4, 5))
+    amds.inform_samples([1, 2, 3, 4], first_output, first_target, first_embedding)
+
+    second_output = torch.randn((3, 2))
+    second_output.requires_grad = True
+    second_target = torch.tensor([0, 1, 0])
+    second_embedding = torch.randn((3, 5))
+    amds.inform_samples([21, 31, 41], second_output, second_target, second_embedding)
+
+    assert len(amds.matrix_elements) == 2
+
+    # expected shape = (a, b)
+    # a = 7 (4 samples in the first batch and 3 samples in the second batch)
+    # b = 5 * 2 + 2, where 5 is the input dimension of the last layer and 2 is its
+    # output dimension (weight gradients plus bias gradients)
+    assert np.concatenate(amds.matrix_elements).shape == (7, 12)
+
+    assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]
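+
+
+# Worked example for the shape arithmetic in test_collect_gradients, written as a
+# sketch (assumption: each matrix row is the flattened last-layer gradient, i.e.
+# one weight gradient per (input, output) pair plus one bias gradient per output):
+def expected_gradient_matrix_width(last_layer_in_features: int, last_layer_out_features: int) -> int:
+    return last_layer_in_features * last_layer_out_features + last_layer_out_features
+
+
+# For the setup above, expected_gradient_matrix_width(5, 2) == 12, matching the
+# (7, 12) shape asserted in test_collect_gradients.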
diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py index ac357aae8..7309bc7be 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_abstract_remote_downsampling_strategy.py @@ -11,7 +11,7 @@ def test_batch_then_sample_general(): downsampling_ratio = 50 params_from_selector = {"downsampling_ratio": downsampling_ratio} - sampler = AbstractRemoteDownsamplingStrategy(154, 128, 64, params_from_selector) + sampler = AbstractRemoteDownsamplingStrategy(154, 128, 64, params_from_selector, "cpu") assert hasattr(sampler, "downsampling_ratio") assert sampler.downsampling_ratio == 50 diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py new file mode 100644 index 000000000..c6f0465a2 --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_craig_remote_downsampling.py @@ -0,0 +1,476 @@ +# pylint: disable=too-many-locals +import numpy as np +import torch +from modyn.tests.trainer_server.internal.trainer.remote_downsamplers.deepcore_comparison_tests_utils import ( + DummyModel, + assert_close_matrices, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers import RemoteCraigDownsamplingStrategy +from torch.nn import BCEWithLogitsLoss + + +def get_sampler_config(balance=False): + downsampling_ratio = 50 + per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + params_from_selector = { + "downsampling_ratio": downsampling_ratio, + "sample_then_batch": False, + "balance": balance, + "selection_batch": 64, + "greedy": "NaiveGreedy", + } + return 0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu" + + +def test_inform_samples(): + sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config()) + # Test data + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) # 3 target labels + embedding = torch.randn(3, 10) # 3 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + expected_shape = (1, 3, forward_output.shape[1] * (1 + embedding.shape[1])) + assert len(sampler.current_class_gradients) == 1 + assert np.array(sampler.current_class_gradients).shape == expected_shape + + +# Dummy distance matrix for testing +initial_matrix = np.array([[0, 1], [1, 0]]) + + +def test_add_to_distance_matrix_single_submatrix(): + sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config()) + submatrix = np.array([[2]]) + sampler.add_to_distance_matrix(initial_matrix) + sampler.add_to_distance_matrix(submatrix) + expected_result = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 2]]) + assert np.array_equal(sampler.distance_matrix, expected_result) + + +def test_add_to_distance_matrix_multiple_submatrix(): + sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config()) + sampler.add_to_distance_matrix(initial_matrix) + submatrix = np.array([[3, 4], [4, 3]]) + sampler.add_to_distance_matrix(submatrix) + expected_result = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 3, 4], [0, 0, 4, 3]]) + assert 
np.array_equal(sampler.distance_matrix, expected_result)
+
+
+def test_add_to_distance_matrix_large_submatrix():
+    sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config())
+    sampler.add_to_distance_matrix(initial_matrix)
+    submatrix = np.array([[5, 6, 7], [6, 5, 7], [7, 7, 5]])
+    sampler.add_to_distance_matrix(submatrix)
+    sampler.add_to_distance_matrix(np.array([[0, 0], [0, 0]]))
+    expected_result = np.array(
+        [
+            [0, 1, 0, 0, 0, 0, 0],
+            [1, 0, 0, 0, 0, 0, 0],
+            [0, 0, 5, 6, 7, 0, 0],
+            [0, 0, 6, 5, 7, 0, 0],
+            [0, 0, 7, 7, 5, 0, 0],
+            [0, 0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0, 0],
+        ]
+    )
+    assert np.array_equal(sampler.distance_matrix, expected_result)
+
+
+def test_inform_end_of_current_label_and_select():
+    sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config())
+    sample_ids = [1, 2, 3]
+    forward_output = torch.randn(3, 5)  # 3 samples, 5 output classes
+    forward_output.requires_grad = True
+    target = torch.tensor([1, 1, 1])
+    embedding = torch.randn(3, 10)  # 3 samples, embedding dimension 10
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (0, 0)
+    sampler.inform_end_of_current_label()
+    assert sampler.distance_matrix.shape == (3, 3)
+    assert len(sampler.current_class_gradients) == 0
+
+    sample_ids = [10, 11, 12, 13]
+    forward_output = torch.randn(4, 5)  # 4 samples, 5 output classes
+    forward_output.requires_grad = True
+    target = torch.tensor([0, 0, 0, 0])  # 4 target labels
+    embedding = torch.randn(4, 10)  # 4 samples, embedding dimension 10
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (3, 3)
+    sampler.inform_end_of_current_label()
+    assert sampler.distance_matrix.shape == (7, 7)
+    assert len(sampler.current_class_gradients) == 0
+    assert sampler.index_sampleid_map == [1, 2, 3, 10, 11, 12, 13]
+
+    selected_points, selected_weights = sampler.select_points()
+
+    assert len(selected_points) == 3
+    assert len(selected_weights) == 3
+    assert all(weight > 0 for weight in selected_weights)
+    assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points)
+
+
+def test_inform_end_of_current_label_and_select_balanced():
+    sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config(True))
+    sample_ids = [1, 2, 3, 4]
+    forward_output = torch.randn(4, 5)
+    forward_output.requires_grad = True
+    target = torch.tensor([1, 1, 1, 1])
+    embedding = torch.randn(4, 10)
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (0, 0)
+    sampler.inform_end_of_current_label()
+    assert len(sampler.already_selected_samples) == 2
+    assert len(sampler.already_selected_weights) == 2
+    assert sampler.distance_matrix.shape == (0, 0)
+    assert len(sampler.current_class_gradients) == 0
+
+    sample_ids = [10, 11, 12, 13, 14, 15]
+    forward_output = torch.randn(6, 5)  # 6 samples, 5 output classes
+    forward_output.requires_grad = True
+    target = torch.tensor([0, 0, 0, 0, 0, 0])  # 6 target labels
+    embedding = torch.randn(6, 10)  # 6 samples, embedding dimension 10
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (0, 0)
+    sampler.inform_end_of_current_label()
+    assert len(sampler.already_selected_samples) == 5
+    assert len(sampler.already_selected_weights) == 5
+    assert sampler.distance_matrix.shape == (0, 0)
+    assert len(sampler.current_class_gradients) == 0
+
+    selected_points, selected_weights = sampler.select_points()
+
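+    # With balance=True, 50% of each class is chosen when its label ends:
+    # 2 of the 4 class-1 samples and 3 of the 6 class-0 samples, i.e. 5 points total.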
assert len(selected_points) == 5 + assert len(selected_weights) == 5 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 4, 10, 11, 12, 13, 14, 15] for id in selected_points) + assert sum(id in [1, 2, 3, 4] for id in selected_points) == 2 + assert sum(id in [10, 11, 12, 13, 14, 15] for id in selected_points) == 3 + + +def test_bts(): + sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config()) + sample_ids = [1, 2, 3, 10, 11, 12, 13] + forward_output = torch.randn(7, 5) # 7 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 0, 0, 0, 1]) + embedding = torch.randn(7, 10) # 7 samples, embedding dimension 10 + + assert sampler.distance_matrix.shape == (0, 0) + sampler.inform_samples(sample_ids, forward_output, target, embedding) + sampler.inform_end_of_current_label() + assert sampler.distance_matrix.shape == (7, 7) + assert len(sampler.current_class_gradients) == 0 + + assert sampler.index_sampleid_map == [10, 11, 12, 1, 2, 3, 13] + + selected_points, selected_weights = sampler.select_points() + + assert len(selected_points) == 3 + assert len(selected_weights) == 3 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points) + + +def test_bts_equals_stb(): + # data + sample_ids = [1, 2, 3, 10, 11, 12, 13] + forward_output = torch.randn(7, 5) # 7 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 0, 0, 0, 1]) + embedding = torch.randn(7, 10) # 7 samples, embedding dimension 10 + + # BTS, all in one call + bts_sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config()) + bts_sampler.inform_samples(sample_ids, forward_output, target, embedding) + + bts_selected_points, bts_selected_weights = bts_sampler.select_points() + + # STB, first class 0 and then class 1 + class0 = target == 0 + class1 = target == 1 + stb_sampler = RemoteCraigDownsamplingStrategy(*get_sampler_config()) + stb_sampler.inform_samples( + [sample_ids[i] for i, keep in enumerate(class0) if keep], + forward_output[class0], + target[class0], + embedding[class0], + ) + stb_sampler.inform_end_of_current_label() + stb_sampler.inform_samples( + [sample_ids[i] for i, keep in enumerate(class1) if keep], + forward_output[class1], + target[class1], + embedding[class1], + ) + stb_selected_points, stb_selected_weights = stb_sampler.select_points() + + assert bts_sampler.index_sampleid_map == stb_sampler.index_sampleid_map == [10, 11, 12, 1, 2, 3, 13] + assert bts_sampler.index_sampleid_map == stb_sampler.index_sampleid_map + assert stb_selected_points == bts_selected_points + assert torch.equal(stb_selected_weights, bts_selected_weights) + + +def test_matching_results_with_deepcore(): + # RESULTS OBTAINED USING DEEPCORE IN THE SAME SETTING + expected_distance_matrix = [ + [0.23141611747646584, 0.0010000000000000009, 0.08913177049160004, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0010000000000000009, 0.23141611747646584, 0.13688503003120422, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.08913177049160004, 0.13688503003120422, 0.23141611747646584, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [ + 0.0, + 0.0, + 0.0, + 0.07399794256687164, + 0.013805931270122529, + 0.04436375929415226, + 0.0010000000000000009, + 0.047306565403938294, + 0.07344558201637119, + 0.037069154739379884, + ], + [ + 0.0, + 0.0, + 0.0, + 0.013805931270122529, + 0.07405797772312417, + 0.042330792009830476, + 0.06103372650593519, + 0.03912345489859581, + 0.014334088027477265, + 
0.050075888469815255, + ], + [ + 0.0, + 0.0, + 0.0, + 0.04436375929415226, + 0.042330792009830476, + 0.07399794256687164, + 0.029320956975221635, + 0.07084722916968167, + 0.044916815891861916, + 0.06630006742104888, + ], + [ + 0.0, + 0.0, + 0.0, + 0.0010000000000000009, + 0.06103372650593519, + 0.029320956975221635, + 0.07397266097884858, + 0.026116726756095887, + 0.0015217790603637704, + 0.03705782613158226, + ], + [ + 0.0, + 0.0, + 0.0, + 0.047306565403938294, + 0.03912345489859581, + 0.07084722916968167, + 0.026116726756095887, + 0.07405797772312417, + 0.04786216451227665, + 0.0630889374986291, + ], + [ + 0.0, + 0.0, + 0.0, + 0.07344558201637119, + 0.014334088027477265, + 0.044916815891861916, + 0.0015217790603637704, + 0.04786216451227665, + 0.07399794256687164, + 0.03761516308784485, + ], + [ + 0.0, + 0.0, + 0.0, + 0.037069154739379884, + 0.050075888469815255, + 0.06630006742104888, + 0.03705782613158226, + 0.0630889374986291, + 0.03761516308784485, + 0.07405797772312417, + ], + ] + + selected_samples_deepcore = [2, 5] + selected_weights_deepcore = [4, 8] + + torch.manual_seed(42) + dummy_model = DummyModel() + np.random.seed(42) + samples = torch.rand(10, 1) + targets = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + + sampler = RemoteCraigDownsamplingStrategy( + 0, + 0, + 5, + {"downsampling_ratio": 20, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + BCEWithLogitsLoss(reduction="none"), + "cpu", + ) + sample_ids = [0, 1, 2] + dummy_model.embedding_recorder.start_recording() + forward_output = dummy_model(samples[0:3]).float() + target = torch.tensor(targets[0:3]).unsqueeze(dim=1).float() + embedding = dummy_model.embedding + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert sampler.distance_matrix.shape == (0, 0) + sampler.inform_end_of_current_label() + assert sampler.distance_matrix.shape == (3, 3) + assert len(sampler.current_class_gradients) == 0 + + sample_ids = [3, 4, 5, 6, 7, 8, 9] + forward_output = dummy_model(samples[3:]).float() + target = torch.tensor(targets[3:]).unsqueeze(dim=1).float() + embedding = dummy_model.embedding + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert sampler.distance_matrix.shape == (3, 3) + sampler.inform_end_of_current_label() + assert sampler.distance_matrix.shape == (10, 10) + assert len(sampler.current_class_gradients) == 0 + assert sampler.index_sampleid_map == list(range(10)) + + selected_samples, selected_weights = sampler.select_points() + + assert len(selected_samples) == 2 + assert len(selected_weights) == 2 + assert_close_matrices(expected_distance_matrix, sampler.distance_matrix.tolist()) + assert selected_samples_deepcore == selected_samples + assert selected_weights_deepcore == selected_weights.tolist() + + +def test_matching_results_with_deepcore_permutation(): + selected_samples_deepcore = [2, 1, 5] + selected_weights_deepcore = [4, 3, 6] + + torch.manual_seed(42) + dummy_model = DummyModel() + np.random.seed(42) + samples = torch.rand(10, 1) + targets = torch.tensor([1, 1, 0, 0, 0, 1, 1, 1, 1, 1]) + + sampler = RemoteCraigDownsamplingStrategy( + 0, + 0, + 5, + {"downsampling_ratio": 30, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"}, + BCEWithLogitsLoss(reduction="none"), + "cpu", + ) + sample_ids = [2, 3, 4] + dummy_model.embedding_recorder.start_recording() + forward_output = dummy_model(samples[targets == 0]).float() + target = torch.tensor(targets[targets == 0]).unsqueeze(dim=1).float() + embedding = dummy_model.embedding 
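+    # The embedding recorder was started above, so dummy_model.embedding holds the
+    # activations that entered the last layer during the preceding forward pass.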
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (0, 0)
+    sampler.inform_end_of_current_label()
+    assert sampler.distance_matrix.shape == (3, 3)
+    assert len(sampler.current_class_gradients) == 0
+
+    sample_ids = [0, 1, 5, 6, 7, 8, 9]
+    forward_output = dummy_model(samples[targets == 1]).float()
+    target = torch.tensor(targets[targets == 1]).unsqueeze(dim=1).float()
+    embedding = dummy_model.embedding
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (3, 3)
+    sampler.inform_end_of_current_label()
+    assert sampler.distance_matrix.shape == (10, 10)
+    assert len(sampler.current_class_gradients) == 0
+
+    selected_samples, selected_weights = sampler.select_points()
+
+    assert len(selected_samples) == 3
+    assert len(selected_weights) == 3
+    assert selected_samples_deepcore == selected_samples
+    assert selected_weights_deepcore == selected_weights.tolist()
+
+
+def test_matching_results_with_deepcore_permutation_fancy_ids():
+    index_mapping = [45, 56, 98, 34, 781, 12, 432, 422, 5, 10]
+    selected_indices_deepcore = [2, 3, 4, 1, 9]
+    selected_samples_deepcore = [index_mapping[i] for i in selected_indices_deepcore]
+    # This test is a bit flaky, probably due to numerical issues: sometimes index 5 is selected instead of 1.
+    selected_indices_deepcore2 = [2, 3, 4, 5, 9]
+    selected_samples_deepcore2 = [index_mapping[i] for i in selected_indices_deepcore2]
+    selected_weights_deepcore = [2, 2, 2, 3, 6]
+
+    torch.manual_seed(2)
+    dummy_model = DummyModel()
+    np.random.seed(3)
+    samples = torch.rand(10, 1)
+    targets = torch.tensor([1, 1, 0, 0, 0, 1, 1, 1, 1, 1])
+
+    sampler = RemoteCraigDownsamplingStrategy(
+        0,
+        0,
+        5,
+        {"downsampling_ratio": 50, "balance": False, "selection_batch": 64, "greedy": "NaiveGreedy"},
+        BCEWithLogitsLoss(reduction="none"),
+        "cpu",
+    )
+    sample_ids = [index_mapping[i] for i in [2, 3, 4]]
+    dummy_model.embedding_recorder.start_recording()
+    forward_output = dummy_model(samples[targets == 0]).float()
+    target = torch.tensor(targets[targets == 0]).unsqueeze(dim=1).float()
+    embedding = dummy_model.embedding
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (0, 0)
+    sampler.inform_end_of_current_label()
+    assert sampler.distance_matrix.shape == (3, 3)
+    assert len(sampler.current_class_gradients) == 0
+
+    sample_ids = [index_mapping[i] for i in [0, 1, 5, 6, 7, 8, 9]]
+    forward_output = dummy_model(samples[targets == 1]).float()
+    target = torch.tensor(targets[targets == 1]).unsqueeze(dim=1).float()
+    embedding = dummy_model.embedding
+
+    sampler.inform_samples(sample_ids, forward_output, target, embedding)
+
+    assert sampler.distance_matrix.shape == (3, 3)
+    sampler.inform_end_of_current_label()
+    assert sampler.distance_matrix.shape == (10, 10)
+    assert len(sampler.current_class_gradients) == 0
+
+    selected_samples, selected_weights = sampler.select_points()
+
+    assert len(selected_samples) == 5
+    assert len(selected_weights) == 5
+
+    # Allow for flakiness by accepting either option
+    assert selected_samples in (selected_samples_deepcore, selected_samples_deepcore2)
+    assert selected_weights_deepcore == selected_weights.tolist()
diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py
new file mode 100644
index 000000000..ee95e9e5a --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradmatch_downsampling_strategy.py @@ -0,0 +1,247 @@ +# pylint: disable=too-many-locals + +import numpy as np +import torch +from modyn.tests.trainer_server.internal.trainer.remote_downsamplers.deepcore_comparison_tests_utils import DummyModel +from modyn.trainer_server.internal.trainer.remote_downsamplers.remote_grad_match_downsampling_strategy import ( + RemoteGradMatchDownsamplingStrategy, +) +from torch.nn import BCEWithLogitsLoss + + +def get_sampler_config(balance=False): + downsampling_ratio = 50 + per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + params_from_selector = { + "downsampling_ratio": downsampling_ratio, + "sample_then_batch": False, + "args": {}, + "balance": balance, + } + return 0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu" + + +def test_select(): + sampler = RemoteGradMatchDownsamplingStrategy(*get_sampler_config()) + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) + embedding = torch.randn(3, 10) + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (3, 55) + + sample_ids = [10, 11, 12, 13] + forward_output = torch.randn(4, 5) # 4 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 1]) # 4 target labels + embedding = torch.randn(4, 10) # 4 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 2 + assert sampler.matrix_elements[0].shape == (3, 55) + assert sampler.matrix_elements[1].shape == (4, 55) + assert sampler.index_sampleid_map == [1, 2, 3, 10, 11, 12, 13] + + selected_points, selected_weights = sampler.select_points() + + assert len(selected_points) == 3 + assert len(selected_weights) == 3 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points) + + +def test_select_balanced(): + sampler = RemoteGradMatchDownsamplingStrategy(*get_sampler_config(True)) + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) + embedding = torch.randn(3, 10) + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (3, 55) + + sampler.inform_end_of_current_label() + assert len(sampler.matrix_elements) == 0 + assert len(sampler.already_selected_samples) == 1 + assert len(sampler.already_selected_weights) == 1 + + sample_ids = [10, 11, 12, 13] + forward_output = torch.randn(4, 5) # 4 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 1]) # 4 target labels + embedding = torch.randn(4, 10) # 4 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (4, 55) + assert sampler.index_sampleid_map == [10, 11, 12, 13] + + sampler.inform_end_of_current_label() + assert len(sampler.matrix_elements) == 0 + assert len(sampler.already_selected_samples) == 3 + assert len(sampler.already_selected_weights) == 3 + + 
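+    # balance=True accumulated 1 point from the first label (50% of 3) and 2 from
+    # the second (50% of 4), so select_points() returns those 3 points directly.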
+    selected_points, selected_weights = sampler.select_points()
+
+    assert len(selected_points) == 3
+    assert len(selected_weights) == 3
+    assert all(weight > 0 for weight in selected_weights)
+    assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points)
+
+
+def test_matching_results_with_deepcore():
+    # RESULTS OBTAINED USING DEEPCORE IN THE SAME SETTING (list[i] = result when
+    # selecting i samples; the entry for 0 is None since selecting 0 samples is
+    # meaningless for GradMatch).
+    selected_samples_deepcore = [
+        None,
+        [7],
+        [6, 7],
+        [2, 6, 7],
+        [2, 3, 6, 7],
+        [0, 2, 3, 6, 7],
+        [0, 1, 2, 3, 6, 7],
+        [0, 1, 2, 3, 6, 7],
+        [0, 1, 2, 3, 6, 7],
+        [0, 1, 2, 3, 6, 7],
+    ]
+    selected_weights_deepcore = [
+        None,
+        [1.0],
+        [3.407759504625574e-05, 5.847577631357126e-05],
+        [5.103206058265641e-05, 3.4257751394761726e-05, 5.825896005262621e-05],
+        [5.083320866106078e-05, 4.821651236852631e-05, 3.4427150239935145e-05, 5.805644832435064e-05],
+        [
+            1.5012016774562653e-05,
+            5.077691821497865e-05,
+            4.8160465667024255e-05,
+            3.447928975219838e-05,
+            5.79994848521892e-05,
+        ],
+        [
+            1.5032787814561743e-05,
+            5.841296115249861e-06,
+            5.079848415334709e-05,
+            4.818197339773178e-05,
+            3.4458098525647074e-05,
+            5.802121086162515e-05,
+        ],
+        [
+            1.5071565940161236e-05,
+            5.801955467177322e-06,
+            5.083948781248182e-05,
+            4.822281334782019e-05,
+            3.441966327955015e-05,
+            5.806266199215315e-05,
+        ],
+        [
+            1.4997756807133555e-05,
+            5.878812771697994e-06,
+            5.076439629192464e-05,
+            4.8147816414712e-05,
+            3.449815994827077e-05,
+            5.798730853712186e-05,
+        ],
+        [
+            1.5097687537490856e-05,
+            5.778654667665251e-06,
+            5.08721532241907e-05,
+            4.825499854632653e-05,
+            3.440177897573449e-05,
+            5.809665162814781e-05,
+        ],
+    ]
+
+    torch.manual_seed(23)
+    dummy_model = DummyModel()
+    samples = torch.rand(10, 1)
+    target = torch.tensor([0, 1, 0, 0, 0, 1, 1, 0, 1, 1]).unsqueeze(1).float()
+    sample_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    dummy_model.embedding_recorder.start_recording()
+    forward_output = dummy_model(samples).float()
+    embedding = dummy_model.embedding
+
+    for num_of_target_samples in range(1, 10):
+        np.random.seed(42)
+
+        sampler = RemoteGradMatchDownsamplingStrategy(
+            0,
+            0,
+            5,
+            {"downsampling_ratio": 10 * num_of_target_samples, "balance": False},
+            BCEWithLogitsLoss(reduction="none"),
+            "cpu",
+        )
+        sampler.inform_samples(sample_ids, forward_output, target, embedding)
+        assert sampler.index_sampleid_map == list(range(10))
+        selected_samples, selected_weights = sampler.select_points()
+        assert len(selected_samples) == len(selected_weights)
+
+        # sort the results
+        combined = list(zip(selected_samples, selected_weights))
+        combined.sort(key=lambda x: x[0])
+        selected_samples_sorted, selected_weights_sorted = zip(*combined)
+
+        # sort the expected deepcore results
+        combined = list(
+            zip(selected_samples_deepcore[num_of_target_samples], selected_weights_deepcore[num_of_target_samples])
+        )
+        combined.sort(key=lambda x: x[0])
+        selected_samples_sorted_deepcore, selected_weights_sorted_deepcore = zip(*combined)
+
+        assert selected_samples_sorted_deepcore == selected_samples_sorted
+        assert all(
+            np.isclose(expected, computed)
+            for expected, computed in zip(selected_weights_sorted_deepcore, selected_weights_sorted)
+        )
+
+
+def test_matching_results_with_deepcore_permutation_fancy_ids():
+    index_mapping = [45, 56, 1, 2, 3, 12, 432, 422, 5, 4]
+    selected_indices_deepcore = [2, 3, 4, 9]
+    selected_samples_deepcore = [index_mapping[i] for i in selected_indices_deepcore]
+    selected_weights_deepcore = [
0.0004691047070082277, + 0.0004625729052349925, + 0.0005646746722050011, + 0.0005694780265912414, + ] + + torch.manual_seed(467) + dummy_model = DummyModel() + np.random.seed(67) + samples = torch.rand(10, 1) + targets = torch.tensor([1, 1, 0, 0, 0, 1, 1, 1, 0, 0]).float().unsqueeze(1) + + sampler = RemoteGradMatchDownsamplingStrategy( + 0, 0, 5, {"downsampling_ratio": 50, "balance": False}, BCEWithLogitsLoss(reduction="none"), "cpu" + ) + + dummy_model.embedding_recorder.start_recording() + forward_output = dummy_model(samples).float() + embedding = dummy_model.embedding + + sampler.inform_samples(index_mapping, forward_output, targets, embedding) + + selected_samples, selected_weights = sampler.select_points() + + combined = list(zip(selected_samples, selected_weights)) + combined.sort(key=lambda x: x[0]) + selected_samples_sorted, selected_weights_sorted = zip(*combined) + + assert len(selected_samples_sorted) == 4 + assert len(selected_weights_sorted) == 4 + + assert selected_samples_deepcore == list(selected_samples_sorted) + assert all( + np.isclose(expected, computed) for expected, computed in zip(selected_weights_deepcore, selected_weights_sorted) + ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py index 53855b8c0..f7027a33b 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_gradnorm_downsample.py @@ -14,7 +14,7 @@ def test_sample_shape_ce(): per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} - sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") data = torch.randn(8, 10) target = torch.randint(2, size=(8,)) @@ -42,7 +42,7 @@ def test_sample_shape_other_losses(): per_sample_loss_fct = torch.nn.BCEWithLogitsLoss(reduction="none") params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} - sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") data = torch.randn(8, 10) target = torch.randint(2, size=(8,), dtype=torch.float32).unsqueeze(1) @@ -81,13 +81,13 @@ def test_sampling_crossentropy(): } # Here we use autograd since the number of classes is not provided - sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") forward_outputs = model(data) sampler.inform_samples(ids, forward_outputs, target) _, autograd_weights = sampler.select_points() # Here we use the closed form shortcut - sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") sampler.inform_samples(ids, forward_outputs, target) _, closed_form_weights = sampler.select_points() @@ -125,7 +125,7 @@ def test_sample_dict_input(): per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": 
False} - sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteGradNormDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") forward_outputs = model(data) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py new file mode 100644 index 000000000..a4a97b768 --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_kcenter_downsampling_strategy.py @@ -0,0 +1,170 @@ +import numpy as np +import torch +from modyn.tests.trainer_server.internal.trainer.remote_downsamplers.deepcore_comparison_tests_utils import DummyModel +from modyn.trainer_server.internal.trainer.remote_downsamplers.remote_kcenter_greedy_downsampling_strategy import ( + RemoteKcenterGreedyDownsamplingStrategy, +) +from torch.nn import BCEWithLogitsLoss + + +def get_sampler_config(balance=False): + downsampling_ratio = 50 + per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + params_from_selector = { + "downsampling_ratio": downsampling_ratio, + "sample_then_batch": False, + "args": {}, + "balance": balance, + } + return 0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu" + + +def test_select(): + sampler = RemoteKcenterGreedyDownsamplingStrategy(*get_sampler_config()) + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) + embedding = torch.randn(3, 10) # 3 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (3, 10) + + sample_ids = [10, 11, 12, 13] + forward_output = torch.randn(4, 5) # 4 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 1]) # 4 target labels + embedding = torch.randn(4, 10) # 4 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 2 + assert sampler.matrix_elements[0].shape == (3, 10) + assert sampler.matrix_elements[1].shape == (4, 10) + assert sampler.index_sampleid_map == [1, 2, 3, 10, 11, 12, 13] + + selected_points, selected_weights = sampler.select_points() + + assert len(selected_points) == 3 + assert len(selected_weights) == 3 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points) + + +def test_select_balanced(): + sampler = RemoteKcenterGreedyDownsamplingStrategy(*get_sampler_config(True)) + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) + embedding = torch.randn(3, 10) # 3 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (3, 10) + + sampler.inform_end_of_current_label() + assert len(sampler.already_selected_samples) == 1 + assert len(sampler.already_selected_weights) == 1 + + sample_ids = [10, 11, 12, 13] + forward_output = torch.randn(4, 5) # 4 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 1]) # 4 target labels + embedding 
= torch.randn(4, 10) # 4 samples, embedding dimension 10 + + sampler.inform_samples(sample_ids, forward_output, target, embedding) + + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (4, 10) + assert sampler.index_sampleid_map == [10, 11, 12, 13] + sampler.inform_end_of_current_label() + assert len(sampler.already_selected_samples) == 3 + assert len(sampler.already_selected_weights) == 3 + + selected_points, selected_weights = sampler.select_points() + + assert len(selected_points) == 3 + assert len(selected_weights) == 3 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points) + assert sum(id in [1, 2, 3] for id in selected_points) == 1 + assert sum(id in [10, 11, 12, 13] for id in selected_points) == 2 + + +def test_matching_results_with_deepcore(): + # RESULTS OBTAINED USING DEEPCORE IN THE SAME SETTING (list[i]= result selecting i samples, + # None when kcenter is meaningless, so when 0 or 1 samples are selected. 1 is meaningless since kcenter always + # starts from a random sample) + selected_samples_deepcore = [ + None, + None, + [0, 6], + [0, 6, 7], + [0, 3, 6, 7], + [0, 3, 4, 6, 7], + [0, 3, 4, 6, 7, 9], + [0, 2, 3, 4, 6, 7, 9], + [0, 1, 2, 3, 4, 6, 7, 9], + [0, 1, 2, 3, 4, 5, 6, 7, 9], + ] + + torch.manual_seed(42) + dummy_model = DummyModel() + samples = torch.rand(10, 1) + target = torch.tensor([1, 1, 0, 0, 0, 1, 1, 1, 1, 1]).float() + sample_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + dummy_model.embedding_recorder.start_recording() + forward_output = dummy_model(samples).float() + embedding = dummy_model.embedding + + for num_of_target_samples in range(2, 9): + # reset np seed each time since kcenter starts from a random sample + np.random.seed(42) + + sampler = RemoteKcenterGreedyDownsamplingStrategy( + 0, + 0, + 5, + {"downsampling_ratio": 10 * num_of_target_samples, "balance": False}, + BCEWithLogitsLoss(reduction="none"), + "cpu", + ) + sampler.inform_samples(sample_ids, forward_output, target, embedding) + assert sampler.index_sampleid_map == list(range(10)) + selected_samples, selected_weights = sampler.select_points() + assert len(selected_samples) == num_of_target_samples + assert len(selected_weights) == num_of_target_samples + assert sorted(selected_samples_deepcore[num_of_target_samples]) == sorted(selected_samples) + + +def test_matching_results_with_deepcore_permutation_fancy_ids(): + index_mapping = [45, 56, 98, 34, 781, 12, 432, 422, 5, 10] + selected_indices_deepcore = [0, 1, 3, 6, 9] + selected_samples_deepcore = [index_mapping[i] for i in selected_indices_deepcore] + + torch.manual_seed(467) + dummy_model = DummyModel() + np.random.seed(67) + samples = torch.rand(10, 1) + targets = torch.tensor([1, 1, 0, 0, 0, 1, 1, 1, 0, 0]).float() + + sampler = RemoteKcenterGreedyDownsamplingStrategy( + 0, 0, 5, {"downsampling_ratio": 50, "balance": False}, BCEWithLogitsLoss(reduction="none"), "cpu" + ) + + dummy_model.embedding_recorder.start_recording() + forward_output = dummy_model(samples).float() + embedding = dummy_model.embedding + + sampler.inform_samples(index_mapping, forward_output, targets, embedding) + + selected_samples, selected_weights = sampler.select_points() + + assert len(selected_samples) == 5 + assert len(selected_weights) == 5 + assert sorted(selected_samples_deepcore) == sorted(selected_samples) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py 
b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py index 6a668d4e1..198c93e43 100644 --- a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_loss_downsample.py @@ -12,7 +12,7 @@ def test_sample_shape(): per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} - sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") data = torch.randn(8, 10) target = torch.randint(2, size=(8,)) @@ -37,7 +37,7 @@ def test_sample_weights(): per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} - sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") data = torch.randn(8, 10) target = torch.randint(2, size=(8,)) @@ -63,7 +63,7 @@ def test_sample_loss_dependent_sampling(): per_sample_loss_fct = torch.nn.MSELoss(reduction="none") params_from_selector = {"downsampling_ratio": downsampling_ratio, "sample_then_batch": False} - sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") # Create a target with two classes, where half have a true label of 0 and half have a true label of 1 target = torch.cat([torch.zeros(4), torch.ones(4)]) @@ -110,7 +110,7 @@ def test_sample_dict_input(): per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") params_from_selector = {"downsampling_ratio": 50, "sample_then_batch": False} - sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct) + sampler = RemoteLossDownsampling(0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu") forward_output = mymodel(data) sampler.inform_samples(sample_ids, forward_output, target) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py new file mode 100644 index 000000000..c6d56d2d1 --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_submodular_downsampling_strategy.py @@ -0,0 +1,175 @@ +import numpy as np +import torch +from modyn.tests.trainer_server.internal.trainer.remote_downsamplers.deepcore_comparison_tests_utils import DummyModel +from modyn.trainer_server.internal.trainer.remote_downsamplers.remote_submodular_downsampling_strategy import ( + RemoteSubmodularDownsamplingStrategy, +) +from torch.nn import BCEWithLogitsLoss + + +def get_sampler_config(submodular: str = "GraphCut", balance=False): + downsampling_ratio = 50 + per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + params_from_selector = { + "downsampling_ratio": downsampling_ratio, + "sample_then_batch": False, + "args": {}, + "submodular_function": submodular, + "balance": balance, + "selection_batch": 64, + } + return 0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu" + + +def test_select_different_submodulars(): + _test_select_subm("FacilityLocation") 
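
The submodular tests in this new file assert a per-sample matrix of shape (3, 55) for 5 output classes and a 10-dimensional embedding. One common way to obtain such a matrix (the DeepCore-style last-layer gradient approximation; the exact construction inside Modyn's matrix downsamplers may differ) is to concatenate the per-sample gradient w.r.t. the last layer's bias with the flattened gradient w.r.t. its weights, i.e. the outer product with the recorded embedding, giving C + C * E = 5 + 5 * 10 = 55 columns. A minimal sketch under that assumption, with all names illustrative:

    import torch
    import torch.nn.functional as F

    def last_layer_gradient_matrix(forward_output: torch.Tensor,
                                   target: torch.Tensor,
                                   embedding: torch.Tensor) -> torch.Tensor:
        # For CrossEntropyLoss, d loss_i / d logits_i = softmax(logits_i) - onehot(y_i)
        probs = F.softmax(forward_output, dim=1)                         # [N, C]
        one_hot = F.one_hot(target, num_classes=probs.shape[1]).float()  # [N, C]
        bias_grad = probs - one_hot                                      # [N, C]
        # Gradient w.r.t. the last layer's weights: per-sample outer product
        # with the embedding, flattened to one row per sample.
        weight_grad = torch.einsum("nc,ne->nce", bias_grad, embedding).flatten(1)  # [N, C * E]
        return torch.cat([bias_grad, weight_grad], dim=1)                # [N, C + C * E]

For the test setup below (3 samples, 5 classes, embedding dimension 10) this yields exactly the asserted (3, 55) shape.
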
+ _test_select_subm("GraphCut") + _test_select_subm("LogDeterminant") + + +def test_select_different_submodulars_balanced(): + _test_select_subm_balance("FacilityLocation") + _test_select_subm_balance("GraphCut") + _test_select_subm_balance("LogDeterminant") + + +def _test_select_subm(submodular, balance=False): + sampler = RemoteSubmodularDownsamplingStrategy(*get_sampler_config(submodular, balance)) + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) + embedding = torch.randn(3, 10) # 3 samples, embedding dimension 10 + sampler.inform_samples(sample_ids, forward_output, target, embedding) + assert len(sampler.matrix_elements) == 1 + # 3 samples of dim 5 * 10 + 5 + assert sampler.matrix_elements[0].shape == (3, 55) + sample_ids = [10, 11, 12, 13] + forward_output = torch.randn(4, 5) # 4 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 1]) # 4 target labels + embedding = torch.randn(4, 10) # 4 samples, embedding dimension 10 + sampler.inform_samples(sample_ids, forward_output, target, embedding) + assert len(sampler.matrix_elements) == 2 + assert sampler.matrix_elements[0].shape == (3, 55) + assert sampler.matrix_elements[1].shape == (4, 55) + assert sampler.index_sampleid_map == [1, 2, 3, 10, 11, 12, 13] + selected_points, selected_weights = sampler.select_points() + assert len(selected_points) == 3 + assert len(selected_weights) == 3 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points) + + +def _test_select_subm_balance(submodular): + sampler = RemoteSubmodularDownsamplingStrategy(*get_sampler_config(submodular, True)) + sample_ids = [1, 2, 3] + forward_output = torch.randn(3, 5) # 3 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1]) + embedding = torch.randn(3, 10) # 3 samples, embedding dimension 10 + sampler.inform_samples(sample_ids, forward_output, target, embedding) + assert len(sampler.matrix_elements) == 1 + # 3 samples of dim 5 * 10 + 5 + assert sampler.matrix_elements[0].shape == (3, 55) + + sampler.inform_end_of_current_label() + assert len(sampler.already_selected_weights) == 1 + assert len(sampler.already_selected_samples) == 1 + assert len(sampler.index_sampleid_map) == 0 + assert len(sampler.matrix_elements) == 0 + + sample_ids = [10, 11, 12, 13] + forward_output = torch.randn(4, 5) # 4 samples, 5 output classes + forward_output.requires_grad = True + target = torch.tensor([1, 1, 1, 1]) # 4 target labels + embedding = torch.randn(4, 10) # 4 samples, embedding dimension 10 + sampler.inform_samples(sample_ids, forward_output, target, embedding) + assert len(sampler.matrix_elements) == 1 + assert sampler.matrix_elements[0].shape == (4, 55) + assert sampler.index_sampleid_map == [10, 11, 12, 13] + + sampler.inform_end_of_current_label() + assert len(sampler.already_selected_weights) == 3 + assert len(sampler.already_selected_samples) == 3 + assert len(sampler.index_sampleid_map) == 0 + assert len(sampler.matrix_elements) == 0 + + selected_points, selected_weights = sampler.select_points() + assert len(selected_points) == 3 + assert len(selected_weights) == 3 + assert all(weight > 0 for weight in selected_weights) + assert all(id in [1, 2, 3, 10, 11, 12, 13] for id in selected_points) + + +def _get_selected_samples(submodular, num_of_target_samples, sample_ids, forward_output, target, embedding): + 
np.random.seed(42) + + sampler = RemoteSubmodularDownsamplingStrategy( + 0, + 0, + 5, + { + "downsampling_ratio": 10 * num_of_target_samples, + "submodular_function": submodular, + "balance": False, + "selection_batch": 64, + }, + BCEWithLogitsLoss(reduction="none"), + "cpu", + ) + sampler.inform_samples(sample_ids, forward_output, target, embedding) + assert sampler.index_sampleid_map == list(range(10)) + selected_samples, selected_weights = sampler.select_points() + assert len(selected_samples) == num_of_target_samples + assert len(selected_weights) == num_of_target_samples + return selected_samples + + +def test_matching_with_deepcore(): + torch.manual_seed(23) + dummy_model = DummyModel() + samples = torch.rand(10, 1) + target = torch.tensor([0, 1, 0, 0, 0, 1, 1, 0, 1, 1]).unsqueeze(1).float() + sample_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + dummy_model.embedding_recorder.start_recording() + forward_output = dummy_model(samples).float() + embedding = dummy_model.embedding + + # Facility Location + expected = [ + None, + [6], + [0, 6], + [0, 6, 9], + [0, 2, 6, 9], + [0, 2, 4, 6, 9], + [0, 2, 4, 6, 8, 9], + [0, 1, 2, 4, 6, 8, 9], + [0, 1, 2, 4, 5, 6, 8, 9], + [0, 1, 2, 4, 5, 6, 7, 8, 9], + ] + for i in range(1, 10): + assert ( + sorted(_get_selected_samples("FacilityLocation", i, sample_ids, forward_output, target, embedding)) + == expected[i] + ) + + # GraphCut + expected = [ + None, + [6], + [0, 6], + [0, 6, 8], + [0, 6, 7, 8], + [0, 5, 6, 7, 8], + [0, 4, 5, 6, 7, 8], + [0, 1, 4, 5, 6, 7, 8], + [0, 1, 3, 4, 5, 6, 7, 8], + [0, 1, 3, 4, 5, 6, 7, 8, 9], + ] + for i in range(1, 10): + assert ( + sorted(_get_selected_samples("GraphCut", i, sample_ids, forward_output, target, embedding)) == expected[i] + ) diff --git a/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py new file mode 100644 index 000000000..980dd5af8 --- /dev/null +++ b/modyn/tests/trainer_server/internal/trainer/remote_downsamplers/test_remote_uncertainty_downsampling_strategy.py @@ -0,0 +1,77 @@ +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.remote_uncertainty_downsampling_strategy import ( + RemoteUncertaintyDownsamplingStrategy, +) + + +def get_sampler_config(balance=False): + downsampling_ratio = 50 + per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + params_from_selector = { + "downsampling_ratio": downsampling_ratio, + "sample_then_batch": False, + "args": {}, + "balance": balance, + "score_metric": "LeastConfidence", + } + return 0, 0, 0, params_from_selector, per_sample_loss_fct, "cpu" + + +def test_init(): + amds = RemoteUncertaintyDownsamplingStrategy(*get_sampler_config()) + + assert not amds.requires_coreset_supporting_module + assert not amds.scores + assert not amds.index_sampleid_map + assert not amds.requires_data_label_by_label + + +def test_collect_scores(): + amds = RemoteUncertaintyDownsamplingStrategy(*get_sampler_config()) + + first_output = torch.randn((4, 5)) + second_output = torch.randn((3, 5)) + amds.inform_samples([1, 2, 3, 4], first_output, None, None) + assert len(amds.scores) == 4 + amds.inform_samples([21, 31, 41], second_output, None, None) + assert len(amds.scores) == 7 + + assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41] + + third_output = torch.randn((23, 5)) + amds.inform_samples(list(range(1000, 1023)), third_output, None, None) + + assert len(amds.scores) 
== 30 + assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41] + list(range(1000, 1023)) + + +def test_collect_embedding_balance(): + amds = RemoteUncertaintyDownsamplingStrategy(*get_sampler_config(True)) + + first_output = torch.randn((4, 5)) + second_output = torch.randn((3, 5)) + amds.inform_samples([1, 2, 3, 4], first_output, None, None) + assert len(amds.scores) == 4 + amds.inform_samples([21, 31, 41], second_output, None, None) + assert len(amds.scores) == 7 + + assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41] + + amds.inform_end_of_current_label() + assert len(amds.already_selected_ids) == 3 + assert len(amds.already_selected_weights) == 3 + assert len(amds.scores) == 0 + assert len(amds.index_sampleid_map) == 0 + + third_output = torch.randn((23, 5)) + amds.inform_samples(list(range(1000, 1023)), third_output, None, None) + + assert len(amds.scores) == 23 + assert amds.index_sampleid_map == list(range(1000, 1023)) + + amds.inform_end_of_current_label() + assert len(amds.already_selected_ids) == 14 + assert len(amds.already_selected_weights) == 14 + assert len(amds.scores) == 0 + assert len(amds.index_sampleid_map) == 0 diff --git a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py index 9117b3f76..d181f0ea1 100644 --- a/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py +++ b/modyn/tests/trainer_server/internal/trainer/test_pytorch_trainer.py @@ -126,6 +126,8 @@ def mock_get_dataloaders( storage_address, selector_address, training_id, + prefetched_partitions, + num_parallel_requests, tokenizer, log_path, ): diff --git a/modyn/tests/utils/test_utils.py b/modyn/tests/utils/test_utils.py index f65327761..8ac6ded13 100644 --- a/modyn/tests/utils/test_utils.py +++ b/modyn/tests/utils/test_utils.py @@ -185,6 +185,7 @@ def test_instantiate_class_existing(): 64, {"downsampling_ratio": 67}, {}, + "cpu", ) assert isinstance(remote_downsampler, RemoteLossDownsampling) assert remote_downsampler.downsampling_ratio == 67 diff --git a/modyn/trainer_server/internal/dataset/data_utils.py b/modyn/trainer_server/internal/dataset/data_utils.py index a10545ed7..3a9a4046b 100644 --- a/modyn/trainer_server/internal/dataset/data_utils.py +++ b/modyn/trainer_server/internal/dataset/data_utils.py @@ -8,6 +8,8 @@ logger = logging.getLogger(__name__) +# pylint: disable=too-many-locals + def prepare_dataloaders( pipeline_id: int, @@ -20,6 +22,8 @@ def prepare_dataloaders( storage_address: str, selector_address: str, training_id: int, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, tokenizer: Optional[str], log_path: Optional[pathlib.Path], ) -> tuple[torch.utils.data.DataLoader, Optional[torch.utils.data.DataLoader]]: @@ -52,6 +56,8 @@ def prepare_dataloaders( storage_address, selector_address, training_id, + num_prefetched_partitions, + parallel_prefetch_requests, tokenizer, log_path, ) @@ -77,6 +83,8 @@ def prepare_per_class_dataloader_from_online_dataset( online_dataset._selector_address, online_dataset._training_id, initial_filtered_label, + online_dataset._num_prefetched_partitions, + online_dataset._parallel_prefetch_requests, online_dataset._tokenizer_name, ) return torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers) diff --git a/modyn/trainer_server/internal/dataset/key_sources/local_key_source.py b/modyn/trainer_server/internal/dataset/key_sources/local_key_source.py index 5532dd799..8f60cf75a 100644 --- 
a/modyn/trainer_server/internal/dataset/key_sources/local_key_source.py +++ b/modyn/trainer_server/internal/dataset/key_sources/local_key_source.py @@ -15,6 +15,8 @@ def get_keys_and_weights(self, worker_id: int, partition_id: int) -> tuple[list[ path = self._trigger_sample_storage.get_file_path(self._pipeline_id, self._trigger_id, partition_id, worker_id) tuples_list = self._trigger_sample_storage._parse_file(path) + if len(tuples_list) == 0: + return [], [] keys, weights = zip(*tuples_list) return list(keys), list(weights) diff --git a/modyn/trainer_server/internal/dataset/local_dataset_writer.py b/modyn/trainer_server/internal/dataset/local_dataset_writer.py index 8a649cffd..a35323115 100644 --- a/modyn/trainer_server/internal/dataset/local_dataset_writer.py +++ b/modyn/trainer_server/internal/dataset/local_dataset_writer.py @@ -62,14 +62,10 @@ def _store_triggersamples_impl( def inform_samples(self, sample_ids: list, sample_weights: torch.Tensor) -> None: # map the two input lists to the desired format assert self.output_samples_list is not None - samples_list = np.empty(len(sample_ids), dtype=np.dtype("i8,f8")) - - for i, _ in enumerate(sample_ids): - samples_list[i] = (sample_ids[i], sample_weights[i]) # add the input tuples to the output list. - for element in samples_list: - self.output_samples_list[self.current_sample_index] = element + for sample_id, sample_weight in zip(sample_ids, sample_weights): + self.output_samples_list[self.current_sample_index] = (sample_id, sample_weight) self.current_sample_index += 1 # if the target is reached, store the samples diff --git a/modyn/trainer_server/internal/dataset/online_dataset.py b/modyn/trainer_server/internal/dataset/online_dataset.py index c97becff2..f0879319d 100644 --- a/modyn/trainer_server/internal/dataset/online_dataset.py +++ b/modyn/trainer_server/internal/dataset/online_dataset.py @@ -1,9 +1,10 @@ -import gc +import contextlib import json import logging import os import pathlib -from typing import Any, Callable, Generator, Optional, Tuple, Union +import threading +from typing import Any, Callable, Generator, Iterator, Optional, Tuple import grpc from modyn.common.benchmark.stopwatch import Stopwatch @@ -40,6 +41,8 @@ def __init__( storage_address: str, selector_address: str, training_id: int, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, tokenizer: Optional[str], log_path: Optional[pathlib.Path], ): @@ -48,6 +51,8 @@ def __init__( self._training_id = training_id self._dataset_id = dataset_id self._first_call = True + self._num_prefetched_partitions = num_prefetched_partitions + self._parallel_prefetch_requests = parallel_prefetch_requests self._bytes_parser = bytes_parser self._serialized_transforms = serialized_transforms @@ -65,6 +70,17 @@ def __init__( self._log: dict[str, Any] = {"partitions": {}} self._sw = Stopwatch() + self._data_threads: dict[int, threading.Thread] = {} + self._pref_started: dict[int, bool] = {} + self._thread_data_container: dict[int, dict[str, Any]] = {} + self._partition_locks: dict[int, threading.Lock] = {} + self._partition_signals: dict[int, threading.Condition] = {} # Should use the lock out of partition_locks + self._partition_valid_until: dict[int, int] = {} + self._partition_valid: dict[int, bool] = {} + self._next_partition_to_fetch = 0 + self._launched_prefetches = 0 + self._start_prefetch_lock: Optional[threading.Lock] = None + if log_path is None: logger.warning("Did not provide log path for OnlineDataset - logging disabled.") @@ -79,20 +95,6 @@ def __init__( def 
change_key_source(self, source: AbstractKeySource) -> None: self._key_source = source - def _get_data_from_storage(self, selector_keys: list[int]) -> tuple[list[bytes], list[int]]: - req = GetRequest(dataset_id=self._dataset_id, keys=selector_keys) - - data_from_storage: dict[int, tuple[bytes, int]] = {} - response: GetResponse - for _, response in enumerate(self._storagestub.Get(req)): - for key, sample, label in zip(response.keys, response.samples, response.labels): - data_from_storage[key] = (sample, label) - - sample_list = [data_from_storage[key][0] for key in selector_keys] - label_list = [data_from_storage[key][1] for key in selector_keys] - - return sample_list, label_list - def _setup_composed_transform(self) -> None: assert self._bytes_parser_function is not None @@ -134,50 +136,76 @@ def _info(self, msg: str, worker_id: Optional[int]) -> None: # pragma: no cover def _debug(self, msg: str, worker_id: Optional[int]) -> None: # pragma: no cover logger.debug(f"[Training {self._training_id}][PL {self._pipeline_id}][Worker {worker_id}] {msg}") + def _get_data_from_storage( + self, selector_keys: list[int] + ) -> Iterator[tuple[list[int], list[bytes], list[int], int]]: + req = GetRequest(dataset_id=self._dataset_id, keys=selector_keys) + stopw = Stopwatch() + + response: GetResponse + stopw.start("ResponseTime", overwrite=True) + for _, response in enumerate(self._storagestub.Get(req)): + yield list(response.keys), list(response.samples), list(response.labels), stopw.stop("ResponseTime") + stopw.start("ResponseTime", overwrite=True) + + # pylint: disable=too-many-locals def _get_data( - self, worker_id: int, partition_id: int - ) -> tuple[list[int], list[bytes], list[int], Optional[list[float]]]: + self, + data_container: dict, + worker_id: int, + partition_id: int, + partition_valid: Optional[dict], + partition_valid_until: Optional[dict], + partition_locks: Optional[dict], + partition_signals: Optional[dict], + callback: Optional[Callable], + ) -> None: get_data_log = {} - self._sw.start("GetKeysAndWeights", overwrite=True) + self._sw.start(f"GetKeysAndWeightsPart{partition_id}", overwrite=True) keys, weights = self._key_source.get_keys_and_weights(worker_id, partition_id) - get_data_log["get_keys_and_weights"] = self._sw.stop("GetKeysAndWeights") + get_data_log["get_keys_and_weights"] = self._sw.stop(f"GetKeysAndWeightsPart{partition_id}") get_data_log["num_items"] = len(keys) self._info("Getting data from storage", worker_id) - self._sw.start("GetData", overwrite=True) - data, labels = self._get_data_from_storage(keys) - get_data_log["get_data"] = self._sw.stop("GetData") - + self._sw.start(f"GetDataPart{partition_id}", overwrite=True) + all_response_times = [] + + key_weight_map = {key: weights[idx] for idx, key in enumerate(keys)} if weights is not None else None + + for data_tuple in self._get_data_from_storage(keys): + stor_keys, data, labels, response_time = data_tuple + all_response_times.append(response_time) + num_items = len(stor_keys) + with partition_locks[partition_id] if partition_locks is not None else contextlib.suppress(): + data_container["data"].extend(data) + data_container["keys"].extend(stor_keys) + data_container["labels"].extend(labels) + data_container["weights"].extend( + [key_weight_map[key] for key in stor_keys] + if key_weight_map is not None + else [None for _ in range(len(stor_keys))] + ) + if partition_valid_until is not None: + partition_valid_until[partition_id] += num_items + + if partition_signals is not None: + with 
partition_signals[partition_id]: + partition_signals[partition_id].notify_all() + + get_data_log["get_data"] = self._sw.stop(f"GetDataPart{partition_id}") + get_data_log["response_times"] = all_response_times self._log["partitions"][str(partition_id)] = get_data_log - return keys, data, labels, weights + if partition_locks is not None and partition_valid is not None: + with partition_locks[partition_id]: + partition_valid[partition_id] = True - def _get_data_iterator( - self, keys: list[int], data: list[bytes], labels: list[int], weights: Optional[list[float]] - ) -> enumerate: - assert self._uses_weights is not None + if callback is not None: + callback() - # pylint: disable-next = unsubscriptable-object - iterator: Union[zip[Tuple[int, bytes, int]], zip[Tuple[int, bytes, int, float]]] - if self._uses_weights: - assert weights is not None and len(weights) == len(keys) - iterator = zip(keys, data, labels, weights) - else: - iterator = zip(keys, data, labels) - return enumerate(iterator) - - def _unpack_data_tuple(self, data_tuple: Tuple) -> Tuple[int, bytes, int, Optional[float]]: - assert self._uses_weights is not None - - if self._uses_weights: - key, sample, label, weight = data_tuple - else: - key, sample, label = data_tuple - weight = None - - return key, sample, label, weight - - def _get_data_tuple(self, key: int, sample: bytes, label: int, weight: Optional[float]) -> Optional[Tuple]: + def _get_transformed_data_tuple( + self, key: int, sample: bytes, label: int, weight: Optional[float] + ) -> Optional[Tuple]: assert self._uses_weights is not None self._sw.start("transform", resume=True) # mypy complains here because _transform has unknown type, which is ok @@ -200,11 +228,174 @@ def _persist_log(self, worker_id: int) -> None: log_file = f"{self._log_path / str(worker_id)}.log" self._log["transform"] = self._sw.measurements.get("transform", 0) + self._log["wait_for_later_partitions"] = self._sw.measurements.get("wait_for_later_partitions", 0) + self._log["wait_for_initial_partition"] = self._sw.measurements.get("wait_for_initial_partition", 0) with open(log_file, "w", encoding="utf-8") as logfile: json.dump(self._log, logfile) - # pylint: disable=too-many-locals, too-many-branches + def _prefetch_partition(self, worker_id: int, maybe_continue: bool = False) -> None: + assert self._start_prefetch_lock is not None + with self._start_prefetch_lock: + if self._num_prefetched_partitions < 1 or self._next_partition_to_fetch >= self._num_partitions: + return # Prefetching disabled or nothing more to prefetch + + if maybe_continue and self._launched_prefetches >= self._num_prefetched_partitions: + return # Two callbacks started to prefetch basically at the same time + + if maybe_continue: + # Do this as early as possible to avoid running into the "problem" above frequently + self._launched_prefetches += 1 + + assert self._next_partition_to_fetch >= 0 + assert ( + self._next_partition_to_fetch not in self._data_threads + ), f"Prefetching for partition {self._next_partition_to_fetch} has already been started" + + self._thread_data_container[self._next_partition_to_fetch] = { + "data": [], + "keys": [], + "labels": [], + "weights": [], + } + self._partition_valid[self._next_partition_to_fetch] = False + self._partition_valid_until[self._next_partition_to_fetch] = -1 + self._partition_locks[self._next_partition_to_fetch] = threading.Lock() + self._partition_signals[self._next_partition_to_fetch] = threading.Condition( + self._partition_locks[self._next_partition_to_fetch] + ) + + callback = 
None + if maybe_continue: + + def callback_func() -> None: + self._info("Prefetch callback called.", worker_id) + + # It might be that between the check and the actual launch + # We start another launch + # We catch this with the lock within _prefetch_partition + if self._launched_prefetches < self._num_prefetched_partitions: + self._info( + f"Only {self._launched_prefetches} out of {self._num_prefetched_partitions}" + + " partitions have been fetched, issuing another request.", + worker_id, + ) + self._prefetch_partition(worker_id, True) + else: + self._info("Not issuing another request.", worker_id) + + callback = callback_func + + self._data_threads[self._next_partition_to_fetch] = threading.Thread( + target=self._get_data, + args=( + self._thread_data_container[self._next_partition_to_fetch], + worker_id, + self._next_partition_to_fetch, + self._partition_valid, + self._partition_valid_until, + self._partition_locks, + self._partition_signals, + callback, + ), + ) + + self._data_threads[self._next_partition_to_fetch].start() + self._pref_started[self._next_partition_to_fetch] = True + + self._next_partition_to_fetch += 1 + + def _fetch_partition_noprefetch( + self, worker_id: int, partition_id: int + ) -> Iterator[tuple[int, bytes, int, Optional[float]]]: + assert self._num_prefetched_partitions < 1 + container: dict[str, Any] = {"data": [], "keys": [], "labels": [], "weights": []} + self._get_data(container, worker_id, partition_id, None, None, None, None, None) + assert "data" in container and "labels" in container and "keys" in container and "weights" in container + + for idx in range(len(container["keys"])): + yield container["keys"][idx], container["data"][idx], container["labels"][idx], container["weights"][idx] + + def _is_partition_fetched(self, partition_id: int) -> bool: + if partition_id not in self._partition_locks or partition_id not in self._partition_valid: + return False + + with self._partition_locks[partition_id]: + return self._partition_valid[partition_id] + + def _partition_max_index(self, partition_id: int) -> int: + with self._partition_locks[partition_id]: + return self._partition_valid_until[partition_id] + + def _get_partition_data( + self, last_idx: int, max_idx: int, partition_id: int + ) -> Iterator[tuple[int, bytes, int, Optional[float]]]: + for idx in range(last_idx + 1, max_idx + 1): + yield self._thread_data_container[partition_id]["keys"][idx], self._thread_data_container[partition_id][ + "data" + ][idx], self._thread_data_container[partition_id]["labels"][idx], self._thread_data_container[partition_id][ + "weights" + ][ + idx + ] + + def _wait_for_new_partition_data(self, partition_id: int) -> None: + with self._partition_signals[partition_id]: + self._partition_signals[partition_id].wait(1) # In case we do not get woken up, we at most waste a second + + def prefetched_partition_generator( + self, worker_id: int, partition_id: int + ) -> Iterator[tuple[int, bytes, int, Optional[float]]]: + last_idx = -1 + + while not self._is_partition_fetched(partition_id): + max_idx = self._partition_max_index(partition_id) + if max_idx <= last_idx: # No new data + self._wait_for_new_partition_data(partition_id) + + yield from self._get_partition_data(last_idx, max_idx, partition_id) + last_idx = max_idx + + # Yield potential remaining data + self._info(f"Joining thread for partition {partition_id}", worker_id) + self._data_threads[partition_id].join() + self._info(f"Thread for partition {partition_id} joined", worker_id) + max_idx = 
self._partition_max_index(partition_id) + yield from self._get_partition_data(last_idx, max_idx, partition_id) + + def start_prefetching(self, worker_id: int) -> None: + if self._num_prefetched_partitions < 1: + # No prefetching at all + return + + if self._num_prefetched_partitions <= self._parallel_prefetch_requests: + # We can emit prefetching requests once and be done with it + for _ in range(self._num_prefetched_partitions): + self._prefetch_partition(worker_id, False) + + return + + # We have to respect the limit of parallel requests + for _ in range(self._parallel_prefetch_requests): + self._prefetch_partition(worker_id, True) + + def all_partition_generator(self, worker_id: int) -> Iterator[tuple[int, bytes, int, Optional[float]]]: + self.start_prefetching(worker_id) + + for partition_id in range(self._num_partitions): + self._persist_log(worker_id) + + if self._num_prefetched_partitions > 0: + if partition_id < self._num_partitions - 1: + # As we consume one partition, prefetch exactly one more partition + self._prefetch_partition(worker_id, False) + + yield from self.prefetched_partition_generator(worker_id, partition_id) + else: + yield from self._fetch_partition_noprefetch(worker_id, partition_id) + + # pylint: disable=too-many-locals, too-many-branches, too-many-statements + def __iter__(self) -> Generator: worker_info = get_worker_info() if worker_info is None: @@ -224,49 +415,34 @@ def __iter__(self) -> Generator: self._uses_weights = self._key_source.uses_weights() self._silence_pil() self._debug("gRPC initialized.", worker_id) - # Reinit logging and timetracking in this worker + # Reinit logging, timetracking in this worker self._log = {"partitions": {}} self._sw = Stopwatch() + self._start_prefetch_lock = threading.Lock() + + # Always reinitialize these structures for prefetching (for multiple epochs) + self._data_threads = {} + self._thread_data_container = {} + self._pref_started = {} + self._next_partition_to_fetch = 0 + self._partition_locks = {} + self._partition_valid_until = {} + self._partition_valid = {} + self._partition_signals = {} assert self._transform is not None self._num_partitions = self._key_source.get_num_data_partitions() - self._info(f"Total number of partitions will be {self._num_partitions}", worker_id) + self._info( + f"Total number of partitions will be {self._num_partitions}.\n" + + f"Parallel prefetch requests = {self._parallel_prefetch_requests}\n" + + f"Num prefetched partitions = {self._num_prefetched_partitions}", + worker_id, + ) self._log["num_partitions"] = self._num_partitions + self._num_prefetched_partitions = min(self._num_prefetched_partitions, self._num_partitions) - keys, data, labels, weights = self._get_data(worker_id=worker_id, partition_id=0) - - for partition in range(self._num_partitions): - self._persist_log(worker_id) - num_samples_on_this_partition = len(keys) - # We (arbitrarily) fetch the next partition when we have seen 80% of the current partition - fetch_next_partition_idx = int(num_samples_on_this_partition * 0.8) - self._info(f"Train on partition {partition}, on {num_samples_on_this_partition} batches", worker_id) - - for idx, data_tuple in self._get_data_iterator(keys, data, labels, weights): - key, sample, label, weight = self._unpack_data_tuple(data_tuple) - - if partition < self._num_partitions - 1 and idx == fetch_next_partition_idx: - # TODO(#175) in case this blocks training - new_keys, new_data, new_labels, new_weights = self._get_data( - worker_id=worker_id, partition_id=partition + 1 - ) - - data_tuple = 
self._get_data_tuple(key, sample, label, weight) - - if data_tuple is not None: - yield data_tuple - - # this should mean we keep only two partitions in mem - if partition < self._num_partitions - 1: - del keys - del data - del labels - del weights - keys, data, labels, weights = new_keys, new_data, new_labels, new_weights - del new_keys - del new_data - del new_labels - del new_weights - gc.collect() + for data_tuple in self.all_partition_generator(worker_id): + if (transformed_tuple := self._get_transformed_data_tuple(*data_tuple)) is not None: + yield transformed_tuple self._persist_log(worker_id) diff --git a/modyn/trainer_server/internal/dataset/per_class_online_dataset.py b/modyn/trainer_server/internal/dataset/per_class_online_dataset.py index 9413297a0..f10adaa9f 100644 --- a/modyn/trainer_server/internal/dataset/per_class_online_dataset.py +++ b/modyn/trainer_server/internal/dataset/per_class_online_dataset.py @@ -20,6 +20,8 @@ def __init__( selector_address: str, training_id: int, initial_filtered_label: int, + num_prefetched_partitions: int, + parallel_prefetch_requests: int, tokenizer: Optional[str], ): super().__init__( @@ -31,15 +33,19 @@ def __init__( storage_address, selector_address, training_id, + num_prefetched_partitions, + parallel_prefetch_requests, tokenizer, None, ) assert initial_filtered_label is not None self.filtered_label = initial_filtered_label - def _get_data_tuple(self, key: int, sample: bytes, label: int, weight: Optional[float]) -> Optional[Tuple]: + def _get_transformed_data_tuple( + self, key: int, sample: bytes, label: int, weight: Optional[float] + ) -> Optional[Tuple]: assert self.filtered_label is not None if self.filtered_label != label: return None - return super()._get_data_tuple(key, sample, label, weight) + return super()._get_transformed_data_tuple(key, sample, label, weight) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py index d02654560..e62f14fdd 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.py @@ -14,7 +14,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\xb9\x06\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 \x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e 
\x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12\x11\n\x04seed\x18\x16 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x17 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 .trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x14trainer_server.proto\x12\x07trainer\"\x1b\n\nJsonString\x12\r\n\x05value\x18\x01 \x01(\t\"\x1d\n\x0cPythonString\x12\r\n\x05value\x18\x01 \x01(\t\"3\n\x04\x44\x61ta\x12\x12\n\ndataset_id\x18\x01 \x01(\t\x12\x17\n\x0fnum_dataloaders\x18\x02 \x01(\x05\"\x19\n\x17TrainerAvailableRequest\"-\n\x18TrainerAvailableResponse\x12\x11\n\tavailable\x18\x01 \x01(\x08\"F\n\x0e\x43heckpointInfo\x12\x1b\n\x13\x63heckpoint_interval\x18\x01 \x01(\x05\x12\x17\n\x0f\x63heckpoint_path\x18\x02 \x01(\t\"\x80\x07\n\x14StartTrainingRequest\x12\x13\n\x0bpipeline_id\x18\x01 \x01(\x05\x12\x12\n\ntrigger_id\x18\x02 \x01(\x05\x12\x0e\n\x06\x64\x65vice\x18\x03 \x01(\t\x12\x0b\n\x03\x61mp\x18\x04 \x01(\x08\x12\x10\n\x08model_id\x18\x05 \x01(\t\x12\x30\n\x13model_configuration\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1c\n\x14use_pretrained_model\x18\x07 
\x01(\x08\x12\x1c\n\x14load_optimizer_state\x18\x08 \x01(\x08\x12\x1b\n\x13pretrained_model_id\x18\t \x01(\x05\x12\x12\n\nbatch_size\x18\n \x01(\x05\x12;\n\x1etorch_optimizers_configuration\x18\x0b \x01(\x0b\x32\x13.trainer.JsonString\x12\x17\n\x0ftorch_criterion\x18\x0c \x01(\t\x12\x31\n\x14\x63riterion_parameters\x18\r \x01(\x0b\x32\x13.trainer.JsonString\x12 \n\tdata_info\x18\x0e \x01(\x0b\x32\r.trainer.Data\x12\x30\n\x0f\x63heckpoint_info\x18\x0f \x01(\x0b\x32\x17.trainer.CheckpointInfo\x12+\n\x0c\x62ytes_parser\x18\x10 \x01(\x0b\x32\x15.trainer.PythonString\x12\x16\n\x0etransform_list\x18\x11 \x03(\t\x12)\n\x0clr_scheduler\x18\x12 \x01(\x0b\x32\x13.trainer.JsonString\x12\x30\n\x11label_transformer\x18\x13 \x01(\x0b\x32\x15.trainer.PythonString\x12\x36\n\x19grad_scaler_configuration\x18\x14 \x01(\x0b\x32\x13.trainer.JsonString\x12\x1a\n\x12\x65pochs_per_trigger\x18\x15 \x01(\x05\x12!\n\x19num_prefetched_partitions\x18\x16 \x01(\x05\x12\"\n\x1aparallel_prefetch_requests\x18\x17 \x01(\x05\x12\x11\n\x04seed\x18\x18 \x01(\x05H\x00\x88\x01\x01\x12-\n\ttokenizer\x18\x19 \x01(\x0b\x32\x15.trainer.PythonStringH\x01\x88\x01\x01\x42\x07\n\x05_seedB\x0c\n\n_tokenizer\"F\n\x15StartTrainingResponse\x12\x18\n\x10training_started\x18\x01 \x01(\x08\x12\x13\n\x0btraining_id\x18\x02 \x01(\x05\",\n\x15TrainingStatusRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"\xa6\x03\n\x16TrainingStatusResponse\x12\r\n\x05valid\x18\x01 \x01(\x08\x12\x12\n\nis_running\x18\x02 \x01(\x08\x12\x13\n\x0bis_training\x18\x03 \x01(\x08\x12\x17\n\x0fstate_available\x18\x04 \x01(\x08\x12\x0f\n\x07\x62locked\x18\x05 \x01(\x08\x12 \n\x03log\x18\x06 \x01(\x0b\x32\x13.trainer.JsonString\x12\x16\n\texception\x18\x07 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x62\x61tches_seen\x18\x08 \x01(\x03H\x01\x88\x01\x01\x12\x19\n\x0csamples_seen\x18\t \x01(\x03H\x02\x88\x01\x01\x12&\n\x19\x64ownsampling_batches_seen\x18\n \x01(\x03H\x03\x88\x01\x01\x12&\n\x19\x64ownsampling_samples_seen\x18\x0b \x01(\x03H\x04\x88\x01\x01\x42\x0c\n\n_exceptionB\x0f\n\r_batches_seenB\x0f\n\r_samples_seenB\x1c\n\x1a_downsampling_batches_seenB\x1c\n\x1a_downsampling_samples_seen\"-\n\x16StoreFinalModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"@\n\x17StoreFinalModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x10\n\x08model_id\x18\x02 \x01(\x05\",\n\x15GetLatestModelRequest\x12\x13\n\x0btraining_id\x18\x01 \x01(\x05\"A\n\x16GetLatestModelResponse\x12\x13\n\x0bvalid_state\x18\x01 \x01(\x08\x12\x12\n\nmodel_path\x18\x02 \x01(\t2\xc9\x03\n\rTrainerServer\x12Z\n\x11trainer_available\x12 .trainer.TrainerAvailableRequest\x1a!.trainer.TrainerAvailableResponse\"\x00\x12Q\n\x0estart_training\x12\x1d.trainer.StartTrainingRequest\x1a\x1e.trainer.StartTrainingResponse\"\x00\x12X\n\x13get_training_status\x12\x1e.trainer.TrainingStatusRequest\x1a\x1f.trainer.TrainingStatusResponse\"\x00\x12X\n\x11store_final_model\x12\x1f.trainer.StoreFinalModelRequest\x1a .trainer.StoreFinalModelResponse\"\x00\x12U\n\x10get_latest_model\x12\x1e.trainer.GetLatestModelRequest\x1a\x1f.trainer.GetLatestModelResponse\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -35,21 +35,21 @@ _globals['_CHECKPOINTINFO']._serialized_start=220 _globals['_CHECKPOINTINFO']._serialized_end=290 _globals['_STARTTRAININGREQUEST']._serialized_start=293 - _globals['_STARTTRAININGREQUEST']._serialized_end=1118 - _globals['_STARTTRAININGRESPONSE']._serialized_start=1120 - _globals['_STARTTRAININGRESPONSE']._serialized_end=1190 - 
_globals['_TRAININGSTATUSREQUEST']._serialized_start=1192 - _globals['_TRAININGSTATUSREQUEST']._serialized_end=1236 - _globals['_TRAININGSTATUSRESPONSE']._serialized_start=1239 - _globals['_TRAININGSTATUSRESPONSE']._serialized_end=1661 - _globals['_STOREFINALMODELREQUEST']._serialized_start=1663 - _globals['_STOREFINALMODELREQUEST']._serialized_end=1708 - _globals['_STOREFINALMODELRESPONSE']._serialized_start=1710 - _globals['_STOREFINALMODELRESPONSE']._serialized_end=1774 - _globals['_GETLATESTMODELREQUEST']._serialized_start=1776 - _globals['_GETLATESTMODELREQUEST']._serialized_end=1820 - _globals['_GETLATESTMODELRESPONSE']._serialized_start=1822 - _globals['_GETLATESTMODELRESPONSE']._serialized_end=1887 - _globals['_TRAINERSERVER']._serialized_start=1890 - _globals['_TRAINERSERVER']._serialized_end=2347 + _globals['_STARTTRAININGREQUEST']._serialized_end=1189 + _globals['_STARTTRAININGRESPONSE']._serialized_start=1191 + _globals['_STARTTRAININGRESPONSE']._serialized_end=1261 + _globals['_TRAININGSTATUSREQUEST']._serialized_start=1263 + _globals['_TRAININGSTATUSREQUEST']._serialized_end=1307 + _globals['_TRAININGSTATUSRESPONSE']._serialized_start=1310 + _globals['_TRAININGSTATUSRESPONSE']._serialized_end=1732 + _globals['_STOREFINALMODELREQUEST']._serialized_start=1734 + _globals['_STOREFINALMODELREQUEST']._serialized_end=1779 + _globals['_STOREFINALMODELRESPONSE']._serialized_start=1781 + _globals['_STOREFINALMODELRESPONSE']._serialized_end=1845 + _globals['_GETLATESTMODELREQUEST']._serialized_start=1847 + _globals['_GETLATESTMODELREQUEST']._serialized_end=1891 + _globals['_GETLATESTMODELRESPONSE']._serialized_start=1893 + _globals['_GETLATESTMODELRESPONSE']._serialized_end=1958 + _globals['_TRAINERSERVER']._serialized_start=1961 + _globals['_TRAINERSERVER']._serialized_end=2418 # @@protoc_insertion_point(module_scope) diff --git a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi index 8b793c5a0..9723ebdb8 100644 --- a/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi +++ b/modyn/trainer_server/internal/grpc/generated/trainer_server_pb2.pyi @@ -133,6 +133,8 @@ class StartTrainingRequest(google.protobuf.message.Message): LABEL_TRANSFORMER_FIELD_NUMBER: builtins.int GRAD_SCALER_CONFIGURATION_FIELD_NUMBER: builtins.int EPOCHS_PER_TRIGGER_FIELD_NUMBER: builtins.int + NUM_PREFETCHED_PARTITIONS_FIELD_NUMBER: builtins.int + PARALLEL_PREFETCH_REQUESTS_FIELD_NUMBER: builtins.int SEED_FIELD_NUMBER: builtins.int TOKENIZER_FIELD_NUMBER: builtins.int pipeline_id: builtins.int @@ -166,6 +168,8 @@ class StartTrainingRequest(google.protobuf.message.Message): @property def grad_scaler_configuration(self) -> global___JsonString: ... epochs_per_trigger: builtins.int + num_prefetched_partitions: builtins.int + parallel_prefetch_requests: builtins.int seed: builtins.int @property def tokenizer(self) -> global___PythonString: ... @@ -193,11 +197,13 @@ class StartTrainingRequest(google.protobuf.message.Message): label_transformer: global___PythonString | None = ..., grad_scaler_configuration: global___JsonString | None = ..., epochs_per_trigger: builtins.int = ..., + num_prefetched_partitions: builtins.int = ..., + parallel_prefetch_requests: builtins.int = ..., seed: builtins.int | None = ..., tokenizer: global___PythonString | None = ..., ) -> None: ... 
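
The two fields added to StartTrainingRequest here are plain proto3 int32 scalars, so they default to 0 when unset, which the dataset code treats as prefetching disabled (num_prefetched_partitions < 1). A minimal sketch of setting them on a request, assuming the generated module is importable from the path shown above:

    from modyn.trainer_server.internal.grpc.generated.trainer_server_pb2 import (
        StartTrainingRequest,
    )

    # Unset scalar fields default to 0, which OnlineDataset interprets as
    # "no prefetching".
    req = StartTrainingRequest(
        pipeline_id=1,
        trigger_id=1,
        device="cuda:0",
        num_prefetched_partitions=4,
        parallel_prefetch_requests=2,
    )
    assert req.num_prefetched_partitions == 4
    assert req.parallel_prefetch_requests == 2
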
def HasField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "seed", b"seed", "tokenizer", b"tokenizer", "torch_optimizers_configuration", b"torch_optimizers_configuration"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "model_id", b"model_id", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["_seed", b"_seed", "_tokenizer", b"_tokenizer", "amp", b"amp", "batch_size", b"batch_size", "bytes_parser", b"bytes_parser", "checkpoint_info", b"checkpoint_info", "criterion_parameters", b"criterion_parameters", "data_info", b"data_info", "device", b"device", "epochs_per_trigger", b"epochs_per_trigger", "grad_scaler_configuration", b"grad_scaler_configuration", "label_transformer", b"label_transformer", "load_optimizer_state", b"load_optimizer_state", "lr_scheduler", b"lr_scheduler", "model_configuration", b"model_configuration", "model_id", b"model_id", "num_prefetched_partitions", b"num_prefetched_partitions", "parallel_prefetch_requests", b"parallel_prefetch_requests", "pipeline_id", b"pipeline_id", "pretrained_model_id", b"pretrained_model_id", "seed", b"seed", "tokenizer", b"tokenizer", "torch_criterion", b"torch_criterion", "torch_optimizers_configuration", b"torch_optimizers_configuration", "transform_list", b"transform_list", "trigger_id", b"trigger_id", "use_pretrained_model", b"use_pretrained_model"]) -> None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]) -> typing_extensions.Literal["seed"] | None: ... 
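
The two knobs typed above interact as in start_prefetching in online_dataset.py: when the prefetch budget fits within the parallelism limit, all requests are issued immediately; otherwise only the limit is issued up front and completion callbacks keep launching more. A small pure-function restatement of that launch logic (illustrative, not the method itself):

    def initial_prefetch_requests(num_prefetched_partitions: int,
                                  parallel_prefetch_requests: int) -> int:
        # Mirrors start_prefetching: values below 1 disable prefetching;
        # otherwise at most parallel_prefetch_requests threads start up
        # front and callbacks issue the remaining budget over time.
        if num_prefetched_partitions < 1:
            return 0
        return min(num_prefetched_partitions, parallel_prefetch_requests)

    assert initial_prefetch_requests(4, 2) == 2  # budget above the parallel limit
    assert initial_prefetch_requests(2, 4) == 2  # budget below the limit
    assert initial_prefetch_requests(0, 4) == 0  # prefetching disabled
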
@typing.overload diff --git a/modyn/trainer_server/internal/trainer/pytorch_trainer.py b/modyn/trainer_server/internal/trainer/pytorch_trainer.py index 47c118893..5d4f69b80 100644 --- a/modyn/trainer_server/internal/trainer/pytorch_trainer.py +++ b/modyn/trainer_server/internal/trainer/pytorch_trainer.py @@ -17,6 +17,7 @@ import numpy as np import torch from modyn.common.benchmark.stopwatch import Stopwatch +from modyn.models.coreset_methods_support import CoresetSupportingModule from modyn.selector.internal.grpc.generated.selector_pb2 import ( AvailableLabelsResponse, GetAvailableLabelsRequest, @@ -161,6 +162,8 @@ def __init__( training_info.storage_address, training_info.selector_address, training_info.training_id, + training_info.num_prefetched_partitions, + training_info.parallel_prefetch_requests, training_info.tokenizer, self._dataset_log_path, ) @@ -324,6 +327,7 @@ def instantiate_downsampler( self._batch_size, downsampler_config, per_sample_loss, + self._device, ) def sample_then_batch_this_epoch(self, epoch: int) -> bool: @@ -385,7 +389,7 @@ def train(self) -> None: # pylint: disable=too-many-locals, too-many-branches if retrieve_weights_from_dataloader: # model output is a torch.FloatTensor but weights is a torch.DoubleTensor. # We need to cast to do the dot product - weights = batch[3].float() + weights = batch[3].float().to(self._device) for _, optimizer in self._optimizers.items(): optimizer.zero_grad() @@ -606,14 +610,39 @@ def downsample_batch( assert self._downsampling_mode == DownsamplingMode.BATCH_THEN_SAMPLE self._downsampler.init_downsampler() + self.start_embedding_recording_if_needed() big_batch_output = self._model.model(data) - self._downsampler.inform_samples(sample_ids, big_batch_output, target) + embeddings = self.get_embeddings_if_recorded() + self._downsampler.inform_samples(sample_ids, big_batch_output, target, embeddings) + self.end_embedding_recorder_if_needed() + # TODO(#218) Persist information on the sample IDs/weights when downsampling is performed selected_indexes, weights = self._downsampler.select_points() selected_data, selected_target = get_tensors_subset(selected_indexes, data, target, sample_ids) sample_ids, data, target = selected_indexes, selected_data, selected_target # TODO(#219) Investigate if we can avoid 2 forward passes - return data, sample_ids, target, weights + return data, sample_ids, target, weights.to(self._device) + + def start_embedding_recording_if_needed(self) -> None: + if self._downsampler.requires_coreset_supporting_module: + # enable the embedding recorder to keep track of last layer embedding. 
The embeddings are stored + # in self._model.model.embedding_recorder.embedding + assert isinstance(self._model.model, CoresetSupportingModule) + self._model.model.embedding_recorder.start_recording() + + def get_embeddings_if_recorded(self) -> Optional[torch.Tensor]: + # supply the embeddings if required by the downsampler + if self._downsampler.requires_coreset_supporting_module: + embeddings = self._model.model.embedding_recorder.embedding + else: + embeddings = None + return embeddings + + def end_embedding_recorder_if_needed(self) -> None: + if self._downsampler.requires_coreset_supporting_module: + # turn off the embedding recording (not needed for regular training) + assert isinstance(self._model.model, CoresetSupportingModule) + self._model.model.embedding_recorder.end_recording() def downsample_trigger_training_set(self) -> None: """ @@ -633,6 +662,9 @@ def downsample_trigger_training_set(self) -> None: ) self._train_dataloader.dataset.change_key_source(selector_key_source) self._downsampler.init_downsampler() + + self.start_embedding_recording_if_needed() + if self._downsampler.requires_data_label_by_label: assert isinstance(self._downsampler, AbstractPerLabelRemoteDownsamplingStrategy) available_labels = self._get_available_labels_from_selector() @@ -661,6 +693,8 @@ def downsample_trigger_training_set(self) -> None: selected_ids, weights = self._downsampler.select_points() + self.end_embedding_recorder_if_needed() + # to store all the selected (sample, weight). # TODO(#283) investigate which size performs the best file_size = self._num_dataloaders * self._batch_size @@ -714,7 +748,9 @@ def _iterate_dataloader_and_compute_scores( with torch.autocast(self._device_type, enabled=self._amp): # compute the scores and accumulate them model_output = self._model.model(data) - self._downsampler.inform_samples(sample_ids, model_output, target) + embeddings = self.get_embeddings_if_recorded() + self._downsampler.inform_samples(sample_ids, model_output, target, embeddings) + return batch_number, number_of_samples def end_of_trigger_cleaning(self) -> None: diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/__init__.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/__init__.py index d05cfe23f..9bd6cfc39 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/__init__.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/__init__.py @@ -1,7 +1,12 @@ import os +from .remote_craig_downsampling import RemoteCraigDownsamplingStrategy # noqa: F401 +from .remote_grad_match_downsampling_strategy import RemoteGradMatchDownsamplingStrategy # noqa: F401 from .remote_gradnorm_downsampling import RemoteGradNormDownsampling # noqa: F401 +from .remote_kcenter_greedy_downsampling_strategy import RemoteKcenterGreedyDownsamplingStrategy # noqa: F401 from .remote_loss_downsampling import RemoteLossDownsampling # noqa: F401 +from .remote_submodular_downsampling_strategy import RemoteSubmodularDownsamplingStrategy # noqa: F401 +from .remote_uncertainty_downsampling_strategy import RemoteUncertaintyDownsamplingStrategy # noqa: F401 files = os.listdir(os.path.dirname(__file__)) files.remove("__init__.py") diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py new file mode 100644 index 000000000..a92dbefaa --- /dev/null +++ 
b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_matrix_downsampling_strategy.py @@ -0,0 +1,122 @@ +from abc import abstractmethod +from enum import Enum +from typing import Any, Optional + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_per_label_remote_downsample_strategy import ( + AbstractPerLabelRemoteDownsamplingStrategy, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.shuffling import _shuffle_list_and_tensor + +MatrixContent = Enum("MatrixContent", ["EMBEDDINGS", "GRADIENTS"]) + + +class AbstractMatrixDownsamplingStrategy(AbstractPerLabelRemoteDownsamplingStrategy): + """ + Class to abstract the common behaviour of the many downsampling strategies that collect the gradients or the + embeddings (thus, a matrix) and then select the points based on some method-specific metric (submodular + optimization, clustering, OMP...). + """ + + def __init__( + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, + ): + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device) + + self.criterion = per_sample_loss + + # This class uses the embedding recorder + self.requires_coreset_supporting_module = True + self.matrix_elements: list[torch.Tensor] = [] + + # Concrete subclasses must specify which content should be stored (either gradients or embeddings) using the + # enum defined above + self.matrix_content: Optional[MatrixContent] = None + + # if true, the downsampling is balanced across classes, e.g., with class sizes [10, 50, 30], 50% downsampling + # yields the downsampled class sizes [5, 25, 15], while without balancing something like [0, 45, 0] can + # happen + self.balance = params_from_selector["balance"] + if self.balance: + # Selection happens class by class.
These data structures are used to store the selection results + self.already_selected_samples: list[int] = [] + self.already_selected_weights = torch.tensor([]).float() + self.requires_data_label_by_label = True + else: + self.requires_data_label_by_label = False + + def inform_samples( + self, + sample_ids: list[int], + forward_output: torch.Tensor, + target: torch.Tensor, + embedding: Optional[torch.Tensor] = None, + ) -> None: + assert embedding is not None + assert self.matrix_content is not None + + if self.matrix_content == MatrixContent.GRADIENTS: + new_elements = self._compute_gradients(forward_output, target, embedding) + elif self.matrix_content == MatrixContent.EMBEDDINGS: + new_elements = embedding.detach().cpu() + else: + raise AssertionError("The required content does not exist.") + + self.matrix_elements.append(new_elements) + # keep the mapping index<->sample_id + self.index_sampleid_map += sample_ids + + def _compute_gradients( + self, forward_output: torch.Tensor, target: torch.Tensor, embedding: torch.Tensor + ) -> np.ndarray: + loss = self.criterion(forward_output, target).mean() + embedding_dim = embedding.shape[1] + num_classes = forward_output.shape[1] + batch_num = target.shape[0] + # compute the gradient for each element provided + with torch.no_grad(): + bias_parameters_grads = torch.autograd.grad(loss, forward_output)[0] + weight_parameters_grads = embedding.view(batch_num, 1, embedding_dim).repeat( + 1, num_classes, 1 + ) * bias_parameters_grads.view(batch_num, num_classes, 1).repeat(1, 1, embedding_dim) + gradients = torch.cat([bias_parameters_grads, weight_parameters_grads.flatten(1)], dim=1).cpu().numpy() + return gradients + + def inform_end_of_current_label(self) -> None: + assert self.balance + selected_samples, selected_weights = self._select_from_matrix() + self.already_selected_samples += selected_samples + self.already_selected_weights = torch.cat((self.already_selected_weights, selected_weights)) + self.matrix_elements = [] + self.index_sampleid_map: list[int] = [] + + def select_points(self) -> tuple[list[int], torch.Tensor]: + if self.balance: + ids, weights = self.already_selected_samples, self.already_selected_weights + else: + ids, weights = self._select_from_matrix() + + return _shuffle_list_and_tensor(ids, weights) + + def _select_from_matrix(self) -> tuple[list[int], torch.Tensor]: + matrix = np.concatenate(self.matrix_elements) + number_of_samples = len(matrix) + target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + selected_indices, weights = self._select_indexes_from_matrix(matrix, target_size) + selected_ids = [self.index_sampleid_map[index] for index in selected_indices] + return selected_ids, weights + + def init_downsampler(self) -> None: + self.matrix_elements = [] + self.index_sampleid_map = [] + + @abstractmethod + def _select_indexes_from_matrix(self, matrix: np.ndarray, target_size: int) -> tuple[list[int], torch.Tensor]: + # Here is where the actual selection happens + raise NotImplementedError() diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_per_label_remote_downsample_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_per_label_remote_downsample_strategy.py index 09e96b77f..0c8830935 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_per_label_remote_downsample_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_per_label_remote_downsample_strategy.py @@ -6,8 +6,8 @@ class 
AbstractPerLabelRemoteDownsamplingStrategy(AbstractRemoteDownsamplingStrategy): - def __init__(self, pipeline_id: int, trigger_id: int, batch_size: int, params_from_selector: dict): - super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector) + def __init__(self, pipeline_id: int, trigger_id: int, batch_size: int, params_from_selector: dict, device: str): + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device) self.requires_data_label_by_label = True @abstractmethod diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py index ee171a5c4..b38f35685 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/abstract_remote_downsampling_strategy.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Union +from typing import Optional, Union import torch @@ -32,10 +32,13 @@ def get_tensors_subset( class AbstractRemoteDownsamplingStrategy(ABC): - def __init__(self, pipeline_id: int, trigger_id: int, batch_size: int, params_from_selector: dict) -> None: + def __init__( + self, pipeline_id: int, trigger_id: int, batch_size: int, params_from_selector: dict, device: str + ) -> None: self.pipeline_id = pipeline_id self.batch_size = batch_size self.trigger_id = trigger_id + self.device = device assert "downsampling_ratio" in params_from_selector self.downsampling_ratio = params_from_selector["downsampling_ratio"] @@ -55,12 +58,22 @@ def __init__(self, pipeline_id: int, trigger_id: int, batch_size: int, params_fr # can use the following parameter self.requires_data_label_by_label = False + # Some methods require extra features (embedding recorder, get_last_layer) that are implemented in the class + CoresetSupportingModule for model implementations. + self.requires_coreset_supporting_module = False + @abstractmethod def init_downsampler(self) -> None: raise NotImplementedError @abstractmethod - def inform_samples(self, sample_ids: list[int], forward_output: torch.Tensor, target: torch.Tensor) -> None: + def inform_samples( + self, + sample_ids: list[int], + forward_output: torch.Tensor, + target: torch.Tensor, + embedding: Optional[torch.Tensor] = None, + ) -> None: raise NotImplementedError + @abstractmethod diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/README.md b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/README.md new file mode 100644 index 000000000..2d10478f3 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/README.md @@ -0,0 +1,32 @@ +# DeepCore Utils +The content of this folder is taken from DeepCore with minor fixes. +Compared to DeepCore, the `else` branch of `orthogonal_matching_pursuit` (line 47) has been changed to support the updated version +of `torch.linalg.lstsq`. Refer to this [issue](https://github.com/PatrickZH/DeepCore/issues/10) for the fix.
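To make the referenced fix concrete: a minimal, self-contained sketch of the updated least-squares step, mirroring the regularized normal-equation solve in `orthogonal_matching_pursuit` further below (all shapes here are toy values).

```python
import torch

# The deprecated torch.lstsq(B, A) packed solution and residuals into one
# tensor; torch.linalg.lstsq(A, B) returns a named tuple instead. This mirrors
# the ridge-regularized solve in the OMP `else` branch below.
A_i = torch.randn(3, 8)   # stand-in: 3 selected gradient rows
b = torch.randn(8)        # stand-in: target vector
lam = 1.0

temp = A_i @ A_i.T + lam * torch.eye(A_i.shape[0])  # (3, 3) regularized Gram matrix
rhs = (A_i @ b).view(-1, 1)                         # (3, 1) right-hand side

out = torch.linalg.lstsq(temp, rhs)
x_i = out.solution        # least-squares coefficients, shape (3, 1)
print(x_i.shape)
```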
+ + +You can find the original code [here](https://github.com/PatrickZH/DeepCore/tree/main/deepcore/methods/methods_utils) and +the MIT license [here](https://raw.githubusercontent.com/PatrickZH/DeepCore/main/LICENSE.md) + +## DEEPCORE license + +MIT License + +Copyright (c) 2023 ZHAO, BO + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/__init__.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/__init__.py new file mode 100644 index 000000000..af9bc1ea3 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/__init__.py @@ -0,0 +1,8 @@ +""" +DeepCore Utils +""" +import os + +files = os.listdir(os.path.dirname(__file__)) +files.remove("__init__.py") +__all__ = [f[:-3] for f in files if f.endswith(".py")] diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/cossim.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/cossim.py new file mode 100644 index 000000000..4a536072e --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/cossim.py @@ -0,0 +1,39 @@ +# flake8: noqa +# mypy: ignore-errors + +import numpy as np +import torch + + +def cossim_np(v1, v2): + num = np.dot(v1, v2.T) + denom = np.linalg.norm(v1, axis=1).reshape(-1, 1) * np.linalg.norm(v2, axis=1) + res = num / denom + res[np.isneginf(res)] = 0.0 + return 0.5 + 0.5 * res + + +def cossim_pair_np(v1): + num = np.dot(v1, v1.T) + norm = np.linalg.norm(v1, axis=1) + denom = norm.reshape(-1, 1) * norm + res = num / denom + res[np.isneginf(res)] = 0.0 + return 0.5 + 0.5 * res + + +def cossim(v1, v2): + num = torch.matmul(v1, v2.T) + denom = torch.norm(v1, dim=1).view(-1, 1) * torch.norm(v2, dim=1) + res = num / denom + res[torch.isneginf(res)] = 0.0 + return 0.5 + 0.5 * res + + +def cossim_pair(v1): + num = torch.matmul(v1, v1.T) + norm = torch.norm(v1, dim=1) + denom = norm.view(-1, 1) * norm + res = num / denom + res[torch.isneginf(res)] = 0.0 + return 0.5 + 0.5 * res diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/euclidean.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/euclidean.py new file mode 100644 index 000000000..c46a61030 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/euclidean.py @@ -0,0 +1,40 @@ +# flake8: noqa +# mypy: ignore-errors + +import numpy as np +import torch 
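A short usage note on `cossim_np`/`cossim_pair_np` above: the final `0.5 + 0.5 * res` rescales cosine similarity from [-1, 1] to [0, 1], which matters later because some submodular functions (e.g. GraphCut, see the submodular strategy further below) assume non-negative similarities. A quick check:

```python
import numpy as np

from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.cossim import cossim_pair_np

v = np.array([[1.0, 0.0], [-1.0, 0.0], [0.0, 1.0]])
sims = cossim_pair_np(v)

assert np.isclose(sims[0, 0], 1.0)  # identical direction -> 1
assert np.isclose(sims[0, 1], 0.0)  # opposite direction -> 0 (not -1)
assert np.isclose(sims[0, 2], 0.5)  # orthogonal -> 0.5
assert sims.min() >= 0.0            # safe for GraphCut-style functions
```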
+ + +def euclidean_dist(x, y): + m, n = x.size(0), y.size(0) + xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n) + yy = torch.pow(y, 2).sum(1, keepdim=True).expand(n, m).t() + dist = xx + yy + dist.addmm_(1, -2, x, y.t()) + dist = dist.clamp(min=1e-12).sqrt() + return dist + + +def euclidean_dist_pair(x): + m = x.size(0) + xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, m) + dist = xx + xx.t() + dist.addmm_(1, -2, x, x.t()) + dist = dist.clamp(min=1e-12).sqrt() + return dist + + +def euclidean_dist_np(x, y): + (rowx, colx) = x.shape + (rowy, coly) = y.shape + xy = np.dot(x, y.T) + x2 = np.repeat(np.reshape(np.sum(np.multiply(x, x), axis=1), (rowx, 1)), repeats=rowy, axis=1) + y2 = np.repeat(np.reshape(np.sum(np.multiply(y, y), axis=1), (rowy, 1)), repeats=rowx, axis=1).T + return np.sqrt(np.clip(x2 + y2 - 2.0 * xy, 1e-12, None)) + + +def euclidean_dist_pair_np(x): + (rowx, colx) = x.shape + xy = np.dot(x, x.T) + x2 = np.repeat(np.reshape(np.sum(np.multiply(x, x), axis=1), (rowx, 1)), repeats=rowx, axis=1) + return np.sqrt(np.clip(x2 + x2.T - 2.0 * xy, 1e-12, None)) diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/k_center_greedy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/k_center_greedy.py new file mode 100644 index 000000000..0d64a4098 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/k_center_greedy.py @@ -0,0 +1,58 @@ +# flake8: noqa +# mypy: ignore-errors + +import numpy as np +import torch + + +def k_center_greedy(matrix, budget: int, metric, device, print_freq: int = 20): + if type(matrix) == torch.Tensor: + assert matrix.dim() == 2 + elif type(matrix) == np.ndarray: + assert matrix.ndim == 2 + matrix = torch.from_numpy(matrix).requires_grad_(False).to(device) + + sample_num = matrix.shape[0] + assert sample_num >= 1 + + if budget < 0: + raise ValueError("Illegal budget size.") + elif budget > sample_num: + budget = sample_num + + index = np.arange(sample_num) + + assert callable(metric) + + with torch.no_grad(): + select_result = np.zeros(sample_num, dtype=bool) + # Randomly select one initial point. + already_selected = [np.random.randint(0, sample_num)] + budget -= 1 + select_result[already_selected] = True + + if budget == 0: + return index[select_result] + + num_of_already_selected = np.sum(select_result) + + # Initialize a (num_of_already_selected+budget-1)*sample_num matrix storing distances of pool points from + # each clustering center. 
+ dis_matrix = -1 * torch.ones([num_of_already_selected + budget - 1, sample_num], requires_grad=False).to(device) + + dis_matrix[:num_of_already_selected, ~select_result] = metric(matrix[select_result], matrix[~select_result]) + + mins = torch.min(dis_matrix[:num_of_already_selected, :], dim=0).values + + for i in range(budget): + if print_freq is not None and i % print_freq == 0: + print("| Selecting [%3d/%3d]" % (i + 1, budget)) + p = torch.argmax(mins).item() + select_result[p] = True + + if i == budget - 1: + break + mins[p] = -1 + dis_matrix[num_of_already_selected + i, ~select_result] = metric(matrix[[p]], matrix[~select_result]) + mins = torch.min(mins, dis_matrix[num_of_already_selected + i]) + return index[select_result] diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/orthogonal_matching_pursuit.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/orthogonal_matching_pursuit.py new file mode 100644 index 000000000..fddba0c2d --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/orthogonal_matching_pursuit.py @@ -0,0 +1,105 @@ +# flake8: noqa +# mypy: ignore-errors + +import numpy as np +import torch +from scipy.linalg import lstsq +from scipy.optimize import nnls + + +def orthogonal_matching_pursuit(A, b, budget: int, lam: float = 1.0): + """approximately solves min_x |x|_0 s.t. Ax=b using Orthogonal Matching Pursuit + Acknowledgement to: + https://github.com/krishnatejakk/GradMatch/blob/main/GradMatch/selectionstrategies/helpers/omp_solvers.py + Args: + A: design matrix of size (d, n) + b: measurement vector of length d + budget: selection budget + lam: regularization coef. for the final output vector + Returns: + vector of length n + """ + with torch.no_grad(): + d, n = A.shape + if budget <= 0: + budget = 0 + elif budget > n: + budget = n + + x = np.zeros(n, dtype=np.float32) + resid = b.clone() + indices = [] + boolean_mask = torch.ones(n, dtype=bool, device="cuda") + all_idx = torch.arange(n, device="cuda") + + for i in range(budget): + projections = torch.matmul(A.T, resid) + index = torch.argmax(projections[boolean_mask]) + index = all_idx[boolean_mask][index] + + indices.append(index.item()) + boolean_mask[index] = False + + if indices.__len__() == 1: + A_i = A[:, index] + x_i = projections[index] / torch.dot(A_i, A_i).view(-1) + A_i = A[:, index].view(1, -1) + else: + A_i = torch.cat((A_i, A[:, index].view(1, -1)), dim=0) + temp = torch.matmul(A_i, torch.transpose(A_i, 0, 1)) + lam * torch.eye(A_i.shape[0], device="cuda:0") + lstsq_out = torch.linalg.lstsq(temp, torch.matmul(A_i, b).view(-1, 1)) + x_i = torch.cat((lstsq_out.solution, lstsq_out.residuals)) + resid = b - torch.matmul(torch.transpose(A_i, 0, 1), x_i).view(-1) + if budget > 1: + x_i = nnls(temp.cpu().numpy(), torch.matmul(A_i, b).view(-1).cpu().numpy())[0] + x[indices] = x_i + elif budget == 1: + x[indices[0]] = 1.0 + return x + + +def orthogonal_matching_pursuit_np(A, b, budget: int, lam: float = 1.0): + """approximately solves min_x |x|_0 s.t. Ax=b using Orthogonal Matching Pursuit + Acknowledgement to: + https://github.com/krishnatejakk/GradMatch/blob/main/GradMatch/selectionstrategies/helpers/omp_solvers.py + Args: + A: design matrix of size (d, n) + b: measurement vector of length d + budget: selection budget + lam: regularization coef.
for the final output vector + Returns: + vector of length n + """ + d, n = A.shape + if budget <= 0: + budget = 0 + elif budget > n: + budget = n + + x = np.zeros(n, dtype=np.float32) + resid = np.copy(b) + indices = [] + boolean_mask = np.ones(n, dtype=bool) + all_idx = np.arange(n) + + for i in range(budget): + projections = A.T.dot(resid) + index = np.argmax(projections[boolean_mask]) + index = all_idx[boolean_mask][index] + + indices.append(index.item()) + boolean_mask[index] = False + + if indices.__len__() == 1: + A_i = A[:, index] + x_i = projections[index] / A_i.T.dot(A_i) + else: + A_i = np.vstack([A_i, A[:, index]]) + x_i = lstsq(A_i.dot(A_i.T) + lam * np.identity(A_i.shape[0]), A_i.dot(b))[0] + resid = b - A_i.T.dot(x_i) + if budget > 1: + x_i = nnls(A_i.dot(A_i.T) + lam * np.identity(A_i.shape[0]), A_i.dot(b))[0] + x[indices] = x_i + elif budget == 1: + x[indices[0]] = 1.0 + return x diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/shuffling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/shuffling.py new file mode 100644 index 000000000..889e13450 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/shuffling.py @@ -0,0 +1,14 @@ +import random + +import torch + + +def _shuffle_list_and_tensor(samples: list, weights: torch.Tensor) -> tuple[list[int], torch.Tensor]: + num_elements = len(samples) + indices = list(range(num_elements)) + random.shuffle(indices) + + shuffled_samples = [samples[i] for i in indices] + shuffled_weights = weights[indices] + + return shuffled_samples, shuffled_weights diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/submodular_function.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/submodular_function.py new file mode 100644 index 000000000..856aa065b --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/submodular_function.py @@ -0,0 +1,160 @@ +# flake8: noqa +# mypy: ignore-errors + +import numpy as np + +SUBMODULAR_FUNCTIONS = ["FacilityLocation", "GraphCut", "LogDeterminant"] + + +class SubmodularFunction(object): + def __init__(self, index, similarity_kernel=None, similarity_matrix=None, already_selected=[]): + self.index = index + self.n = len(index) + + self.already_selected = already_selected + + assert similarity_kernel is not None or similarity_matrix is not None + + # For the sample similarity matrix, the method supports two input modes, one is to input a pairwise similarity + # matrix for the whole sample, and the other case allows the input of a similarity kernel to be used to + # calculate similarities incrementally at a later time if required. 
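Editorial aside before the constructor body continues below: a hedged sketch of the two input modes the comment above describes, using the `FacilityLocation` subclass defined further down on four toy points.

```python
import numpy as np

from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.cossim import cossim_np
from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.submodular_function import (
    FacilityLocation,
)

points = np.random.rand(4, 3).astype(np.float32)
index = np.arange(4)

# Mode 1: hand over the full pairwise similarity matrix up front.
fl_dense = FacilityLocation(index=index, similarity_matrix=cossim_np(points, points))

# Mode 2: hand over a kernel; similarity columns are computed lazily, the
# first time a gain computation touches them.
fl_lazy = FacilityLocation(index=index, similarity_kernel=lambda a, b: cossim_np(points[a], points[b]))
```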
+ if similarity_kernel is not None: + assert callable(similarity_kernel) + self.similarity_kernel = self._similarity_kernel(similarity_kernel) + else: + assert similarity_matrix.shape[0] == self.n and similarity_matrix.shape[1] == self.n + self.similarity_matrix = similarity_matrix + self.similarity_kernel = lambda a, b: self.similarity_matrix[np.ix_(a, b)] + + def _similarity_kernel(self, similarity_kernel): + return similarity_kernel + + +class FacilityLocation(SubmodularFunction): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if self.already_selected.__len__() == 0: + self.cur_max = np.zeros(self.n, dtype=np.float32) + else: + self.cur_max = np.max(self.similarity_kernel(np.arange(self.n), self.already_selected), axis=1) + + self.all_idx = np.ones(self.n, dtype=bool) + + def _similarity_kernel(self, similarity_kernel): + # Initialize a matrix to store similarity values of sample points. + self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32) + self.if_columns_calculated = np.zeros(self.n, dtype=bool) + + def _func(a, b): + if not np.all(self.if_columns_calculated[b]): + if b.dtype != bool: + temp = ~self.all_idx + temp[b] = True + b = temp + not_calculated = b & ~self.if_columns_calculated + self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated) + self.if_columns_calculated[not_calculated] = True + return self.sim_matrix[np.ix_(a, b)] + + return _func + + def calc_gain(self, idx_gain, selected, **kwargs): + gains = np.maximum(0.0, self.similarity_kernel(self.all_idx, idx_gain) - self.cur_max.reshape(-1, 1)).sum( + axis=0 + ) + return gains + + def calc_gain_batch(self, idx_gain, selected, **kwargs): + batch_idx = ~self.all_idx + batch_idx[0 : kwargs["batch"]] = True + gains = np.maximum( + 0.0, self.similarity_kernel(batch_idx, idx_gain) - self.cur_max[batch_idx].reshape(-1, 1) + ).sum(axis=0) + for i in range(kwargs["batch"], self.n, kwargs["batch"]): + batch_idx = ~self.all_idx + batch_idx[i * kwargs["batch"] : (i + 1) * kwargs["batch"]] = True + gains += np.maximum( + 0.0, self.similarity_kernel(batch_idx, idx_gain) - self.cur_max[batch_idx].reshape(-1, 1) + ).sum(axis=0) + return gains + + def update_state(self, new_selection, total_selected, **kwargs): + self.cur_max = np.maximum(self.cur_max, np.max(self.similarity_kernel(self.all_idx, new_selection), axis=1)) + # self.cur_max = np.max(np.append(self.cur_max.reshape(-1, 1), + # self.similarity_kernel(self.all_idx, new_selection), axis=1), axis=1) + + +class GraphCut(SubmodularFunction): + def __init__(self, lam: float = 1.0, **kwargs): + super().__init__(**kwargs) + self.lam = lam + + if "similarity_matrix" in kwargs: + self.sim_matrix_cols_sum = np.sum(self.similarity_matrix, axis=0) + self.all_idx = np.ones(self.n, dtype=bool) + + def _similarity_kernel(self, similarity_kernel): + # Initialize a matrix to store similarity values of sample points. 
+ self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32) + self.sim_matrix_cols_sum = np.zeros(self.n, dtype=np.float32) + self.if_columns_calculated = np.zeros(self.n, dtype=bool) + + def _func(a, b): + if not np.all(self.if_columns_calculated[b]): + if b.dtype != bool: + temp = ~self.all_idx + temp[b] = True + b = temp + not_calculated = b & ~self.if_columns_calculated + self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated) + self.sim_matrix_cols_sum[not_calculated] = np.sum(self.sim_matrix[:, not_calculated], axis=0) + self.if_columns_calculated[not_calculated] = True + return self.sim_matrix[np.ix_(a, b)] + + return _func + + def calc_gain(self, idx_gain, selected, **kwargs): + gain = ( + -2.0 * np.sum(self.similarity_kernel(selected, idx_gain), axis=0) + + self.lam * self.sim_matrix_cols_sum[idx_gain] + ) + + return gain + + def update_state(self, new_selection, total_selected, **kwargs): + pass + + +class LogDeterminant(SubmodularFunction): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.all_idx = np.ones(self.n, dtype=bool) + + def _similarity_kernel(self, similarity_kernel): + # Initialize a matrix to store similarity values of sample points. + self.sim_matrix = np.zeros([self.n, self.n], dtype=np.float32) + self.if_columns_calculated = np.zeros(self.n, dtype=bool) + + def _func(a, b): + if not np.all(self.if_columns_calculated[b]): + if b.dtype != bool: + temp = ~self.all_idx + temp[b] = True + b = temp + not_calculated = b & ~self.if_columns_calculated + self.sim_matrix[:, not_calculated] = similarity_kernel(self.all_idx, not_calculated) + self.if_columns_calculated[not_calculated] = True + return self.sim_matrix[np.ix_(a, b)] + + return _func + + def calc_gain(self, idx_gain, selected, **kwargs): + # Gain for LogDeterminant can be written as $f(x | A ) = \log\det(S_{a} - S_{a,A}S_{A}^{-1}S_{x,A}^T)$. 
+ sim_idx_gain = self.similarity_kernel(selected, idx_gain).T + sim_selected = self.similarity_kernel(selected, selected) + return (np.dot(sim_idx_gain, np.linalg.pinv(sim_selected)) * sim_idx_gain).sum(-1) + + def update_state(self, new_selection, total_selected, **kwargs): + pass diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/submodular_optimizer.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/submodular_optimizer.py new file mode 100644 index 000000000..b2661061e --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/submodular_optimizer.py @@ -0,0 +1,155 @@ +# flake8: noqa +# mypy: ignore-errors + +import numpy as np + +OPTIMIZER_CHOICES = ["NaiveGreedy", "LazyGreedy", "StochasticGreedy", "ApproximateLazyGreedy"] + + +class optimizer(object): + def __init__(self, args, index, budget: int, already_selected=[]): + self.args = args + self.index = index + + if budget <= 0 or budget > index.__len__(): + raise ValueError("Illegal budget for optimizer.") + + self.n = len(index) + self.budget = budget + self.already_selected = already_selected + + +class NaiveGreedy(optimizer): + def __init__(self, args, index, budget: int, already_selected=[]): + super(NaiveGreedy, self).__init__(args, index, budget, already_selected) + + def select(self, gain_function, update_state=None, **kwargs): + assert callable(gain_function) + if update_state is not None: + assert callable(update_state) + selected = np.zeros(self.n, dtype=bool) + selected[self.already_selected] = True + + greedy_gain = np.zeros(len(self.index)) + for i in range(sum(selected), self.budget): + if self.args.print_freq is not None and i % self.args.print_freq == 0: + print("| Selecting [%3d/%3d]" % (i + 1, self.budget)) + greedy_gain[~selected] = gain_function(~selected, selected, **kwargs) + current_selection = greedy_gain.argmax() + selected[current_selection] = True + greedy_gain[current_selection] = -np.inf + if update_state is not None: + update_state(np.array([current_selection]), selected, **kwargs) + + return self.index[selected] + + +class LazyGreedy(optimizer): + def __init__(self, args, index, budget: int, already_selected=[]): + super(LazyGreedy, self).__init__(args, index, budget, already_selected) + + def select(self, gain_function, update_state=None, **kwargs): + assert callable(gain_function) + if update_state is not None: + assert callable(update_state) + selected = np.zeros(self.n, dtype=bool) + selected[self.already_selected] = True + + greedy_gain = np.zeros(len(self.index)) + greedy_gain[~selected] = gain_function(~selected, selected, **kwargs) + greedy_gain[selected] = -np.inf + + for i in range(sum(selected), self.budget): + if i % self.args.print_freq == 0: + print("| Selecting [%3d/%3d]" % (i + 1, self.budget)) + best_gain = -np.inf + last_max_element = -1 + while True: + cur_max_element = greedy_gain.argmax() + if last_max_element == cur_max_element: + # Select cur_max_element into the current subset + selected[cur_max_element] = True + greedy_gain[cur_max_element] = -np.inf + + if update_state is not None: + update_state(np.array([cur_max_element]), selected, **kwargs) + break + new_gain = gain_function(np.array([cur_max_element]), selected, **kwargs)[0] + greedy_gain[cur_max_element] = new_gain + if new_gain >= best_gain: + best_gain = new_gain + last_max_element = cur_max_element + return self.index[selected] + + +class StochasticGreedy(optimizer): + def __init__(self, args, index, budget: int, 
already_selected=[], epsilon: float = 0.9): + super(StochasticGreedy, self).__init__(args, index, budget, already_selected) + self.epsilon = epsilon + + def select(self, gain_function, update_state=None, **kwargs): + assert callable(gain_function) + if update_state is not None: + assert callable(update_state) + selected = np.zeros(self.n, dtype=bool) + selected[self.already_selected] = True + + sample_size = max(round(-np.log(self.epsilon) * self.n / self.budget), 1) + + greedy_gain = np.zeros(len(self.index)) + all_idx = np.arange(self.n) + for i in range(sum(selected), self.budget): + if i % self.args.print_freq == 0: + print("| Selecting [%3d/%3d]" % (i + 1, self.budget)) + + # Uniformly select a subset from unselected samples with size sample_size + subset = np.random.choice(all_idx[~selected], replace=False, size=min(sample_size, self.n - i)) + + if subset.__len__() == 0: + break + + greedy_gain[subset] = gain_function(subset, selected, **kwargs) + current_selection = greedy_gain[subset].argmax() + selected[subset[current_selection]] = True + greedy_gain[subset[current_selection]] = -np.inf + if update_state is not None: + update_state(np.array([subset[current_selection]]), selected, **kwargs) + return self.index[selected] + + +class ApproximateLazyGreedy(optimizer): + def __init__(self, args, index, budget: int, already_selected=[], beta: float = 0.9): + super(ApproximateLazyGreedy, self).__init__(args, index, budget, already_selected) + self.beta = beta + + def select(self, gain_function, update_state=None, **kwargs): + assert callable(gain_function) + if update_state is not None: + assert callable(update_state) + selected = np.zeros(self.n, dtype=bool) + selected[self.already_selected] = True + + greedy_gain = np.zeros(len(self.index)) + greedy_gain[~selected] = gain_function(~selected, selected, **kwargs) + greedy_gain[selected] = -np.inf + + for i in range(sum(selected), self.budget): + if i % self.args.print_freq == 0: + print("| Selecting [%3d/%3d]" % (i + 1, self.budget)) + while True: + cur_max_element = greedy_gain.argmax() + max_gain = greedy_gain[cur_max_element] + + new_gain = gain_function(np.array([cur_max_element]), selected, **kwargs)[0] + + if new_gain >= self.beta * max_gain: + # Select cur_max_element into the current subset + selected[cur_max_element] = True + greedy_gain[cur_max_element] = -np.inf + + if update_state is not None: + update_state(np.array([cur_max_element]), selected, **kwargs) + break + else: + greedy_gain[cur_max_element] = new_gain + return self.index[selected] diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py new file mode 100644 index 000000000..3574c2139 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_craig_downsampling.py @@ -0,0 +1,201 @@ +from argparse import Namespace +from typing import Any, Optional + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_per_label_remote_downsample_strategy import ( + AbstractPerLabelRemoteDownsamplingStrategy, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils import submodular_optimizer +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.euclidean import euclidean_dist_pair_np +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.shuffling import _shuffle_list_and_tensor +from 
modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.submodular_function import ( + FacilityLocation, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.submodular_optimizer import ( + OPTIMIZER_CHOICES, +) + + +class RemoteCraigDownsamplingStrategy(AbstractPerLabelRemoteDownsamplingStrategy): + """ + Strategy introduced in: + Coresets for Data-efficient Training of Machine Learning Models (Mirzasoleiman et al.) + Implementation adapted from: + DEEPCORE https://raw.githubusercontent.com/PatrickZH/DeepCore/main/deepcore/methods/craig.py + This strategy selects points via submodular maximization of a per-class FacilityLocation function, where the + pairwise similarity is derived from the (negated and shifted) Euclidean distance between the two samples' + last-layer gradients. + """ + + def __init__( + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, + ) -> None: + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device) + + self.criterion = per_sample_loss + + self.selection_batch = params_from_selector["selection_batch"] + self.greedy = params_from_selector["greedy"] + if self.greedy not in OPTIMIZER_CHOICES: + raise ValueError( + f"The required Greedy optimizer is not available. Pick one of the following: {OPTIMIZER_CHOICES}" + ) + + # This class uses the embedding recorder + self.requires_coreset_supporting_module = True + + # Samples are supplied label by label (this class is an instance of AbstractPerLabelRemoteDownsamplingStrategy). + # The following list keeps the gradients of the current label. When all the samples belonging to the current + # label have been seen, the scores are computed and the list is emptied + self.current_class_gradients: list[torch.Tensor] = [] + # distance_matrix[i,j] = 0 if label[i]!=label[j]; otherwise it is proportional to the Euclidean distance + # between the two samples in the gradient space + self.distance_matrix = np.zeros([0, 0]) + + # if true, the downsampling is balanced across classes, e.g., with class sizes [10, 50, 30], 50% downsampling + # yields the downsampled class sizes [5, 25, 15], while without balancing something like [0, 45, 0] can + # happen + self.balance = params_from_selector["balance"] + if self.balance: + self.already_selected_samples: list[int] = [] + self.already_selected_weights = torch.tensor([]).float() + + def inform_samples( + self, + sample_ids: list[int], + forward_output: torch.Tensor, + target: torch.Tensor, + embedding: Optional[torch.Tensor] = None, + ) -> None: + assert embedding is not None + + # Slightly different implementation for BTS and STB since in STB points are supplied class by class while in + # BTS they are not. STB always uses the first branch; BTS typically uses the second (it may use the first if + # all the points in the batch belong to the same class). + # CRAIG takes the class into account when selecting points. This is not a problem in STB since a dedicated + # dataloader supplies the samples as desired. In BTS it is different because a single batch can contain + # samples from different classes. If this happens, the algorithm receives the samples class by class, + # emulating what happens in STB.
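Editorial aside before the method body continues below: like `AbstractMatrixDownsamplingStrategy`, this class approximates each sample's gradient by the last layer only. A hedged numerical check of the identity behind `_inform_samples_single_class`, assuming a cross-entropy-style per-sample loss; all names are local to the sketch.

```python
import torch

# For cross-entropy, dL/d(logits) = (softmax(logits) - one_hot(target)) / batch
# (the 1/batch comes from the mean reduction). The last linear layer's weight
# gradient is then the outer product of this term with the embedding, which is
# exactly what the view/repeat construction in the code computes.
torch.manual_seed(0)
batch, emb_dim, num_classes = 4, 5, 3
embedding = torch.randn(batch, emb_dim)
logits = torch.randn(batch, num_classes, requires_grad=True)
target = torch.randint(0, num_classes, (batch,))

loss = torch.nn.functional.cross_entropy(logits, target, reduction="mean")
bias_grads = torch.autograd.grad(loss, logits)[0]                # (batch, num_classes)
weight_grads = embedding.unsqueeze(1) * bias_grads.unsqueeze(2)  # (batch, num_classes, emb_dim)

closed_form = (torch.softmax(logits, dim=1) - torch.nn.functional.one_hot(target, num_classes)) / batch
assert torch.allclose(bias_grads, closed_form.detach(), atol=1e-6)
```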
+ + different_targets_in_this_batch = target.unique() + if len(different_targets_in_this_batch) == 1: + self._inform_samples_single_class(sample_ids, forward_output, target, embedding) + else: + for current_target in different_targets_in_this_batch: + mask = target == current_target + this_target_sample_ids = [sample_ids[i] for i, keep in enumerate(mask) if keep] + self._inform_samples_single_class( + this_target_sample_ids, forward_output[mask], target[mask], embedding[mask] + ) + self.inform_end_of_current_label() + + def _inform_samples_single_class( + self, sample_ids: list[int], forward_output: torch.Tensor, target: torch.Tensor, embedding: torch.Tensor + ) -> None: + embedding_dim = embedding.shape[1] + num_classes = forward_output.shape[1] + batch_num = target.shape[0] + + loss = self.criterion(forward_output, target).mean() + + # compute the gradient for each element provided + with torch.no_grad(): + bias_parameters_grads = torch.autograd.grad(loss, forward_output)[0] + weight_parameters_grads = embedding.view(batch_num, 1, embedding_dim).repeat( + 1, num_classes, 1 + ) * bias_parameters_grads.view(batch_num, num_classes, 1).repeat(1, 1, embedding_dim) + + # store the computed gradients + self.current_class_gradients.append( + torch.cat([bias_parameters_grads, weight_parameters_grads.flatten(1)], dim=1).cpu().numpy() + ) + + # keep the mapping index<->sample_id + self.index_sampleid_map += sample_ids + + def add_to_distance_matrix(self, submatrix: np.ndarray) -> None: + # compute the new size of the matrix + current_size = self.distance_matrix.shape[0] + new_size = current_size + submatrix.shape[0] + + # copy the old matrix into the new one + new_matrix = np.zeros([new_size, new_size]) + new_matrix[:current_size, :current_size] = self.distance_matrix + + # add the new submatrix + new_matrix[ + current_size:, + current_size:, + ] = submatrix + + self.distance_matrix = new_matrix + + def inform_end_of_current_label(self) -> None: + if len(self.current_class_gradients) == 0: + # no new gradients, just return + return + # compute the scores for each pair of samples belonging to the current class + gradients = np.concatenate(self.current_class_gradients) + matrix = -1.0 * euclidean_dist_pair_np(gradients) + matrix -= np.min(matrix) - 1e-3 + # store the result in the matrix + self.add_to_distance_matrix(matrix) + # empty the gradients list + self.current_class_gradients = [] + + if self.balance: + # here we select the points if we want to keep a balance across classes + this_class_samples, this_class_weights = self._select_points_from_distance_matrix() + self.already_selected_samples += this_class_samples + self.already_selected_weights = torch.cat((self.already_selected_weights, this_class_weights)) + self.distance_matrix = np.zeros([0, 0]) + self.index_sampleid_map: list[int] = [] + + def calc_weights(self, matrix: np.ndarray, result: np.ndarray) -> torch.Tensor: + min_sample = np.argmax(matrix[result], axis=0) + weights = np.ones(np.sum(result) if result.dtype == bool else len(result)) + for i in min_sample: + weights[i] = weights[i] + 1 + return torch.tensor(weights).float() + + def select_points(self) -> tuple[list[int], torch.Tensor]: + if self.balance: + # return a shuffled version, otherwise training happens class by class + return _shuffle_list_and_tensor(self.already_selected_samples, self.already_selected_weights) + + if len(self.current_class_gradients) != 0: + # conclude the last class if there are still samples + self.inform_end_of_current_label() + return 
self._select_points_from_distance_matrix() + + def _select_points_from_distance_matrix(self) -> tuple[list[int], torch.Tensor]: + number_of_samples = self.distance_matrix.shape[0] + target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1) + + all_index = np.arange(number_of_samples) + submod_function = FacilityLocation(index=all_index, similarity_matrix=self.distance_matrix) + submod_optimizer = submodular_optimizer.__dict__[self.greedy]( + args=Namespace(print_freq=None), index=all_index, budget=target_size + ) + selection_result = submod_optimizer.select( + gain_function=submod_function.calc_gain_batch, + update_state=submod_function.update_state, + batch=self.selection_batch, + ) + weights = self.calc_weights(self.distance_matrix, selection_result) + selected_ids = [self.index_sampleid_map[sample] for sample in selection_result] + return selected_ids, weights + + def init_downsampler(self) -> None: + self.index_sampleid_map = [] + self.current_class_gradients = [] + self.distance_matrix = np.zeros([0, 0]) + if self.balance: + self.already_selected_samples = [] + self.already_selected_weights = torch.tensor([]).float() diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_grad_match_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_grad_match_downsampling_strategy.py new file mode 100644 index 000000000..64c0983a9 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_grad_match_downsampling_strategy.py @@ -0,0 +1,53 @@ +from typing import Any + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_matrix_downsampling_strategy import ( + AbstractMatrixDownsamplingStrategy, + MatrixContent, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.orthogonal_matching_pursuit import ( + orthogonal_matching_pursuit, + orthogonal_matching_pursuit_np, +) + + +class RemoteGradMatchDownsamplingStrategy(AbstractMatrixDownsamplingStrategy): + """ + Strategy introduced in: + GRAD-MATCH: Gradient Matching based Data Subset Selection for Efficient Deep Model Training (Killamsetty et al.) + Implementation adapted from: + DEEPCORE https://raw.githubusercontent.com/PatrickZH/DeepCore/main/deepcore/methods/gradmatch.py + This strategy collects the gradients (leveraging the abstract class AbstractMatrixDownsamplingStrategy) and then + selects the samples using Orthogonal Matching Pursuit (OMP). The goal is to find a sparse vector x such that + Ax = b, where A is the matrix of last-layer gradients (one column per sample) and b is their mean. + Note that DEEPCORE proposes two versions, one of which requires a validation dataset; in that version, b is the + mean of the gradients over the validation dataset. Since no validation dataset is available in Modyn, that + version is not implemented.
+ """ + + def __init__( + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, + ): + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, per_sample_loss, device) + self.matrix_content = MatrixContent.GRADIENTS + + def _select_indexes_from_matrix(self, matrix: np.ndarray, target_size: int) -> tuple[list[int], torch.Tensor]: + cur_val_gradients = np.mean(matrix, axis=0) + + if self.device == "cpu": + cur_weights = orthogonal_matching_pursuit_np(matrix.T, cur_val_gradients, budget=target_size) + else: + cur_weights = orthogonal_matching_pursuit( + torch.Tensor(matrix).T, torch.Tensor(cur_val_gradients), budget=target_size + ) + selection_result = np.nonzero(cur_weights)[0] + weights = torch.tensor(cur_weights[selection_result]) + + return selection_result.tolist(), weights diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py index cef9e8aff..303bddf78 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_gradnorm_downsampling.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional import torch from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_remote_downsampling_strategy import ( @@ -16,9 +16,15 @@ class RemoteGradNormDownsampling(AbstractRemoteDownsamplingStrategy): """ def __init__( - self, pipeline_id: int, trigger_id: int, batch_size: int, params_from_selector: dict, per_sample_loss: Any + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, ) -> None: - super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector) + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device) self.per_sample_loss_fct = per_sample_loss @@ -33,28 +39,39 @@ def get_scores(self, forward_output: torch.Tensor, target: torch.Tensor) -> torc # softmax to the forward output to obtain the probabilities probs = torch.nn.functional.softmax(forward_output, dim=1) num_classes = forward_output.shape[-1] - one_hot_targets = torch.nn.functional.one_hot(target, num_classes=num_classes) + + # Pylint complains torch.nn.functional.one_hot is not callable for whatever reason + one_hot_targets = torch.nn.functional.one_hot( # pylint: disable=not-callable + target, num_classes=num_classes + ) scores = torch.norm(probs - one_hot_targets, dim=-1) else: sample_losses = self.per_sample_loss_fct(forward_output, target) last_layer_gradients = torch.autograd.grad(sample_losses.sum(), forward_output, retain_graph=False)[0] scores = torch.norm(last_layer_gradients, dim=-1) - return scores + return scores.cpu() def init_downsampler(self) -> None: self.probabilities = [] self.index_sampleid_map: list[int] = [] self.number_of_points_seen = 0 - def inform_samples(self, sample_ids: list[int], forward_output: torch.Tensor, target: torch.Tensor) -> None: + def inform_samples( + self, + sample_ids: list[int], + forward_output: torch.Tensor, + target: torch.Tensor, + embedding: Optional[torch.Tensor] = None, + ) -> None: scores = self.get_scores(forward_output, target) self.probabilities.append(scores) self.number_of_points_seen += forward_output.shape[0] self.index_sampleid_map += sample_ids def select_points(self) -> 
tuple[list[int], torch.Tensor]: - target_size = int(self.downsampling_ratio * self.number_of_points_seen / 100) + # select always at least 1 point + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) probabilities = torch.cat(self.probabilities, dim=0) probabilities = probabilities / probabilities.sum() diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_kcenter_greedy_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_kcenter_greedy_downsampling_strategy.py new file mode 100644 index 000000000..c174fdce8 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_kcenter_greedy_downsampling_strategy.py @@ -0,0 +1,47 @@ +from typing import Any + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_matrix_downsampling_strategy import ( + AbstractMatrixDownsamplingStrategy, + MatrixContent, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.euclidean import euclidean_dist +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.k_center_greedy import k_center_greedy + + +class RemoteKcenterGreedyDownsamplingStrategy(AbstractMatrixDownsamplingStrategy): + """ + Strategy introduced in: + Active learning for convolutional neural networks: A coreset approach (Sener and Savarese) + Implementation adapted from: + DEEPCORE https://raw.githubusercontent.com/PatrickZH/DeepCore/main/deepcore/methods/kcentergreedy.py + This strategy collects the Embeddings (leveraging the abstract class AbstractMatrixDownsamplingStrategy) + and then selects the samples by clustering them in the embedding space. The clustering algorithm is k-center. + This strategy was introduced first for active learning to discover which points are worth receiving a label. + Hence, this class does not compute weights and returns a tensor of ones instead. 
+ """ + + def __init__( + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, + ): + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, per_sample_loss, device) + + self.matrix_content = MatrixContent.EMBEDDINGS + self.metric = euclidean_dist + + def _select_indexes_from_matrix(self, matrix: np.ndarray, target_size: int) -> tuple[list[int], torch.Tensor]: + selected_indices = k_center_greedy( + matrix, budget=target_size, metric=self.metric, device=self.device, print_freq=None + ) + + # no weights are returned by this technique + selected_weights = torch.ones((len(selected_indices))) + + return selected_indices, selected_weights diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py index 60e8b30db..9a0581605 100644 --- a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_loss_downsampling.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional import torch from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_remote_downsampling_strategy import ( @@ -7,10 +7,23 @@ class RemoteLossDownsampling(AbstractRemoteDownsamplingStrategy): + """ + Method inspired by + Not All Samples Are Created Equal: Deep Learning with Importance Sampling (Katharopoulos, Fleuret) + Instead of computing the last layer gradient (as GradNorm does), here, the selection proxy is the loss. Hence, + a higher loss means a higher probability of being selected. This version is cheaper but less accurate. + """ + def __init__( - self, pipeline_id: int, trigger_id: int, batch_size: int, params_from_selector: dict, per_sample_loss: Any + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, ) -> None: - super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector) + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device) self.per_sample_loss_fct = per_sample_loss self.probabilities: list[torch.Tensor] = [] @@ -25,14 +38,21 @@ def init_downsampler(self) -> None: self.index_sampleid_map: list[int] = [] self.number_of_points_seen = 0 - def inform_samples(self, sample_ids: list[int], forward_output: torch.Tensor, target: torch.Tensor) -> None: + def inform_samples( + self, + sample_ids: list[int], + forward_output: torch.Tensor, + target: torch.Tensor, + embedding: Optional[torch.Tensor] = None, + ) -> None: scores = self.get_scores(forward_output, target) self.probabilities.append(scores) self.number_of_points_seen += forward_output.shape[0] self.index_sampleid_map += sample_ids def select_points(self) -> tuple[list[int], torch.Tensor]: - target_size = int(self.downsampling_ratio * self.number_of_points_seen / 100) + # select always at least 1 point + target_size = max(int(self.downsampling_ratio * self.number_of_points_seen / 100), 1) probabilities = torch.cat(self.probabilities, dim=0) probabilities = probabilities / probabilities.sum() diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_submodular_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_submodular_downsampling_strategy.py new file mode 100644 index 000000000..12b254b91 --- /dev/null +++ 
b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_submodular_downsampling_strategy.py @@ -0,0 +1,89 @@ +from argparse import Namespace +from typing import Any + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_matrix_downsampling_strategy import ( + AbstractMatrixDownsamplingStrategy, + MatrixContent, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils import ( + submodular_function, + submodular_optimizer, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.cossim import cossim_np +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.submodular_function import ( + SUBMODULAR_FUNCTIONS, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.submodular_optimizer import ( + OPTIMIZER_CHOICES, +) + + +class RemoteSubmodularDownsamplingStrategy(AbstractMatrixDownsamplingStrategy): + """ + Strategy introduced in: + Submodular combinatorial information measures with applications in machine learning (Iyer et al.) + Implementation adapted from: + DEEPCORE https://raw.githubusercontent.com/PatrickZH/DeepCore/main/deepcore/methods/submodular.py + This strategy collects the last layer gradients (leveraging the class AbstractMatrixDownsamplingStrategy) + and then selects the samples by mapping the problem to submodular optimization. The user can select the + submodular function (available: FacilityLocation, GraphCut and LogDeterminant) and the greedy optimizer used to + maximize it. + The similarity between points is measured using the scaled and shifted cosine similarity between the last layer + gradients. The shift is needed since some submodular functions (e.g. GraphCut) require non-negative values. + """ + + def __init__( + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, + ): + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, per_sample_loss, device) + self.matrix_content = MatrixContent.GRADIENTS + + self.selection_batch = params_from_selector["selection_batch"] + + if "submodular_function" not in params_from_selector: + raise ValueError( + f"Please specify the submodular function used to select the datapoints. " + f"Available functions: {SUBMODULAR_FUNCTIONS}. " + f"Use the parameter submodular_function." + ) + self._function = params_from_selector["submodular_function"] + if self._function not in SUBMODULAR_FUNCTIONS: + raise ValueError( + f"The specified submodular function is not available. " + f"Pick one from {SUBMODULAR_FUNCTIONS}. " + f"Use the parameter submodular_function." + ) + + self._greedy = params_from_selector.get("submodular_optimizer", "NaiveGreedy") + if self._greedy not in OPTIMIZER_CHOICES: + raise ValueError( + f"The required Greedy optimizer is not available. 
" + f"Pick one of the following: {OPTIMIZER_CHOICES}" + f"Use the parameter param submodular_optimizer" + ) + + def _select_indexes_from_matrix(self, matrix: np.ndarray, target_size: int) -> tuple[list[int], torch.Tensor]: + number_of_samples = len(matrix) + all_index = np.arange(number_of_samples) + submod_function = submodular_function.__dict__[self._function]( + index=all_index, similarity_kernel=lambda a, b: cossim_np(matrix[a], matrix[b]) + ) + submod_optimizer = submodular_optimizer.__dict__[self._greedy]( + args=Namespace(print_freq=None), index=all_index, budget=target_size + ) + selection_result = submod_optimizer.select( + gain_function=submod_function.calc_gain, + update_state=submod_function.update_state, + batch=self.selection_batch, + ) + + # no weights are computed with this strategy + weights = torch.ones(len(selection_result)) + return selection_result, weights diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py new file mode 100644 index 000000000..28fbf5df5 --- /dev/null +++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py @@ -0,0 +1,122 @@ +from typing import Any, Optional + +import numpy as np +import torch +from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_per_label_remote_downsample_strategy import ( + AbstractPerLabelRemoteDownsamplingStrategy, +) +from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.shuffling import _shuffle_list_and_tensor + + +class RemoteUncertaintyDownsamplingStrategy(AbstractPerLabelRemoteDownsamplingStrategy): + """ + Strategy introduced in: + Selection via Proxy: Efficient Data Selection for Deep Learning + Implementation adapted from: + DEEPCORE https://raw.githubusercontent.com/PatrickZH/DeepCore/main/deepcore/methods/uncertainty.py + This strategy collects a measure of uncertainty (LeastConfidence, Entropy or Margin) for each sample and selects + the top-k most uncertain samples. + The user can specify which metric to use with the pipeline parameter score_metric. + """ + + def __init__( + self, + pipeline_id: int, + trigger_id: int, + batch_size: int, + params_from_selector: dict, + per_sample_loss: Any, + device: str, + ): + super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device) + + self.criterion = per_sample_loss + self.scores = np.array([]) + + if "score_metric" not in params_from_selector: + raise ValueError( + "Please provide a way to score uncertainty. Options available: LeastConfidence, Entropy, Margin" + ) + self.score_metric = params_from_selector["score_metric"] + + # if true, the downsampling is balanced across classes ex class sizes = [10, 50, 30] and 50% downsampling + # yields the following downsampled class sizes [5, 25, 15] while without balance something like [0, 45, 0] can + # happen + self.balance = params_from_selector["balance"] + if self.balance: + # the selection happens class by class. 
diff --git a/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py
new file mode 100644
index 000000000..28fbf5df5
--- /dev/null
+++ b/modyn/trainer_server/internal/trainer/remote_downsamplers/remote_uncertainty_downsampling_strategy.py
@@ -0,0 +1,122 @@
+from typing import Any, Optional
+
+import numpy as np
+import torch
+from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_per_label_remote_downsample_strategy import (
+    AbstractPerLabelRemoteDownsamplingStrategy,
+)
+from modyn.trainer_server.internal.trainer.remote_downsamplers.deepcore_utils.shuffling import _shuffle_list_and_tensor
+
+
+class RemoteUncertaintyDownsamplingStrategy(AbstractPerLabelRemoteDownsamplingStrategy):
+    """
+    Strategy introduced in:
+    Selection via Proxy: Efficient Data Selection for Deep Learning (Coleman et al.)
+    Implementation adapted from:
+    DEEPCORE https://raw.githubusercontent.com/PatrickZH/DeepCore/main/deepcore/methods/uncertainty.py
+    This strategy collects a measure of uncertainty (LeastConfidence, Entropy or Margin) for each sample and
+    selects the top-k most uncertain samples.
+    The user can specify which metric to use with the pipeline parameter score_metric.
+    """
+
+    def __init__(
+        self,
+        pipeline_id: int,
+        trigger_id: int,
+        batch_size: int,
+        params_from_selector: dict,
+        per_sample_loss: Any,
+        device: str,
+    ):
+        super().__init__(pipeline_id, trigger_id, batch_size, params_from_selector, device)
+
+        self.criterion = per_sample_loss
+        self.scores = np.array([])
+
+        if "score_metric" not in params_from_selector:
+            raise ValueError(
+                "Please provide a way to score uncertainty. Available options: LeastConfidence, Entropy, Margin"
+            )
+        self.score_metric = params_from_selector["score_metric"]
+
+        # If True, downsampling is balanced across classes: e.g., class sizes [10, 50, 30] with 50% downsampling
+        # yield downsampled class sizes [5, 25, 15], whereas without balancing something like [0, 45, 0] can
+        # happen.
+        self.balance = params_from_selector["balance"]
+        if self.balance:
+            # The selection happens class by class, so these data structures store the running selection.
+            self.already_selected_ids: list[int] = []
+            self.already_selected_weights = torch.tensor([]).float()
+            self.requires_data_label_by_label = True
+        else:
+            self.requires_data_label_by_label = False
+
+    def inform_samples(
+        self,
+        sample_ids: list[int],
+        forward_output: torch.Tensor,
+        target: torch.Tensor,
+        embedding: Optional[torch.Tensor] = None,
+    ) -> None:
+        assert embedding is None
+
+        self.scores = np.append(self.scores, self._compute_score(forward_output.detach()))
+        # keep the mapping index<->sample_id
+        self.index_sampleid_map += sample_ids
+
+    def _compute_score(self, forward_output: torch.Tensor) -> np.ndarray:
+        if self.score_metric == "LeastConfidence":
+            scores = forward_output.max(dim=1).values.cpu().numpy()
+        elif self.score_metric == "Entropy":
+            preds = torch.nn.functional.softmax(forward_output, dim=1).cpu().numpy()
+            scores = (np.log(preds + 1e-6) * preds).sum(axis=1)
+        elif self.score_metric == "Margin":
+            preds = torch.nn.functional.softmax(forward_output, dim=1)
+            preds_argmax = torch.argmax(preds, dim=1)
+            max_preds = preds[torch.arange(preds.shape[0]), preds_argmax].clone()
+            preds[torch.arange(preds.shape[0]), preds_argmax] = -1.0
+            preds_sub_argmax = torch.argmax(preds, dim=1)
+            scores = (max_preds - preds[torch.arange(preds.shape[0]), preds_sub_argmax]).cpu().numpy()
+        else:
+            raise AssertionError("The requested metric does not exist")
+
+        return scores
+
+    def inform_end_of_current_label(self) -> None:
+        """
+        Per-class selection: the top-k% samples of the current class are selected and kept in
+        already_selected_ids.
+        """
+        assert self.balance
+        # select the samples
+        selected_samples, selected_weights = self._select_from_scores()
+        # save the selected sample IDs and weights
+        self.already_selected_ids += selected_samples
+        self.already_selected_weights = torch.cat((self.already_selected_weights, selected_weights))
+        # clean the data structures for the following class
+        self.scores = np.array([])
+        self.index_sampleid_map: list[int] = []
+
+    def select_points(self) -> tuple[list[int], torch.Tensor]:
+        if self.balance:
+            # the selection has already been done for each class; just return the concatenation of the selections
+            ids, weights = self.already_selected_ids, self.already_selected_weights
+        else:
+            # select the sample IDs and compute the weights
+            ids, weights = self._select_from_scores()
+        return _shuffle_list_and_tensor(ids, weights)
+
+    def _select_from_scores(self) -> tuple[list[int], torch.Tensor]:
+        number_of_samples = len(self.scores)
+        target_size = max(int(self.downsampling_ratio * number_of_samples / 100), 1)
+        selected_indices, weights = self._select_indexes_from_scores(target_size)
+        selected_ids = [self.index_sampleid_map[index] for index in selected_indices]
+        return selected_ids, weights
+
+    def init_downsampler(self) -> None:
+        self.scores = np.array([])
+        self.index_sampleid_map = []
+
+    def _select_indexes_from_scores(self, target_size: int) -> tuple[list[int], torch.Tensor]:
+        # ascending argsort: for all three metrics, a lower score means higher uncertainty
+        return np.argsort(self.scores)[:target_size], torch.ones(target_size).float()
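A toy illustration (not part of the patch) of why an ascending sort works for all three proxies, using hypothetical logits for one confident and one uncertain sample:

    import torch

    logits = torch.tensor([[4.0, 0.1, 0.2],   # confident sample
                           [1.0, 0.9, 0.8]])  # uncertain sample
    probs = torch.softmax(logits, dim=1)

    least_confidence = logits.max(dim=1).values             # lower max -> more uncertain
    neg_entropy = (probs * torch.log(probs + 1e-6)).sum(1)  # more negative -> higher entropy
    top2 = probs.topk(2, dim=1).values
    margin = top2[:, 0] - top2[:, 1]                        # smaller margin -> more uncertain

In all three cases the uncertain sample gets the smaller score, so taking the first target_size entries of the ascending argsort yields the most uncertain samples.
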
diff --git a/modyn/trainer_server/internal/utils/training_info.py b/modyn/trainer_server/internal/utils/training_info.py
index 8fb7b30c6..41465f691 100644
--- a/modyn/trainer_server/internal/utils/training_info.py
+++ b/modyn/trainer_server/internal/utils/training_info.py
@@ -27,6 +27,8 @@ def __init__(
         self.pipeline_id = request.pipeline_id
         self.trigger_id = request.trigger_id
         self.training_id = training_id
+        self.num_prefetched_partitions = request.num_prefetched_partitions
+        self.parallel_prefetch_requests = request.parallel_prefetch_requests
 
         self.dataset_id = request.data_info.dataset_id
         self.num_dataloaders = request.data_info.num_dataloaders
diff --git a/plotting/common/common.py b/plotting/common/common.py
new file mode 100644
index 000000000..01a48ec14
--- /dev/null
+++ b/plotting/common/common.py
@@ -0,0 +1,151 @@
+# Credits to Lawrence Benson (https://github.com/hpides/perma-bench/tree/eval/scripts)
+
+import json
+import os
+import sys
+
+import matplotlib
+import matplotlib.pyplot as plt
+
+#######################################
+# Plotting
+#######################################
+
+FS = 20
+MILLION = 1_000_000
+SINGLE_FIG_WIDTH = 5
+SINGLE_FIG_HEIGHT = 3.5
+SINGLE_FIG_SIZE = (SINGLE_FIG_WIDTH, SINGLE_FIG_HEIGHT)
+DOUBLE_FIG_WIDTH = 10
+DOUBLE_FIG_HEIGHT = 3.5
+DOUBLE_FIG_SIZE = (DOUBLE_FIG_WIDTH, DOUBLE_FIG_HEIGHT)
+PLOT_PATHS = []
+IMG_TYPES = ['.png']  # add .svg here to generate svg
+
+PIPELINE_COLOR = {
+    'models_exp0_finetune': '#a1dab4',
+    'retrain_noreset': '#378d54',
+    'apache-512': '#41b6c4',
+    'barlow-256': '#2c7fb8',
+    'barlow-512': '#2c7fb8',
+    'z-barlow-dram': '#253494',
+    'z-apache-dram': '#0c1652',
+}
+
+PIPELINE_MARKER = {
+    'models_exp0_finetune': 'P',
+    'retrain_noreset': 'o',
+    'apache-512': 'd',
+    'barlow-256': 's',
+    'barlow-512': '.',
+    'z-apache-dram': 'x',
+    'z-barlow-dram': '^',
+}
+
+PIPELINE_HATCH = {
+    'models_exp0_finetune': '\\\\',
+    'retrain_noreset': '//',
+    'apache-512': '\\',
+    'barlow-256': '/',
+    'barlow-512': '.',
+    'z-apache-dram': '.',
+    'z-barlow-dram': 'x',
+}
+
+PIPELINE_NAME = {
+    'models_exp0_finetune': 'Finetuning',
+    'retrain_noreset': 'Retrain',
+    'apache-512': 'A-512',
+    'barlow-256': 'B-256',
+    'barlow-512': 'B-256-PF',
+    'z-apache-dram': 'A-D',
+    'z-barlow-dram': 'B-D',
+}
+
+
+def INIT_PLOT():
+    matplotlib.rcParams.update({
+        'font.size': FS,
+        'svg.fonttype': 'none',
+    })
+
+
+def PRINT_PLOT_PATHS():
+    print(f"To view new plots, run:\n\topen {' '.join(PLOT_PATHS)}")
+
+def BAR(system):
+    return {
+        "color": 'white',
+        "edgecolor": PIPELINE_COLOR[system],
+        "hatch": PIPELINE_HATCH[system],
+        "lw": 3
+    }
+
+def LINE(system):
+    return {
+        "lw": 4,
+        "ms": 10,
+        "color": PIPELINE_COLOR[system],
+        "marker": PIPELINE_MARKER[system],
+        "markeredgewidth": 1,
+        "markeredgecolor": 'black',
+    }
+
+def BAR_X_TICKS_POS(bar_width, num_bars, num_xticks):
+    return [i - (bar_width / 2) + ((num_bars * bar_width) / 2) for i in range(num_xticks)]
+
+def RESIZE_TICKS(ax, x=FS, y=FS):
+    for tick in ax.xaxis.get_major_ticks():
+        tick.label.set_fontsize(x)
+    for tick in ax.yaxis.get_major_ticks():
+        tick.label.set_fontsize(y)
+
+def HATCH_WIDTH(width=4):
+    matplotlib.rcParams['hatch.linewidth'] = width
+
+def Y_GRID(ax):
+    ax.grid(axis='y', which='major')
+    ax.set_axisbelow(True)
+
+def HIDE_BORDERS(ax, show_left=False):
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['bottom'].set_visible(True)
+    ax.spines['left'].set_visible(show_left)
+
+def FIG_LEGEND(fig):
+    fig.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=6,
+               frameon=False, columnspacing=1, handletextpad=0.3
+               #, borderpad=0.1, labelspacing=0.1, handlelength=1.8
+               )
+    fig.tight_layout()
+
+
+def LOAD_DATA(path):
+    with open(path) as json_file:
+        return json.load(json_file)
+
+def SAVE_PLOT(plot_path, img_types=None):
+    if img_types is None:
+        img_types = IMG_TYPES
+
+    for img_type in img_types:
f"{plot_path}{img_type}" + PLOT_PATHS.append(img_path) + plt.savefig(img_path, bbox_inches='tight', dpi=300) + + plt.figure() + + +def INIT(args): + if len(args) != 3: + sys.exit("Need /path/to/results /path/to/plots") + + result_path = args[1] + plot_dir = args[2] + + os.makedirs(plot_dir, exist_ok=True) + INIT_PLOT() + + return result_path, plot_dir \ No newline at end of file diff --git a/plotting/system/avg_max_med_batch.py b/plotting/system/avg_max_med_batch.py new file mode 100644 index 000000000..c05b1fb2f --- /dev/null +++ b/plotting/system/avg_max_med_batch.py @@ -0,0 +1,90 @@ +import glob +import sys + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from plotting.common.common import * + + +def plot_baravg(pipeline_log, ax, trigger): + data = [] + + bar_labels = dict() + + for pipeline in pipeline_log: + + relevant_data = pipeline["supervisor"]["triggers"][trigger]["trainer_log"]["epochs"][0] + meta_data = pipeline["configuration"]["pipeline_config"]["training"] + + max_fb = relevant_data["MaxFetchBatch"] / 1000 + avg_fb = relevant_data["AvgFetchBatch"] / 1000 + + total_fb = relevant_data["TotalFetchBatch"] / 1000 + total_train = pipeline["supervisor"]["triggers"][trigger]["trainer_log"]["total_train"] / 1000 + + x = f"{meta_data['dataloader_workers']}/{meta_data['num_prefetched_partitions']}/{meta_data['parallel_prefetch_requests']}" + + percentage = round((total_fb / total_train) * 100,1) + bar_labels[x] = f"{int(total_fb)} ({percentage}%)\n" + + data.append([x, avg_fb, max_fb]) + + data_df = pd.DataFrame(data, columns=["x", "Avg", "Max"]) + test_data_melted = data_df.melt(id_vars="x", value_name = "time", var_name="measure") + + mask = test_data_melted.measure.isin(['Max']) + scale = test_data_melted[~mask].time.mean()/ test_data_melted[mask].time.mean() + test_data_melted.loc[mask, 'time'] = test_data_melted.loc[mask, 'time']*scale + + sns.barplot(data=test_data_melted, x="x", y="time", hue="measure", ax=ax) + bar_label_list = [bar_labels[x._text] for x in ax.get_xticklabels()] + ax.bar_label(ax.containers[0], labels=bar_label_list, size=11) + + ax.set_xlabel("Workers / Prefetched Partitions / Parallel Requests") + ax.tick_params(axis='x', which='major', labelsize=14) + ax.set_ylabel("Avg") + ax2 = ax.twinx() + + ax2.set_ylim(ax.get_ylim()) + ax2.set_yticklabels(np.round(ax.get_yticks()/scale,1)) + ax2.set_ylabel('Max') + ax.get_legend().set_visible(False) + + #ax.set_xticks(list(x)) + #ax.set_xticklabels([f"{idx + 1}" for idx, _ in enumerate(x)]) + #ax.set_xlabel("Waiting time for next batch (seconds)") + + #ax.set_ylabel("Count") + + ax.set_title("Average and Max Time per Batch") + +def load_all_pipelines(data_path): + all_data = [] + + for filename in glob.iglob(data_path + '/**/*.log', recursive=True): + data = LOAD_DATA(filename) + all_data.append(data) + + return all_data + +if __name__ == '__main__': + # Idee: Selber plot mit TotalTrain und anteil fetch batch an total train + + data_path, plot_dir = INIT(sys.argv) + data = load_all_pipelines(data_path) + fig, ax = plt.subplots(1,1, figsize=DOUBLE_FIG_SIZE) + + plot_baravg(data, ax, "0") + + + HATCH_WIDTH() + FIG_LEGEND(fig) + + Y_GRID(ax) + HIDE_BORDERS(ax) + + plot_path = os.path.join(plot_dir, "avg_max") + SAVE_PLOT(plot_path) + PRINT_PLOT_PATHS() \ No newline at end of file diff --git a/plotting/system/next_batch_distribution.py b/plotting/system/next_batch_distribution.py new file mode 100644 index 000000000..37d455114 --- /dev/null +++ 
diff --git a/plotting/system/next_batch_distribution.py b/plotting/system/next_batch_distribution.py
new file mode 100644
index 000000000..37d455114
--- /dev/null
+++ b/plotting/system/next_batch_distribution.py
@@ -0,0 +1,84 @@
+import glob
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+from plotting.common.common import *
+
+
+def plot_nbd(pipeline_log, ax, trigger):
+    relevant_data = pipeline_log["supervisor"]["triggers"][trigger]["trainer_log"]
+    all_epoch_timings = []
+    for epoch in relevant_data["epochs"]:
+        all_epoch_timings.extend(epoch["BatchTimings"])
+    all_epoch_timings = np.array(all_epoch_timings) / 1000  # ms to seconds
+
+    sns.histplot(data=all_epoch_timings, ax=ax, log_scale=True)
+
+    #ax.set_xticks(list(x))
+    #ax.set_xticklabels([f"{idx + 1}" for idx, _ in enumerate(x)])
+    #ax.set_xlabel("Waiting time for next batch (seconds)")
+    #ax.set_ylabel("Count")
+    #ax.set_title("Histogram of waiting times")
+
+
+def load_all_pipelines(data_path, worker_count_filter):
+    all_data = []
+    uniq_prefetched_partitions = set()
+    uniq_parallel_prefetch_requests = set()
+
+    for filename in glob.iglob(data_path + '/**/*.log', recursive=True):
+        data = LOAD_DATA(filename)
+        num_data_loaders = data["configuration"]["pipeline_config"]["training"]["dataloader_workers"]
+        prefetched_partitions = data["configuration"]["pipeline_config"]["training"]["num_prefetched_partitions"]
+        parallel_prefetch_requests = data["configuration"]["pipeline_config"]["training"]["parallel_prefetch_requests"]
+
+        if num_data_loaders == worker_count_filter:
+            all_data.append(data)
+            uniq_prefetched_partitions.add(prefetched_partitions)
+            uniq_parallel_prefetch_requests.add(parallel_prefetch_requests)
+
+    return all_data, (len(uniq_prefetched_partitions), len(uniq_parallel_prefetch_requests)), uniq_prefetched_partitions, uniq_parallel_prefetch_requests
+
+
+if __name__ == '__main__':
+    data_path, plot_dir = INIT(sys.argv)
+    WORKER_COUNT = 8
+
+    all_data, figure_dimensions, uniq_prefetched_partitions, uniq_parallel_prefetch_requests = load_all_pipelines(data_path, WORKER_COUNT)
+
+    fig, axes = plt.subplots(*figure_dimensions, figsize=(40, 20), sharex=True)
+
+    row_vals = sorted(uniq_prefetched_partitions)
+    column_vals = sorted(uniq_parallel_prefetch_requests)
+
+    for row_idx, row_val in enumerate(row_vals):
+        for col_idx, column_val in enumerate(column_vals):
+            ax = axes[row_idx][col_idx]
+            if row_idx == 0:
+                ax.set_title(f"{column_val} PPR")
+            if col_idx == 0:
+                ax.set_ylabel(f"{row_val} PP", rotation=90, size='large')
+
+            for data in all_data:
+                prefetched_partitions = data["configuration"]["pipeline_config"]["training"]["num_prefetched_partitions"]
+                parallel_prefetch_requests = data["configuration"]["pipeline_config"]["training"]["parallel_prefetch_requests"]
+
+                if row_val == prefetched_partitions and column_val == parallel_prefetch_requests:
+                    plot_nbd(data, ax, "0")
+
+    HATCH_WIDTH()
+    #FIG_LEGEND(fig)
+    for row in axes:
+        for ax in row:
+            Y_GRID(ax)
+            HIDE_BORDERS(ax)
+
+    fig.tight_layout()
+
+    plot_path = os.path.join(plot_dir, "next_batch_distribution")
+    SAVE_PLOT(plot_path)
+    PRINT_PLOT_PATHS()
\ No newline at end of file
diff --git a/plotting/system/training_breakdown.py b/plotting/system/training_breakdown.py
new file mode 100644
index 000000000..f87f5c14c
--- /dev/null
+++ b/plotting/system/training_breakdown.py
@@ -0,0 +1 @@
+# TODO
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 2026c576f..835c0b0b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,6 +14,7 @@ max-line-length = 120
 exclude = *_grpc.py,
     *_pb2.py,
     benchmark/**/*
+    plotting/**/*
    modyn/storage/build/**
    modyn/storage/cmake-build-debug/**
diff --git a/storage_postgresql.conf b/storage_postgresql.conf
index 9f4f5fd6d..92a44accc 100644
--- a/storage_postgresql.conf
+++ b/storage_postgresql.conf
@@ -42,21 +42,21 @@ listen_addresses = '*'
 # Data Storage: ssd
 
 max_connections = 300
-shared_buffers = 8GB
-effective_cache_size = 24GB
+shared_buffers = 24GB
+effective_cache_size = 72GB
 maintenance_work_mem = 2GB
 checkpoint_completion_target = 0.9
 wal_buffers = 16MB
 default_statistics_target = 100
 random_page_cost = 1.1
 effective_io_concurrency = 200
-work_mem = 6990kB
+work_mem = 20971kB
 min_wal_size = 1GB
 max_wal_size = 4GB
-max_worker_processes = 4
-max_parallel_workers_per_gather = 2
-max_parallel_workers = 4
-max_parallel_maintenance_workers = 2
+max_worker_processes = 16
+max_parallel_workers_per_gather = 4
+max_parallel_workers = 16
+max_parallel_maintenance_workers = 4
 
 #------------------------------------------------------------------------------
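The new values are consistent with pgtune-style heuristics for a larger machine; 96 GB of RAM is an assumption here (shared_buffers = RAM/4 = 24GB, effective_cache_size = 3·RAM/4 = 72GB). A quick sanity check for the work_mem value under that assumption:

    # Sanity check (assumes a 96 GB machine; not part of the patch):
    # one common formulation derives work_mem as
    # (RAM - shared_buffers) / (3 * max_connections) / max_parallel_workers_per_gather
    ram_kb = 96 * 1024 * 1024
    shared_buffers_kb = 24 * 1024 * 1024
    max_connections = 300
    max_parallel_workers_per_gather = 4

    work_mem_kb = (ram_kb - shared_buffers_kb) // (3 * max_connections) // max_parallel_workers_per_gather
    print(f"work_mem = {work_mem_kb}kB")  # -> work_mem = 20971kB, matching the config above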