From 1dc36acabb9d5c140f094a5574e48e692014a4ef Mon Sep 17 00:00:00 2001
From: Xianzhe Ma
Date: Tue, 30 Apr 2024 15:42:18 +0200
Subject: [PATCH] Generate test/train datasets for yearbook and arxiv (#383)

This PR modifies the files to generate yearbook and arxiv datasets, so that both train and test datasets are generated.
---
 benchmark/wildtime_benchmarks/README.md          |  4 +-
 .../wildtime_benchmarks/benchmark_utils.py       |  2 +-
 .../data_generation_arxiv.py                     | 98 ++++++++++++++-----
 .../data_generation_yearbook.py                  | 76 +++++++++-----
 .../example_pipelines/yearbook.yaml              |  6 +-
 modyn/config/examples/modyn_config.yaml          | 45 ++++++++-
 6 files changed, 176 insertions(+), 55 deletions(-)

diff --git a/benchmark/wildtime_benchmarks/README.md b/benchmark/wildtime_benchmarks/README.md
index 8028229db..f6ad2180f 100644
--- a/benchmark/wildtime_benchmarks/README.md
+++ b/benchmark/wildtime_benchmarks/README.md
@@ -18,7 +18,7 @@ Use the `-h` flag to find out more.
 The goal is to predict the sex given a yearbook picture. The dataset contains 37189 samples
 collected from 1930 to 2013. Since timestamps in Modyn are based on Unix Timestamps (so 0 is 1/1/1970)
 we have to remap the years to days.
-Precisely, the timestamp for pictures from 1930 is 1/1/1970, then 2/1/1970 for the ones taken in 1931 and so forth.
+Precisely, the timestamp for pictures from 1930 is 1/Jan/1970, then 2/Jan/1970 for the ones taken in 1931 and so forth.
 Samples are saved using BinaryFileWrapper by grouping all samples of the same year in one file.
 
 ### FMoW
@@ -34,7 +34,7 @@ Titles belonging to the same year are grouped into the same CSV file and stored
 Each year is mapped to a year starting from 1/1/1970.
 
 ### Arxiv
-The goal is to predict the paper category (55 classes) given the paper title.
+The goal is to predict the paper category (172 classes) given the paper title.
 The dataset contains more than 2 million samples collected from 2002 to 2017.
 Titles belonging to the same year are grouped into the same CSV file and stored together.
 Each year is mapped to a year starting from 1/1/1970.
diff --git a/benchmark/wildtime_benchmarks/benchmark_utils.py b/benchmark/wildtime_benchmarks/benchmark_utils.py
index 0a37dc829..2d5b4102c 100644
--- a/benchmark/wildtime_benchmarks/benchmark_utils.py
+++ b/benchmark/wildtime_benchmarks/benchmark_utils.py
@@ -57,7 +57,7 @@ def setup_logger():
 
 
 def create_fake_timestamp(year: int, base_year: int) -> int:
-    timestamp = ((year - base_year) * DAY_LENGTH_SECONDS) + 1
+    timestamp = ((year - base_year) * DAY_LENGTH_SECONDS)
     return timestamp
 
 
diff --git a/benchmark/wildtime_benchmarks/data_generation_arxiv.py b/benchmark/wildtime_benchmarks/data_generation_arxiv.py
index 250bf455e..95784af96 100644
--- a/benchmark/wildtime_benchmarks/data_generation_arxiv.py
+++ b/benchmark/wildtime_benchmarks/data_generation_arxiv.py
@@ -1,10 +1,8 @@
 import os
 import pickle
 
-import torch
 from benchmark_utils import create_timestamp, download_if_not_exists, setup_argparser_wildtime, setup_logger
 from torch.utils.data import Dataset
-from tqdm import tqdm
 
 logger = setup_logger()
 
@@ -17,6 +15,20 @@ def main():
     ArXivDownloader(args.dir).store_data(args.all, args.dummyyear)
 
 
+# There are some lines in the train dataset that are corrupted, i.e. the csv file wrapper cannot properly read the data.
+# We remove these lines from the dataset.
+corrupted_idx_dict = {
+    2007: [33213],
+    2008: [22489],
+    2009: [64621, 165454],
+    2015: [42007, 94935],
+    2016: [111398],
+    2019: [41309, 136814],
+    2020: [102074],
+    2021: [32013, 55660]
+}
+
+
 class ArXivDownloader(Dataset):
     time_steps = [i for i in range(2007, 2023)]
     input_dim = 55
@@ -24,12 +36,6 @@ class ArXivDownloader(Dataset):
     drive_id = "1H5xzHHgXl8GOMonkb6ojye-Y2yIp436V"
     file_name = "arxiv.pkl"
 
-    def __getitem__(self, idx):
-        return self._dataset["title"][idx], torch.LongTensor([self._dataset["category"][idx]])[0]
-
-    def __len__(self):
-        return len(self._dataset["category"])
-
     def __init__(self, data_dir):
         super().__init__()
 
@@ -43,40 +49,88 @@ def __init__(self, data_dir):
         self._dataset = datasets
         self.path = data_dir
 
-    def store_data(self, store_all_data: bool, add_final_dummy_year: bool):
-        for year in tqdm(self._dataset):
+    def store_data(self, create_test_data: bool, add_final_dummy_year: bool):
+        # create directories
+        if not os.path.exists(self.path):
+            os.mkdir(self.path)
+
+        train_dir = os.path.join(self.path, "train")
+        os.makedirs(train_dir, exist_ok=True)
+
+        if create_test_data:
+            test_dir = os.path.join(self.path, "test")
+            os.makedirs(test_dir, exist_ok=True)
+
+        stats = {}
+
+        for year in self._dataset:
             # for simplicity, instead of using years we map each day to a year from 1970
             year_timestamp = create_timestamp(year=1970, month=1, day=year-2006)
-            year_rows = []
 
-            splits = [0, 1] if store_all_data else [0]
-            for split in splits:
+            def get_split_by_id(split: int) -> list[str]:
+                rows = []
                 for i in range(len(self._dataset[year][split]["title"])):
                     text = self._dataset[year][split]["title"][i].replace("\n", " ")
                     label = self._dataset[year][split]["category"][i]
                     csv_row = f"{text}\t{label}"
-                    year_rows.append(csv_row)
+                    rows.append(csv_row)
+                return rows
 
-            # store the year file
-            text_file = os.path.join(self.path, f"{year}.csv")
-            with open(text_file, "w", encoding="utf-8") as f:
-                f.write("\n".join(year_rows))
+            train_year_rows = get_split_by_id(0)
+            train_year_rows = self.filter_corrupted_lines(year, train_year_rows)
+            train_file = os.path.join(train_dir, f"{year}.csv")
+            with open(train_file, "w", encoding="utf-8") as f:
+                f.write("\n".join(train_year_rows))
 
             # set timestamp
-            os.utime(text_file, (year_timestamp, year_timestamp))
+            os.utime(train_file, (year_timestamp, year_timestamp))
+
+            if create_test_data:
+                test_year_rows = get_split_by_id(1)
+                test_file = os.path.join(test_dir, f"{year}.csv")
+                with open(test_file, "w", encoding="utf-8") as f:
+                    f.write("\n".join(test_year_rows))
+
+                # set timestamp
+                os.utime(test_file, (year_timestamp, year_timestamp))
+                stats[year] = {"train": len(train_year_rows), "test": len(test_year_rows)}
+            else:
+                stats[year] = {"train": len(train_year_rows)}
+        with open(os.path.join(self.path, "overall_stats.json"), "w") as f:
+            import json
+            json.dump(stats, f, indent=4)
 
         if add_final_dummy_year:
             dummy_year = year + 1
             year_timestamp = create_timestamp(year=1970, month=1, day= dummy_year - 2006)
-            text_file = os.path.join(self.path, f"{dummy_year}.csv")
-            with open(text_file, "w", encoding="utf-8") as f:
+            train_dummy_file = os.path.join(train_dir, f"{dummy_year}.csv")
+            with open(train_dummy_file, "w", encoding="utf-8") as f:
                 f.write("\n".join(["dummy\t0"]))
 
             # set timestamp
-            os.utime(text_file, (year_timestamp, year_timestamp))
+            os.utime(train_dummy_file, (year_timestamp, year_timestamp))
+
+            if create_test_data:
+                test_dummy_file = os.path.join(test_dir, f"{dummy_year}.csv")
+                with open(test_dummy_file, "w", encoding="utf-8") as f:
+                    f.write("\n".join(["dummy\t0"]))
+
+                # set timestamp
+                os.utime(test_dummy_file, (year_timestamp, year_timestamp))
 
         os.remove(os.path.join(self.path, "arxiv.pkl"))
 
+    @staticmethod
+    def filter_corrupted_lines(year, rows):
+        if year in corrupted_idx_dict:
+            corrupted_idx = corrupted_idx_dict[year]
+            goodlines = []
+            for i, l in enumerate(rows):
+                if i not in corrupted_idx:
+                    goodlines.append(l)
+            return goodlines
+        return rows
+
 
 if __name__ == "__main__":
     main()
diff --git a/benchmark/wildtime_benchmarks/data_generation_yearbook.py b/benchmark/wildtime_benchmarks/data_generation_yearbook.py
index a3b2535bf..c1f0866cb 100644
--- a/benchmark/wildtime_benchmarks/data_generation_yearbook.py
+++ b/benchmark/wildtime_benchmarks/data_generation_yearbook.py
@@ -37,43 +37,73 @@ def __init__(self, data_dir: str):
         self._dataset = datasets
         self.data_dir = data_dir
 
-    def _get_year_data(self, year: int, store_all_data: bool) -> list[Tuple]:
-        splits = [0, 1] if store_all_data else [0]
-        images = torch.FloatTensor(
-            np.array(
-                [ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
-                    # Pytorch requires CHW format
-                    img.transpose(2, 0, 1)
-                    # _dataset has 3 dimensions [years][train=0,valid=1,test=2]["images"/"labels"]
-                    for split in splits # just train if --all not specified, else test, train and val
-                    for img in self._dataset[year][split]["images"]
-                ]
+    def _get_year_data(self, year: int, create_test_data: bool) -> tuple[dict[str, list[tuple]], dict[str, int]]:
+        def get_split_by_id(split: int) -> list[Tuple]:
+            images = torch.FloatTensor(
+                np.array(
+                    [ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
+                        # Pytorch requires CHW format
+                        img.transpose(2, 0, 1)
+                        # _dataset has 2 dimensions [years][train=0,test=1]["images"/"labels"]
+                        for img in self._dataset[year][split]["images"]
+                    ]
+                )
             )
-        )
-        labels = torch.cat([torch.LongTensor(self._dataset[year][split]["labels"]) for split in splits])
-        return [(images[i], labels[i]) for i in range(len(images))]
+            labels = torch.LongTensor(self._dataset[year][split]["labels"])
+            return [(images[i], labels[i]) for i in range(len(images))]
+
+        if not create_test_data:
+            train_size = len(get_split_by_id(0))
+            ds = {"train": get_split_by_id(0)}
+            stats = { "train": train_size }
+        else:
+            train_size = len(get_split_by_id(0))
+            test_size = len(get_split_by_id(1))
+            ds = {"train": get_split_by_id(0), "test": get_split_by_id(1)}
+            stats = {"train": train_size, "test": test_size}
+        return ds, stats
 
     def __len__(self) -> int:
         return len(self._dataset["labels"])
 
-    def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None:
+    def store_data(self, create_test_data: bool, add_final_dummy_year: bool) -> None:
         # create directories
         if not os.path.exists(self.data_dir):
             os.mkdir(self.data_dir)
 
+        train_dir = os.path.join(self.data_dir, "train")
+        os.makedirs(train_dir, exist_ok=True)
+
+        if create_test_data:
+            test_dir = os.path.join(self.data_dir, "test")
+            os.makedirs(test_dir, exist_ok=True)
+
+        overall_stats = {}
         for year in self.time_steps:
-            print(f"Saving data for year {year}")
-            ds = self._get_year_data(year, store_all_data)
-            self.create_binary_file(ds,
-                                    os.path.join(self.data_dir, f"{year}.bin"),
+            ds, stats = self._get_year_data(year, create_test_data)
+            overall_stats[year] = stats
+            self.create_binary_file(ds["train"],
+                                    os.path.join(train_dir, f"{year}.bin"),
                                     create_fake_timestamp(year, base_year=1930))
+            if create_test_data:
+                self.create_binary_file(ds["test"],
+                                        os.path.join(test_dir, f"{year}.bin"),
+                                        create_fake_timestamp(year, base_year=1930))
+
+        with open(os.path.join(self.data_dir, "overall_stats.json"), "w") as f:
+            import json
+            json.dump(overall_stats, f, indent=4)
 
         if add_final_dummy_year:
             dummy_year = year + 1
-            dummy_data = [ ds[0] ] # get one sample from the previous year
+            dummy_data = [ ds["train"][0] ] # get one sample from the previous year
             self.create_binary_file(dummy_data,
-                                    os.path.join(self.data_dir, f"{dummy_year}.bin"),
+                                    os.path.join(train_dir, f"{dummy_year}.bin"),
                                     create_fake_timestamp(dummy_year, base_year=1930))
+            if create_test_data:
+                self.create_binary_file(dummy_data,
+                                        os.path.join(test_dir, f"{dummy_year}.bin"),
+                                        create_fake_timestamp(dummy_year, base_year=1930))
 
         os.remove(os.path.join(self.data_dir, "yearbook.pkl"))
 
@@ -85,9 +115,9 @@ def create_binary_file(data, output_file_name: str, timestamp: int) -> None:
                 label_integer = tensor2.item()
 
                 features_size = len(features_bytes)
-                assert features_size == 4096
+                assert features_size == 12288
 
-                f.write(int.to_bytes(label_integer, length=4, byteorder="little"))
+                f.write(int.to_bytes(label_integer, length=4, byteorder="big"))
                 f.write(features_bytes)
         os.utime(output_file_name, (timestamp, timestamp))
 
diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
index 967379e4d..1908f6105 100644
--- a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
+++ b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -41,10 +41,12 @@ data:
   dataset_id: yearbook
   transformations: []
   bytes_parser_function: |
+    import warnings
     import torch
-    import numpy as np
     def bytes_parser_function(data: memoryview) -> torch.Tensor:
-      return torch.from_numpy(np.frombuffer(data, dtype=np.float32))
+      with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        return torch.frombuffer(data, dtype=torch.float32).reshape(3, 32, 32)
 
 trigger:
   id: TimeTrigger
diff --git a/modyn/config/examples/modyn_config.yaml b/modyn/config/examples/modyn_config.yaml
index 7b05a24f8..45109a20c 100644
--- a/modyn/config/examples/modyn_config.yaml
+++ b/modyn/config/examples/modyn_config.yaml
@@ -50,15 +50,33 @@ storage:
     },
     {
       name: "yearbook",
-      description: "Yearbook Dataset from Wild-Time",
+      description: "Yearbook Dataset from Wild-Time (training set)",
       version: "0.0.1",
-      base_path: "/datasets/yearbook",
+      base_path: "/datasets/yearbook/train",
       filesystem_wrapper_type: "LocalFilesystemWrapper",
       file_wrapper_type: "BinaryFileWrapper",
       file_wrapper_config:
         {
           byteorder: "big",
-          record_size: 4100,
+          record_size: 12292,
+          label_size: 4,
+          file_extension: ".bin"
+        },
+      ignore_last_timestamp: false,
+      file_watcher_interval: 5,
+      selector_batch_size: 256,
+    },
+    {
+      name: "yearbook-test",
+      description: "Yearbook Dataset from Wild-Time (test set)",
+      version: "0.0.1",
+      base_path: "/datasets/yearbook/test",
+      filesystem_wrapper_type: "LocalFilesystemWrapper",
+      file_wrapper_type: "BinaryFileWrapper",
+      file_wrapper_config:
+        {
+          byteorder: "big",
+          record_size: 12292,
           label_size: 4,
           file_extension: ".bin"
         },
       ignore_last_timestamp: false,
       file_watcher_interval: 5,
       selector_batch_size: 256,
     },
@@ -101,9 +119,26 @@ storage:
     },
     {
       name: "arxiv",
-      description: "Arxiv Dataset (from Wild-time)",
+      description: "Arxiv Dataset from Wild-time (training set)",
+      version: "0.0.1",
+      base_path: "/datasets/arxiv/train",
+      filesystem_wrapper_type: "LocalFilesystemWrapper",
+      file_wrapper_type: "CsvFileWrapper",
+      file_wrapper_config:
+        {
+          file_extension: ".csv",
+          separator: "\t", #tsv best option here since sentences contain commas and semicolons
+          label_index: 1
+        },
+      ignore_last_timestamp: false,
+      file_watcher_interval: 5,
+      selector_batch_size: 4096,
+    },
+    {
+      name: "arxiv-test",
+      description: "Arxiv Dataset from Wild-time (test set)",
       version: "0.0.1",
-      base_path: "/datasets/arxiv",
+      base_path: "/datasets/arxiv/test",
       filesystem_wrapper_type: "LocalFilesystemWrapper",
       file_wrapper_type: "CsvFileWrapper",
       file_wrapper_config:
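
Note on the new yearbook record layout: the record_size of 12292 in modyn_config.yaml follows from what data_generation_yearbook.py now writes per sample, namely a 4-byte big-endian label followed by a 3x32x32 float32 image (12288 bytes); with the "+ 1" offset removed from create_fake_timestamp, 1930 maps to timestamp 0 and each later year adds one day's worth of seconds, assuming DAY_LENGTH_SECONDS is 24 * 60 * 60. The following is a minimal, illustrative reader sketch, not part of the patch; the file path is hypothetical, and the image bytes are assumed to be in native (little-endian) float32 order as produced by numpy's tobytes().

import numpy as np

RECORD_SIZE = 12292  # 4-byte label + 3 * 32 * 32 * 4 bytes of float32 image data
LABEL_SIZE = 4       # label is written big-endian by the generator


def read_yearbook_records(path: str):
    """Yield (label, image) pairs from one generated yearbook .bin file."""
    with open(path, "rb") as f:
        data = f.read()
    assert len(data) % RECORD_SIZE == 0, "file must contain whole records"
    for offset in range(0, len(data), RECORD_SIZE):
        record = data[offset:offset + RECORD_SIZE]
        label = int.from_bytes(record[:LABEL_SIZE], byteorder="big")
        # assumes the feature bytes were serialized in native float32 order
        image = np.frombuffer(record[LABEL_SIZE:], dtype=np.float32).reshape(3, 32, 32)
        yield label, image


# Hypothetical usage:
# for label, image in read_yearbook_records("/datasets/yearbook/train/1930.bin"):
#     print(label, image.shape)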