From 1dc36acabb9d5c140f094a5574e48e692014a4ef Mon Sep 17 00:00:00 2001
From: Xianzhe Ma
Date: Tue, 30 Apr 2024 15:42:18 +0200
Subject: [PATCH] Generate test/train datasets for yearbook and arxiv (#383)

This PR modifies the files to generate yearbook and arxiv datasets, so that both train and test datasets are generated.
---
 benchmark/wildtime_benchmarks/README.md          |  4 +-
 .../wildtime_benchmarks/benchmark_utils.py       |  2 +-
 .../data_generation_arxiv.py                     | 98 ++++++++++++++-----
 .../data_generation_yearbook.py                  | 76 +++++++++-----
 .../example_pipelines/yearbook.yaml              |  6 +-
 modyn/config/examples/modyn_config.yaml          | 45 ++++++++-
 6 files changed, 176 insertions(+), 55 deletions(-)

diff --git a/benchmark/wildtime_benchmarks/README.md b/benchmark/wildtime_benchmarks/README.md
index 8028229db..f6ad2180f 100644
--- a/benchmark/wildtime_benchmarks/README.md
+++ b/benchmark/wildtime_benchmarks/README.md
@@ -18,7 +18,7 @@ Use the `-h` flag to find out more.
 The goal is to predict the sex given a yearbook picture. The dataset contains 37189 samples
 collected from 1930 to 2013. Since timestamps in Modyn are based on Unix Timestamps (so 0 is 1/1/1970)
 we have to remap the years to days.
-Precisely, the timestamp for pictures from 1930 is 1/1/1970, then 2/1/1970 for the ones taken in 1931 and so forth.
+Precisely, the timestamp for pictures from 1930 is 1/Jan/1970, then 2/Jan/1970 for the ones taken in 1931 and so forth.
 Samples are saved using BinaryFileWrapper by grouping all samples of the same year in one file.
 
 ### FMoW
@@ -34,7 +34,7 @@ Titles belonging to the same year are grouped into the same CSV file and stored
 Each year is mapped to a year starting from 1/1/1970.
 
 ### Arxiv
-The goal is to predict the paper category (55 classes) given the paper title.
+The goal is to predict the paper category (172 classes) given the paper title.
 The dataset contains more than 2 million samples collected from 2002 to 2017.
 Titles belonging to the same year are grouped into the same CSV file and stored together.
 Each year is mapped to a year starting from 1/1/1970.
diff --git a/benchmark/wildtime_benchmarks/benchmark_utils.py b/benchmark/wildtime_benchmarks/benchmark_utils.py
index 0a37dc829..2d5b4102c 100644
--- a/benchmark/wildtime_benchmarks/benchmark_utils.py
+++ b/benchmark/wildtime_benchmarks/benchmark_utils.py
@@ -57,7 +57,7 @@ def setup_logger():
 
 
 def create_fake_timestamp(year: int, base_year: int) -> int:
-    timestamp = ((year - base_year) * DAY_LENGTH_SECONDS) + 1
+    timestamp = ((year - base_year) * DAY_LENGTH_SECONDS)
     return timestamp
 
 
diff --git a/benchmark/wildtime_benchmarks/data_generation_arxiv.py b/benchmark/wildtime_benchmarks/data_generation_arxiv.py
index 250bf455e..95784af96 100644
--- a/benchmark/wildtime_benchmarks/data_generation_arxiv.py
+++ b/benchmark/wildtime_benchmarks/data_generation_arxiv.py
@@ -1,10 +1,8 @@
 import os
 import pickle
 
-import torch
 from benchmark_utils import create_timestamp, download_if_not_exists, setup_argparser_wildtime, setup_logger
 from torch.utils.data import Dataset
-from tqdm import tqdm
 
 logger = setup_logger()
 
@@ -17,6 +15,20 @@ def main():
     ArXivDownloader(args.dir).store_data(args.all, args.dummyyear)
 
 
+# There are some lines in the train dataset that are corrupted, i.e. the csv file wrapper cannot properly read the data.
+# We remove these lines from the dataset.
+corrupted_idx_dict = {
+    2007: [33213],
+    2008: [22489],
+    2009: [64621, 165454],
+    2015: [42007, 94935],
+    2016: [111398],
+    2019: [41309, 136814],
+    2020: [102074],
+    2021: [32013, 55660]
+}
+
+
 class ArXivDownloader(Dataset):
     time_steps = [i for i in range(2007, 2023)]
     input_dim = 55
@@ -24,12 +36,6 @@ class ArXivDownloader(Dataset):
     drive_id = "1H5xzHHgXl8GOMonkb6ojye-Y2yIp436V"
     file_name = "arxiv.pkl"
 
-    def __getitem__(self, idx):
-        return self._dataset["title"][idx], torch.LongTensor([self._dataset["category"][idx]])[0]
-
-    def __len__(self):
-        return len(self._dataset["category"])
-
     def __init__(self, data_dir):
         super().__init__()
 
@@ -43,40 +49,88 @@ def __init__(self, data_dir):
         self._dataset = datasets
         self.path = data_dir
 
-    def store_data(self, store_all_data: bool, add_final_dummy_year: bool):
-        for year in tqdm(self._dataset):
+    def store_data(self, create_test_data: bool, add_final_dummy_year: bool):
+        # create directories
+        if not os.path.exists(self.path):
+            os.mkdir(self.path)
+
+        train_dir = os.path.join(self.path, "train")
+        os.makedirs(train_dir, exist_ok=True)
+
+        if create_test_data:
+            test_dir = os.path.join(self.path, "test")
+            os.makedirs(test_dir, exist_ok=True)
+
+        stats = {}
+
+        for year in self._dataset:
             # for simplicity, instead of using years we map each day to a year from 1970
             year_timestamp = create_timestamp(year=1970, month=1, day=year-2006)
-            year_rows = []
 
-            splits = [0, 1] if store_all_data else [0]
-            for split in splits:
+            def get_split_by_id(split: int) -> list[str]:
+                rows = []
                 for i in range(len(self._dataset[year][split]["title"])):
                     text = self._dataset[year][split]["title"][i].replace("\n", " ")
                     label = self._dataset[year][split]["category"][i]
                     csv_row = f"{text}\t{label}"
-                    year_rows.append(csv_row)
+                    rows.append(csv_row)
+                return rows
 
-            # store the year file
-            text_file = os.path.join(self.path, f"{year}.csv")
-            with open(text_file, "w", encoding="utf-8") as f:
-                f.write("\n".join(year_rows))
+            train_year_rows = get_split_by_id(0)
+            train_year_rows = self.filter_corrupted_lines(year, train_year_rows)
+            train_file = os.path.join(train_dir, f"{year}.csv")
+            with open(train_file, "w", encoding="utf-8") as f:
+                f.write("\n".join(train_year_rows))
 
             # set timestamp
-            os.utime(text_file, (year_timestamp, year_timestamp))
+            os.utime(train_file, (year_timestamp, year_timestamp))
+
+            if create_test_data:
+                test_year_rows = get_split_by_id(1)
+                test_file = os.path.join(test_dir, f"{year}.csv")
+                with open(test_file, "w", encoding="utf-8") as f:
+                    f.write("\n".join(test_year_rows))
+
+                # set timestamp
+                os.utime(test_file, (year_timestamp, year_timestamp))
+                stats[year] = {"train": len(train_year_rows), "test": len(test_year_rows)}
+            else:
+                stats[year] = {"train": len(train_year_rows)}
+        with open(os.path.join(self.path, "overall_stats.json"), "w") as f:
+            import json
+            json.dump(stats, f, indent=4)
 
         if add_final_dummy_year:
             dummy_year = year + 1
             year_timestamp = create_timestamp(year=1970, month=1, day= dummy_year - 2006)
-            text_file = os.path.join(self.path, f"{dummy_year}.csv")
-            with open(text_file, "w", encoding="utf-8") as f:
+            train_dummy_file = os.path.join(train_dir, f"{dummy_year}.csv")
+            with open(train_dummy_file, "w", encoding="utf-8") as f:
                 f.write("\n".join(["dummy\t0"]))
 
             # set timestamp
-            os.utime(text_file, (year_timestamp, year_timestamp))
+            os.utime(train_dummy_file, (year_timestamp, year_timestamp))
+
+            if create_test_data:
+                test_dummy_file = os.path.join(test_dir, f"{dummy_year}.csv")
+                with open(test_dummy_file, "w", encoding="utf-8") as f:
+                    f.write("\n".join(["dummy\t0"]))
+
+                # set timestamp
+                os.utime(test_dummy_file, (year_timestamp, year_timestamp))
 
         os.remove(os.path.join(self.path, "arxiv.pkl"))
 
+    @staticmethod
+    def filter_corrupted_lines(year, rows):
+        if year in corrupted_idx_dict:
+            corrupted_idx = corrupted_idx_dict[year]
+            goodlines = []
+            for i, l in enumerate(rows):
+                if i not in corrupted_idx:
+                    goodlines.append(l)
+            return goodlines
+        return rows
+
 
 if __name__ == "__main__":
     main()
diff --git a/benchmark/wildtime_benchmarks/data_generation_yearbook.py b/benchmark/wildtime_benchmarks/data_generation_yearbook.py
index a3b2535bf..c1f0866cb 100644
--- a/benchmark/wildtime_benchmarks/data_generation_yearbook.py
+++ b/benchmark/wildtime_benchmarks/data_generation_yearbook.py
@@ -37,43 +37,73 @@ def __init__(self, data_dir: str):
         self._dataset = datasets
         self.data_dir = data_dir
 
-    def _get_year_data(self, year: int, store_all_data: bool) -> list[Tuple]:
-        splits = [0, 1] if store_all_data else [0]
-        images = torch.FloatTensor(
-            np.array(
-                [ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
-                    # Pytorch requires CHW format
-                    img.transpose(2, 0, 1)
-                    # _dataset has 3 dimensions [years][train=0,valid=1,test=2]["images"/"labels"]
-                    for split in splits # just train if --all not specified, else test, train and val
-                    for img in self._dataset[year][split]["images"]
-                ]
+    def _get_year_data(self, year: int, create_test_data: bool) -> tuple[dict[str, list[tuple]], dict[str, int]]:
+        def get_split_by_id(split: int) -> list[Tuple]:
+            images = torch.FloatTensor(
+                np.array(
+                    [ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
+                        # Pytorch requires CHW format
+                        img.transpose(2, 0, 1)
+                        # _dataset has 2 dimensions [years][train=0,test=1]["images"/"labels"]
+                        for img in self._dataset[year][split]["images"]
+                    ]
+                )
             )
-        )
-        labels = torch.cat([torch.LongTensor(self._dataset[year][split]["labels"]) for split in splits])
-        return [(images[i], labels[i]) for i in range(len(images))]
+            labels = torch.LongTensor(self._dataset[year][split]["labels"])
+            return [(images[i], labels[i]) for i in range(len(images))]
+
+        if not create_test_data:
+            train_size = len(get_split_by_id(0))
+            ds = {"train": get_split_by_id(0)}
+            stats = { "train": train_size }
+        else:
+            train_size = len(get_split_by_id(0))
+            test_size = len(get_split_by_id(1))
+            ds = {"train": get_split_by_id(0), "test": get_split_by_id(1)}
+            stats = {"train": train_size, "test": test_size}
+        return ds, stats
 
     def __len__(self) -> int:
         return len(self._dataset["labels"])
 
-    def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None:
+    def store_data(self, create_test_data: bool, add_final_dummy_year: bool) -> None:
         # create directories
         if not os.path.exists(self.data_dir):
             os.mkdir(self.data_dir)
 
+        train_dir = os.path.join(self.data_dir, "train")
+        os.makedirs(train_dir, exist_ok=True)
+
+        if create_test_data:
+            test_dir = os.path.join(self.data_dir, "test")
+            os.makedirs(test_dir, exist_ok=True)
+
+        overall_stats = {}
         for year in self.time_steps:
-            print(f"Saving data for year {year}")
-            ds = self._get_year_data(year, store_all_data)
-            self.create_binary_file(ds,
-                                    os.path.join(self.data_dir, f"{year}.bin"),
+            ds, stats = self._get_year_data(year, create_test_data)
+            overall_stats[year] = stats
+            self.create_binary_file(ds["train"],
+                                    os.path.join(train_dir, f"{year}.bin"),
                                     create_fake_timestamp(year, base_year=1930))
+            if create_test_data:
+                self.create_binary_file(ds["test"],
+                                        os.path.join(test_dir, f"{year}.bin"),
+                                        create_fake_timestamp(year, base_year=1930))
+
+        with open(os.path.join(self.data_dir, "overall_stats.json"), "w") as f:
+            import json
+            json.dump(overall_stats, f, indent=4)
 
         if add_final_dummy_year:
             dummy_year = year + 1
-            dummy_data = [ ds[0] ] # get one sample from the previous year
+            dummy_data = [ ds["train"][0] ] # get one sample from the previous year
             self.create_binary_file(dummy_data,
-                                    os.path.join(self.data_dir, f"{dummy_year}.bin"),
+                                    os.path.join(train_dir, f"{dummy_year}.bin"),
                                     create_fake_timestamp(dummy_year, base_year=1930))
+            if create_test_data:
+                self.create_binary_file(dummy_data,
+                                        os.path.join(test_dir, f"{dummy_year}.bin"),
+                                        create_fake_timestamp(dummy_year, base_year=1930))
 
         os.remove(os.path.join(self.data_dir, "yearbook.pkl"))
 
@@ -85,9 +115,9 @@ def create_binary_file(data, output_file_name: str, timestamp: int) -> None:
                 label_integer = tensor2.item()
 
                 features_size = len(features_bytes)
-                assert features_size == 4096
+                assert features_size == 12288
 
-                f.write(int.to_bytes(label_integer, length=4, byteorder="little"))
+                f.write(int.to_bytes(label_integer, length=4, byteorder="big"))
                 f.write(features_bytes)
         os.utime(output_file_name, (timestamp, timestamp))
 
diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
index 967379e4d..1908f6105 100644
--- a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
+++ b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -41,10 +41,12 @@ data:
   dataset_id: yearbook
   transformations: []
   bytes_parser_function: |
+    import warnings
     import torch
-    import numpy as np
     def bytes_parser_function(data: memoryview) -> torch.Tensor:
-      return torch.from_numpy(np.frombuffer(data, dtype=np.float32))
+      with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=UserWarning)
+        return torch.frombuffer(data, dtype=torch.float32).reshape(3, 32, 32)
 
 trigger:
   id: TimeTrigger
diff --git a/modyn/config/examples/modyn_config.yaml b/modyn/config/examples/modyn_config.yaml
index 7b05a24f8..45109a20c 100644
--- a/modyn/config/examples/modyn_config.yaml
+++ b/modyn/config/examples/modyn_config.yaml
@@ -50,15 +50,33 @@ storage:
     },
     {
       name: "yearbook",
-      description: "Yearbook Dataset from Wild-Time",
+      description: "Yearbook Dataset from Wild-Time (training set)",
       version: "0.0.1",
-      base_path: "/datasets/yearbook",
+      base_path: "/datasets/yearbook/train",
       filesystem_wrapper_type: "LocalFilesystemWrapper",
       file_wrapper_type: "BinaryFileWrapper",
       file_wrapper_config:
         {
           byteorder: "big",
-          record_size: 4100,
+          record_size: 12292,
+          label_size: 4,
+          file_extension: ".bin"
+        },
+      ignore_last_timestamp: false,
+      file_watcher_interval: 5,
+      selector_batch_size: 256,
+    },
+    {
+      name: "yearbook-test",
+      description: "Yearbook Dataset from Wild-Time (test set)",
+      version: "0.0.1",
+      base_path: "/datasets/yearbook/test",
+      filesystem_wrapper_type: "LocalFilesystemWrapper",
+      file_wrapper_type: "BinaryFileWrapper",
+      file_wrapper_config:
+        {
+          byteorder: "big",
+          record_size: 12292,
           label_size: 4,
           file_extension: ".bin"
         },
       ignore_last_timestamp: false,
       file_watcher_interval: 5,
       selector_batch_size: 256,
     },
@@ -101,9 +119,26 @@ storage:
     },
     {
       name: "arxiv",
-      description: "Arxiv Dataset (from Wild-time)",
+      description: "Arxiv Dataset from Wild-time (training set)",
+      version: "0.0.1",
+      base_path: "/datasets/arxiv/train",
+      filesystem_wrapper_type: "LocalFilesystemWrapper",
+      file_wrapper_type: "CsvFileWrapper",
+      file_wrapper_config:
+        {
+          file_extension: ".csv",
+          separator: "\t", #tsv best option here since sentences contain commas and semicolons
+          label_index: 1
+        },
+      ignore_last_timestamp: false,
+      file_watcher_interval: 5,
+      selector_batch_size: 4096,
+    },
+    {
+      name: "arxiv-test",
+      description: "Arxiv Dataset from Wild-time (test set)",
       version: "0.0.1",
-      base_path: "/datasets/arxiv",
+      base_path: "/datasets/arxiv/test",
       filesystem_wrapper_type: "LocalFilesystemWrapper",
       file_wrapper_type: "CsvFileWrapper",
       file_wrapper_config:
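
Note on the new yearbook record layout: the record_size of 12292 in modyn_config.yaml follows from what data_generation_yearbook.py now writes per sample, namely a 4-byte big-endian label followed by a 3x32x32 float32 image (12288 bytes); with the "+ 1" offset removed from create_fake_timestamp, 1930 maps to timestamp 0 and each later year adds one day's worth of seconds, assuming DAY_LENGTH_SECONDS is 24 * 60 * 60. The following is a minimal, illustrative reader sketch, not part of the patch; the file path is hypothetical, and the image bytes are assumed to be in native (little-endian) float32 order as produced by numpy's tobytes().

import numpy as np

RECORD_SIZE = 12292  # 4-byte label + 3 * 32 * 32 * 4 bytes of float32 image data
LABEL_SIZE = 4       # label is written big-endian by the generator


def read_yearbook_records(path: str):
    """Yield (label, image) pairs from one generated yearbook .bin file."""
    with open(path, "rb") as f:
        data = f.read()
    assert len(data) % RECORD_SIZE == 0, "file must contain whole records"
    for offset in range(0, len(data), RECORD_SIZE):
        record = data[offset:offset + RECORD_SIZE]
        label = int.from_bytes(record[:LABEL_SIZE], byteorder="big")
        # assumes the feature bytes were serialized in native float32 order
        image = np.frombuffer(record[LABEL_SIZE:], dtype=np.float32).reshape(3, 32, 32)
        yield label, image


# Hypothetical usage:
# for label, image in read_yearbook_records("/datasets/yearbook/train/1930.bin"):
#     print(label, image.shape)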