Generate test/train datasets for yearbook and arxiv (#383)
This PR modifies the yearbook and arxiv dataset generation scripts so that both train and test datasets are generated.
XianzheMa authored Apr 30, 2024
1 parent 1f9b3d1 commit 1dc36ac
Showing 6 changed files with 176 additions and 55 deletions.
4 changes: 2 additions & 2 deletions benchmark/wildtime_benchmarks/README.md
@@ -18,7 +18,7 @@ Use the `-h` flag to find out more.
The goal is to predict the sex given a yearbook picture.
The dataset contains 37189 samples collected from 1930 to 2013.
Since timestamps in Modyn are based on Unix Timestamps (so 0 is 1/1/1970) we have to remap the years to days.
Precisely, the timestamp for pictures from 1930 is 1/1/1970, then 2/1/1970 for the ones taken in 1931 and so forth.
Precisely, the timestamp for pictures from 1930 is 1/Jan/1970, then 2/Jan/1970 for the ones taken in 1931 and so forth.
Samples are saved using BinaryFileWrapper by grouping all samples of the same year in one file.

### FMoW
@@ -34,7 +34,7 @@ Titles belonging to the same year are grouped into the same CSV file and stored
Each year is mapped to a day starting from 1/1/1970.

### Arxiv
The goal is to predict the paper category (55 classes) given the paper title.
The goal is to predict the paper category (172 classes) given the paper title.
The dataset contains more than 2 million samples collected from 2002 to 2017.
Titles belonging to the same year are grouped into the same CSV file and stored together.
Each year is mapped to a day starting from 1/1/1970.
2 changes: 1 addition & 1 deletion benchmark/wildtime_benchmarks/benchmark_utils.py
@@ -57,7 +57,7 @@ def setup_logger():


def create_fake_timestamp(year: int, base_year: int) -> int:
timestamp = ((year - base_year) * DAY_LENGTH_SECONDS) + 1
timestamp = ((year - base_year) * DAY_LENGTH_SECONDS)
return timestamp


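With the stray `+ 1` removed, the helper now lines up exactly with the README's year-to-day remapping: the base year maps to timestamp 0 (1/Jan/1970) and every following year advances by one day. A minimal usage sketch, assuming DAY_LENGTH_SECONDS is the module-level constant for one day in seconds (86400):

DAY_LENGTH_SECONDS = 24 * 60 * 60  # assumed value of the module constant

def create_fake_timestamp(year: int, base_year: int) -> int:
    # one "day" of Unix time per year elapsed since the base year
    return (year - base_year) * DAY_LENGTH_SECONDS

assert create_fake_timestamp(1930, base_year=1930) == 0      # 1/Jan/1970
assert create_fake_timestamp(1931, base_year=1930) == 86400  # 2/Jan/1970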
98 changes: 76 additions & 22 deletions benchmark/wildtime_benchmarks/data_generation_arxiv.py
@@ -1,10 +1,8 @@
import os
import pickle

import torch
from benchmark_utils import create_timestamp, download_if_not_exists, setup_argparser_wildtime, setup_logger
from torch.utils.data import Dataset
from tqdm import tqdm

logger = setup_logger()

@@ -17,19 +15,27 @@ def main():
ArXivDownloader(args.dir).store_data(args.all, args.dummyyear)


# There are some lines in the train dataset that are corrupted, i.e. the csv file wrapper cannot properly read the data.
# We remove these lines from the dataset.
corrupted_idx_dict = {
2007: [33213],
2008: [22489],
2009: [64621, 165454],
2015: [42007, 94935],
2016: [111398],
2019: [41309, 136814],
2020: [102074],
2021: [32013, 55660]
}


class ArXivDownloader(Dataset):
time_steps = [i for i in range(2007, 2023)]
input_dim = 55
num_classes = 172
drive_id = "1H5xzHHgXl8GOMonkb6ojye-Y2yIp436V"
file_name = "arxiv.pkl"

def __getitem__(self, idx):
return self._dataset["title"][idx], torch.LongTensor([self._dataset["category"][idx]])[0]

def __len__(self):
return len(self._dataset["category"])

def __init__(self, data_dir):
super().__init__()

@@ -43,40 +49,88 @@ def __init__(self, data_dir):
self._dataset = datasets
self.path = data_dir

def store_data(self, store_all_data: bool, add_final_dummy_year: bool):
for year in tqdm(self._dataset):
def store_data(self, create_test_data: bool, add_final_dummy_year: bool):
# create directories
if not os.path.exists(self.path):
os.mkdir(self.path)

train_dir = os.path.join(self.path, "train")
os.makedirs(train_dir, exist_ok=True)

if create_test_data:
test_dir = os.path.join(self.path, "test")
os.makedirs(test_dir, exist_ok=True)

stats = {}

for year in self._dataset:
# for simplicity, instead of using real years we map each year to a day starting from 1/1/1970
year_timestamp = create_timestamp(year=1970, month=1, day=year-2006)
year_rows = []

splits = [0, 1] if store_all_data else [0]
for split in splits:
def get_split_by_id(split: int) -> list[str]:
rows = []
for i in range(len(self._dataset[year][split]["title"])):
text = self._dataset[year][split]["title"][i].replace("\n", " ")
label = self._dataset[year][split]["category"][i]
csv_row = f"{text}\t{label}"
year_rows.append(csv_row)
rows.append(csv_row)
return rows

# store the year file
text_file = os.path.join(self.path, f"{year}.csv")
with open(text_file, "w", encoding="utf-8") as f:
f.write("\n".join(year_rows))
train_year_rows = get_split_by_id(0)
train_year_rows = self.filter_corrupted_lines(year, train_year_rows)
train_file = os.path.join(train_dir, f"{year}.csv")
with open(train_file, "w", encoding="utf-8") as f:
f.write("\n".join(train_year_rows))

# set timestamp
os.utime(text_file, (year_timestamp, year_timestamp))
os.utime(train_file, (year_timestamp, year_timestamp))

if create_test_data:
test_year_rows = get_split_by_id(1)
test_file = os.path.join(test_dir, f"{year}.csv")
with open(test_file, "w", encoding="utf-8") as f:
f.write("\n".join(test_year_rows))

# set timestamp
os.utime(test_file, (year_timestamp, year_timestamp))
stats[year] = {"train": len(train_year_rows), "test": len(test_year_rows)}
else:
stats[year] = {"train": len(train_year_rows)}
with open(os.path.join(self.path, "overall_stats.json"), "w") as f:
import json
json.dump(stats, f, indent=4)

if add_final_dummy_year:
dummy_year = year + 1
year_timestamp = create_timestamp(year=1970, month=1, day= dummy_year - 2006)
text_file = os.path.join(self.path, f"{dummy_year}.csv")
with open(text_file, "w", encoding="utf-8") as f:
train_dummy_file = os.path.join(train_dir, f"{dummy_year}.csv")
with open(train_dummy_file, "w", encoding="utf-8") as f:
f.write("\n".join(["dummy\t0"]))

# set timestamp
os.utime(text_file, (year_timestamp, year_timestamp))
os.utime(train_dummy_file, (year_timestamp, year_timestamp))

if create_test_data:
test_dummy_file = os.path.join(test_dir, f"{dummy_year}.csv")
with open(test_dummy_file, "w", encoding="utf-8") as f:
f.write("\n".join(["dummy\t0"]))

# set timestamp
os.utime(test_dummy_file, (year_timestamp, year_timestamp))

os.remove(os.path.join(self.path, "arxiv.pkl"))

@staticmethod
def filter_corrupted_lines(year, rows):
if year in corrupted_idx_dict:
corrupted_idx = corrupted_idx_dict[year]
goodlines = []
for i, l in enumerate(rows):
if i not in corrupted_idx:
goodlines.append(l)
return goodlines
return rows


if __name__ == "__main__":
main()
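For reference, the index-based filtering above amounts to dropping every row whose position appears in corrupted_idx_dict for that year. A condensed, equivalent sketch with a toy mapping (not the commit's code or the real indices):

corrupted_idx_dict = {2008: [1]}  # toy mapping for illustration: drop row index 1 for 2008

def filter_corrupted_lines(year: int, rows: list[str]) -> list[str]:
    corrupted_idx = set(corrupted_idx_dict.get(year, []))
    # rows are dropped by their position in the year's split, not by content
    return [row for i, row in enumerate(rows) if i not in corrupted_idx]

assert filter_corrupted_lines(2008, ["a\t0", "b\t1", "c\t2"]) == ["a\t0", "c\t2"]
assert filter_corrupted_lines(2007, ["a\t0"]) == ["a\t0"]  # years without corrupted rows pass through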
76 changes: 53 additions & 23 deletions benchmark/wildtime_benchmarks/data_generation_yearbook.py
@@ -37,43 +37,73 @@ def __init__(self, data_dir: str):
self._dataset = datasets
self.data_dir = data_dir

def _get_year_data(self, year: int, store_all_data: bool) -> list[Tuple]:
splits = [0, 1] if store_all_data else [0]
images = torch.FloatTensor(
np.array(
[ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
# Pytorch requires CHW format
img.transpose(2, 0, 1)
# _dataset has 3 dimensions [years][train=0,valid=1,test=2]["images"/"labels"]
for split in splits # just train if --all not specified, else test, train and val
for img in self._dataset[year][split]["images"]
]
def _get_year_data(self, year: int, create_test_data: bool) -> tuple[dict[str, list[tuple]], dict[str, int]]:
def get_split_by_id(split: int) -> list[Tuple]:
images = torch.FloatTensor(
np.array(
[ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
# Pytorch requires CHW format
img.transpose(2, 0, 1)
# _dataset has 2 dimensions [years][train=0,test=1]["images"/"labels"]
for img in self._dataset[year][split]["images"]
]
)
)
)
labels = torch.cat([torch.LongTensor(self._dataset[year][split]["labels"]) for split in splits])
return [(images[i], labels[i]) for i in range(len(images))]
labels = torch.LongTensor(self._dataset[year][split]["labels"])
return [(images[i], labels[i]) for i in range(len(images))]

if not create_test_data:
train_size = len(get_split_by_id(0))
ds = {"train": get_split_by_id(0)}
stats = { "train": train_size }
else:
train_size = len(get_split_by_id(0))
test_size = len(get_split_by_id(1))
ds = {"train": get_split_by_id(0), "test": get_split_by_id(1)}
stats = {"train": train_size, "test": test_size}
return ds, stats

def __len__(self) -> int:
return len(self._dataset["labels"])

def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None:
def store_data(self, create_test_data: bool, add_final_dummy_year: bool) -> None:
# create directories
if not os.path.exists(self.data_dir):
os.mkdir(self.data_dir)

train_dir = os.path.join(self.data_dir, "train")
os.makedirs(train_dir, exist_ok=True)

if create_test_data:
test_dir = os.path.join(self.data_dir, "test")
os.makedirs(test_dir, exist_ok=True)

overall_stats = {}
for year in self.time_steps:
print(f"Saving data for year {year}")
ds = self._get_year_data(year, store_all_data)
self.create_binary_file(ds,
os.path.join(self.data_dir, f"{year}.bin"),
ds, stats = self._get_year_data(year, create_test_data)
overall_stats[year] = stats
self.create_binary_file(ds["train"],
os.path.join(train_dir, f"{year}.bin"),
create_fake_timestamp(year, base_year=1930))
if create_test_data:
self.create_binary_file(ds["test"],
os.path.join(test_dir, f"{year}.bin"),
create_fake_timestamp(year, base_year=1930))

with open(os.path.join(self.data_dir, "overall_stats.json"), "w") as f:
import json
json.dump(overall_stats, f, indent=4)

if add_final_dummy_year:
dummy_year = year + 1
dummy_data = [ ds[0] ] # get one sample from the previous year
dummy_data = [ ds["train"][0] ] # get one sample from the previous year
self.create_binary_file(dummy_data,
os.path.join(self.data_dir, f"{dummy_year}.bin"),
os.path.join(train_dir, f"{dummy_year}.bin"),
create_fake_timestamp(dummy_year, base_year=1930))
if create_test_data:
self.create_binary_file(dummy_data,
os.path.join(test_dir, f"{dummy_year}.bin"),
create_fake_timestamp(dummy_year, base_year=1930))

os.remove(os.path.join(self.data_dir, "yearbook.pkl"))

@@ -85,9 +115,9 @@ def create_binary_file(data, output_file_name: str, timestamp: int) -> None:
label_integer = tensor2.item()

features_size = len(features_bytes)
assert features_size == 4096
assert features_size == 12288

f.write(int.to_bytes(label_integer, length=4, byteorder="little"))
f.write(int.to_bytes(label_integer, length=4, byteorder="big"))
f.write(features_bytes)

os.utime(output_file_name, (timestamp, timestamp))
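A minimal sketch of reading one record back from a generated yearbook .bin file, assuming the layout written by create_binary_file above: a 4-byte big-endian label followed by 3 * 32 * 32 float32 features, i.e. the 12292-byte record_size (4 + 12288) used in the storage config below:

import numpy as np

LABEL_SIZE = 4
FEATURE_SIZE = 3 * 32 * 32 * 4            # 12288 bytes of float32 pixel data
RECORD_SIZE = LABEL_SIZE + FEATURE_SIZE   # 12292, matching the BinaryFileWrapper config

def read_first_record(path: str) -> tuple[int, np.ndarray]:
    with open(path, "rb") as f:
        record = f.read(RECORD_SIZE)
    label = int.from_bytes(record[:LABEL_SIZE], byteorder="big")
    image = np.frombuffer(record[LABEL_SIZE:], dtype=np.float32).reshape(3, 32, 32)
    return label, image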
6 changes: 4 additions & 2 deletions benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -41,10 +41,12 @@ data:
dataset_id: yearbook
transformations: []
bytes_parser_function: |
import warnings
import torch
import numpy as np
def bytes_parser_function(data: memoryview) -> torch.Tensor:
return torch.from_numpy(np.frombuffer(data, dtype=np.float32))
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
return torch.frombuffer(data, dtype=torch.float32).reshape(3, 32, 32)
trigger:
id: TimeTrigger
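The reshape to (3, 32, 32) matches the 12288-byte float32 payload stored per sample, and the warnings filter is presumably there to silence the UserWarning that torch.frombuffer emits for non-writable buffers such as the memoryview handed in by the storage layer.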
45 changes: 40 additions & 5 deletions modyn/config/examples/modyn_config.yaml
@@ -50,15 +50,33 @@ storage:
},
{
name: "yearbook",
description: "Yearbook Dataset from Wild-Time",
description: "Yearbook Dataset from Wild-Time (training set)",
version: "0.0.1",
base_path: "/datasets/yearbook",
base_path: "/datasets/yearbook/train",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "BinaryFileWrapper",
file_wrapper_config:
{
byteorder: "big",
record_size: 4100,
record_size: 12292,
label_size: 4,
file_extension: ".bin"
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 256,
},
{
name: "yearbook-test",
description: "Yearbook Dataset from Wild-Time (test set)",
version: "0.0.1",
base_path: "/datasets/yearbook/test",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "BinaryFileWrapper",
file_wrapper_config:
{
byteorder: "big",
record_size: 12292,
label_size: 4,
file_extension: ".bin"
},
@@ -101,9 +119,26 @@ storage:
},
{
name: "arxiv",
description: "Arxiv Dataset (from Wild-time)",
description: "Arxiv Dataset from Wild-time (training set)",
version: "0.0.1",
base_path: "/datasets/arxiv/train",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config:
{
file_extension: ".csv",
separator: "\t", #tsv best option here since sentences contain commas and semicolons
label_index: 1
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,
},
{
name: "arxiv-test",
description: "Arxiv Dataset from Wild-time (test set)",
version: "0.0.1",
base_path: "/datasets/arxiv",
base_path: "/datasets/arxiv/test",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config:
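The generated arxiv files are tab-separated with the paper title in the first column and the class id in the second, which is what separator: "\t" and label_index: 1 above encode. A minimal parsing sketch (the row is a placeholder, not real data):

row = "placeholder paper title\t42"   # placeholder, not taken from the dataset

title, label_str = row.split("\t")
label = int(label_str)                # label_index 1 -> the second column holds the category id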
