
Merge branch 'main' into features/vgs/storage_cp
vGsteiger committed Oct 16, 2023
2 parents ac2b637 + d37d9bf commit ed8a32e
Showing 112 changed files with 5,741 additions and 407 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/workflow.yaml
@@ -335,8 +335,6 @@ jobs:
run: docker run modynbase mamba run -n modyn bash -c "pip install -r dev-requirements.txt && echo Running pytest && pytest"


# Tests whether docker-compose up starts all components successfully and integration tests run through
# Only one job to reduce Github CI usage
integrationtests:
timeout-minutes: 60
runs-on: ubuntu-latest
@@ -347,7 +345,6 @@ jobs:
- unittests
- isort
- black
- dockerized-unittests

steps:
- name: Check out code
3 changes: 3 additions & 0 deletions .gitignore
@@ -66,3 +66,6 @@ cmake-build-debug
environment.yml.original
docker-compose.yml.original
Dockerfile.original

# Experimental things
plots/
1 change: 1 addition & 0 deletions .pylintrc
@@ -61,6 +61,7 @@ ignore-paths=^modyn/trainer_server/internal/grpc/generated/.*$,
^modyn/models/dlrm/cuda_src/.*$,
^modyn/models/dlrm/utils/.*$,
^modyn/models/dlrm/nn/.*$,
^modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/.*$,

# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths. The default value ignores Emacs file
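As a quick sanity check (illustrative, not part of the commit), the new ignore-paths entry matches files under the deepcore_utils tree:

```python
# Illustrative check that the new ignore-paths pattern matches the intended tree;
# the file name shuffle.py is a made-up example.
import re

pattern = r"^modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/.*$"
assert re.match(pattern, "modyn/trainer_server/internal/trainer/remote_downsamplers/deepcore_utils/shuffle.py")
```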
4 changes: 3 additions & 1 deletion benchmark/criteo_1TB/pipelines/exp0_finetune.yml
@@ -44,7 +44,9 @@ training:
gpus: 1
device: "cuda:0"
amp: True
dataloader_workers: 8
dataloader_workers: 16
num_prefetched_partitions: 4
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
4 changes: 3 additions & 1 deletion benchmark/criteo_1TB/pipelines/exp1_finetune_ablation.yml
@@ -44,7 +44,9 @@ training:
gpus: 1
device: "cuda:0"
amp: True
dataloader_workers: 8
dataloader_workers: 16
num_prefetched_partitions: 4
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
4 changes: 3 additions & 1 deletion benchmark/criteo_1TB/pipelines/exp2_retrain_keep_model.yml
@@ -44,7 +44,9 @@ training:
gpus: 1
device: "cuda:0"
amp: True
dataloader_workers: 8
dataloader_workers: 16
num_prefetched_partitions: 4
parallel_prefetch_requests: 2
use_previous_model: True
initial_model: random
initial_pass:
4 changes: 3 additions & 1 deletion benchmark/criteo_1TB/pipelines/exp3_retrain_new_model.yml
@@ -44,7 +44,9 @@ training:
gpus: 1
device: "cuda:0"
amp: True
dataloader_workers: 8
dataloader_workers: 16
num_prefetched_partitions: 4
parallel_prefetch_requests: 2
use_previous_model: False # Same amount of computation (retraining on all data), but on different starting weights
initial_model: random
initial_pass:
4 changes: 3 additions & 1 deletion benchmark/criteo_1TB/pipelines/exp4_current_day_only.yml
@@ -44,7 +44,9 @@ training:
gpus: 1
device: "cuda:0"
amp: True
dataloader_workers: 8
dataloader_workers: 16
num_prefetched_partitions: 4
parallel_prefetch_requests: 2
use_previous_model: False
initial_model: random
initial_pass:
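All five Criteo pipelines above gain the same three dataloader knobs. A rough sketch of the behavior they suggest (illustrative only, not Modyn's actual dataloader implementation): up to num_prefetched_partitions partitions are buffered ahead of the trainer, with at most parallel_prefetch_requests fetches in flight at once.

```python
# Illustrative sketch only -- not Modyn's actual dataloader implementation.
# Keep up to `num_prefetched_partitions` partitions in flight, fetching at
# most `parallel_prefetch_requests` of them concurrently.
from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor

def load_partition(partition_id: int) -> list[int]:
    # Stand-in for a real fetch from the storage component.
    return [partition_id]

def prefetched(partition_ids, num_prefetched_partitions: int = 4,
               parallel_prefetch_requests: int = 2):
    with ThreadPoolExecutor(max_workers=parallel_prefetch_requests) as pool:
        window: deque[Future] = deque()
        for pid in partition_ids:
            window.append(pool.submit(load_partition, pid))
            if len(window) >= num_prefetched_partitions:
                yield window.popleft().result()
        while window:
            yield window.popleft().result()

for batch in prefetched(range(10)):
    print(batch)  # train on batch
```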
13 changes: 13 additions & 0 deletions benchmark/wildtime_benchmarks/benchmark_utils.py
@@ -31,6 +31,19 @@ def setup_argparser_wildtime(dataset: str) -> argparse.ArgumentParser:
"--dir", type=pathlib.Path, action="store", help="Path to data directory"
)

parser_.add_argument(
"--all", action="store_true", help="Store all the available data, including the validation and test sets."
)
parser_.add_argument(
"--dummyyear", action="store_true", help="Add a final dummy year to train also on the last trigger in Modyn"
)

if dataset == "fMoW":
parser_.add_argument(
"--daily", action="store_true", help="If specified, data is stored with real timestamps (dd/mm/yy)."
"Otherwise, only the year is considered (as done in the other "
"datasets).")

return parser_


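A short sketch of the new flags in use; this assumes it runs next to benchmark_utils.py, and /tmp/fmow is a placeholder path:

```python
# Sketch of the new CLI flags (--daily is only registered for the fMoW dataset).
from benchmark_utils import setup_argparser_wildtime

parser = setup_argparser_wildtime("fMoW")
args = parser.parse_args(["--dir", "/tmp/fmow", "--all", "--dummyyear", "--daily"])
assert args.all and args.dummyyear and args.daily
```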
27 changes: 20 additions & 7 deletions benchmark/wildtime_benchmarks/data_generation_arxiv.py
@@ -14,7 +14,7 @@ def main():
args = parser.parse_args()

logger.info(f"Downloading data to {args.dir}")
ArXivDownloader(args.dir).store_data()
ArXivDownloader(args.dir).store_data(args.all, args.dummyyear)


class ArXivDownloader(Dataset):
@@ -43,16 +43,19 @@ def __init__(self, data_dir):
self._dataset = datasets
self.path = data_dir

def store_data(self):
def store_data(self, store_all_data: bool, add_final_dummy_year: bool):
for year in tqdm(self._dataset):
            # for simplicity, instead of using years, we map each year to a day starting from 1970
year_timestamp = create_timestamp(year=1970, month=1, day=year-2006)
year_rows = []
for i in range(len(self._dataset[year][0]["title"])):
text = self._dataset[year][0]["title"][i].replace("\n", " ")
label = self._dataset[year][0]["category"][i]
csv_row = f"{text}\t{label}"
year_rows.append(csv_row)

splits = [0, 1] if store_all_data else [0]
for split in splits:
for i in range(len(self._dataset[year][split]["title"])):
text = self._dataset[year][split]["title"][i].replace("\n", " ")
label = self._dataset[year][split]["category"][i]
csv_row = f"{text}\t{label}"
year_rows.append(csv_row)

# store the year file
text_file = os.path.join(self.path, f"{year}.csv")
@@ -62,6 +65,16 @@ def store_data(self):
# set timestamp
os.utime(text_file, (year_timestamp, year_timestamp))

if add_final_dummy_year:
dummy_year = year + 1
            year_timestamp = create_timestamp(year=1970, month=1, day=dummy_year - 2006)
text_file = os.path.join(self.path, f"{dummy_year}.csv")
with open(text_file, "w", encoding="utf-8") as f:
f.write("\n".join(["dummy\t0"]))

# set timestamp
os.utime(text_file, (year_timestamp, year_timestamp))

os.remove(os.path.join(self.path, "arxiv.pkl"))


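The mapping above turns each arXiv year into a consecutive day in January 1970. Assuming create_timestamp returns the Unix timestamp of a UTC date (its body is not part of this diff), the first dataset year lands exactly on the epoch:

```python
# Sketch of the year -> timestamp mapping used above; this assumes
# create_timestamp returns the Unix timestamp of a UTC date (the helper's
# real implementation is not shown in this diff).
from datetime import datetime, timezone

def create_timestamp(year: int, month: int, day: int) -> int:
    return int(datetime(year, month, day, tzinfo=timezone.utc).timestamp())

print(create_timestamp(year=1970, month=1, day=2007 - 2006))  # 0     -> 1970-01-01
print(create_timestamp(year=1970, month=1, day=2008 - 2006))  # 86400 -> 1970-01-02
```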
72 changes: 47 additions & 25 deletions benchmark/wildtime_benchmarks/data_generation_fmow.py
@@ -4,7 +4,7 @@
import shutil
from datetime import datetime

from benchmark_utils import download_if_not_exists, setup_argparser_wildtime, setup_logger
from benchmark_utils import create_timestamp, download_if_not_exists, setup_argparser_wildtime, setup_logger
from torch.utils.data import Dataset
from tqdm import tqdm
from wilds import get_dataset
@@ -13,13 +13,13 @@


def main() -> None:
parser = setup_argparser_wildtime("FMoW")
parser = setup_argparser_wildtime("fMoW")
args = parser.parse_args()

logger.info(f"Downloading data to {args.dir}")

downloader = FMOWDownloader(args.dir)
downloader.store_data()
downloader.store_data(args.daily, args.all, args.dummyyear)
downloader.clean_folder()


@@ -59,30 +59,52 @@ def move_file_and_rename(self, index: int) -> None:
new_name = os.path.join(self.data_dir, f"{index}.png")
os.rename(dest_file, new_name)

def store_data(self) -> None:
def store_data(self, store_daily: bool, store_all_data: bool, add_final_dummy_year: bool) -> None:

for year in tqdm(self._dataset):
split = 0 # just use training split for now
for i in range(len(self._dataset[year][split]["image_idxs"])):
index = self._dataset[year][split]["image_idxs"][i]
label = self._dataset[year][split]["labels"][i]
raw_timestamp = self.metadata[index]["timestamp"]

if len(raw_timestamp) == 24:
timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
else:
timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%SZ')

# save label
label_file = os.path.join(self.data_dir, f"{index}.label")
with open(label_file, "w", encoding="utf-8") as f:
f.write(str(int(label)))
os.utime(label_file, (timestamp.timestamp(), timestamp.timestamp()))

# set image timestamp
self.move_file_and_rename(index)
image_file = os.path.join(self.data_dir, f"{index}.png")
os.utime(image_file, (timestamp.timestamp(), timestamp.timestamp()))
splits = [0, 1] if store_all_data else [0]
for split in splits:
for i in range(len(self._dataset[year][split]["image_idxs"])):
index = self._dataset[year][split]["image_idxs"][i]
label = self._dataset[year][split]["labels"][i]

if store_daily:
raw_timestamp = self.metadata[index]["timestamp"]

if len(raw_timestamp) == 24:
timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp()
else:
timestamp = datetime.strptime(raw_timestamp, '%Y-%m-%dT%H:%M:%SZ').timestamp()
else:
timestamp = create_timestamp(year=1970, month=1, day=year+1)

# save label
label_file = os.path.join(self.data_dir, f"{index}.label")
with open(label_file, "w", encoding="utf-8") as f:
f.write(str(int(label)))
os.utime(label_file, (timestamp, timestamp))

# set image timestamp
self.move_file_and_rename(index)
image_file = os.path.join(self.data_dir, f"{index}.png")
os.utime(image_file, (timestamp, timestamp))

if add_final_dummy_year:
dummy_year = year + 1
timestamp = create_timestamp(year=1970, month=1, day=dummy_year+1)
            dummy_index = 1000000  # not used by any real sample (last: 99999)

to_copy_image_file = os.path.join(self.data_dir, f"{index}.png")
dummy_image_file = os.path.join(self.data_dir, f"{dummy_index}.png")
shutil.copy(to_copy_image_file, dummy_image_file)
os.utime(dummy_image_file, (timestamp, timestamp))

to_copy_label_file = os.path.join(self.data_dir, f"{index}.label")
dummy_label_file = os.path.join(self.data_dir, f"{dummy_index}.label")
shutil.copy(to_copy_label_file, dummy_label_file)
os.utime(dummy_label_file, (timestamp, timestamp))



@staticmethod
def parse_metadata(data_dir: str) -> list:
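The two strptime branches above correspond to the two ISO-8601 variants found in the fMoW metadata; a self-contained sketch:

```python
# The fMoW metadata timestamps come in two ISO-8601 variants:
#   24 chars: '2002-05-23T14:03:12.345Z' (fractional seconds)
#   20 chars: '2002-05-23T14:03:12Z'     (whole seconds)
from datetime import datetime

def parse_fmow_timestamp(raw: str) -> float:
    fmt = "%Y-%m-%dT%H:%M:%S.%fZ" if len(raw) == 24 else "%Y-%m-%dT%H:%M:%SZ"
    return datetime.strptime(raw, fmt).timestamp()

print(parse_fmow_timestamp("2002-05-23T14:03:12.345Z"))
print(parse_fmow_timestamp("2002-05-23T14:03:12Z"))
```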
27 changes: 20 additions & 7 deletions benchmark/wildtime_benchmarks/data_generation_huffpost.py
@@ -15,7 +15,7 @@ def main():
args = parser.parse_args()

logger.info(f"Downloading data to {args.dir}")
HuffpostDownloader(args.dir).store_data()
HuffpostDownloader(args.dir).store_data(args.all, args.dummyyear)


class HuffpostDownloader(Dataset):
@@ -44,15 +44,17 @@ def __init__(self, data_dir: str):
self._dataset = datasets
self.path = data_dir

def store_data(self) -> None:
def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None:
for year in tqdm(self._dataset):
year_timestamp = create_timestamp(year=1970, month=1, day=year-2011)
year_rows = []
for i in range(len(self._dataset[year][0]["headline"])):
text = self._dataset[year][0]["headline"][i]
label = self._dataset[year][0]["category"][i]
csv_row = f"{text}\t{label}"
year_rows.append(csv_row)
splits = [0, 1] if store_all_data else [0]
for split in splits:
for i in range(len(self._dataset[year][split]["headline"])):
text = self._dataset[year][split]["headline"][i]
label = self._dataset[year][split]["category"][i]
csv_row = f"{text}\t{label}"
year_rows.append(csv_row)

# store the sentences
text_file = os.path.join(self.path, f"{year}.csv")
@@ -62,6 +64,17 @@ def store_data(self) -> None:
# set timestamp
os.utime(text_file, (year_timestamp, year_timestamp))

if add_final_dummy_year:
dummy_year = year + 1
            year_timestamp = create_timestamp(year=1970, month=1, day=dummy_year - 2011)
text_file = os.path.join(self.path, f"{dummy_year}.csv")
with open(text_file, "w", encoding="utf-8") as f:
f.write("\n".join(["dummy\t0"]))

# set timestamp
os.utime(text_file, (year_timestamp, year_timestamp))


os.remove(os.path.join(self.path, "huffpost.pkl"))


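Each {year}.csv written above is a tab-separated file of headline and label. A minimal read-back sketch (the file name and rows here are made up):

```python
# Minimal read-back sketch for the TSV files written above.
rows = ["Some headline\t5", "Another headline\t12"]
with open("2012.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(rows))

with open("2012.csv", encoding="utf-8") as f:
    pairs = [line.rstrip("\n").split("\t", maxsplit=1) for line in f]

for headline, label in pairs:
    print(headline, label)
```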
25 changes: 17 additions & 8 deletions benchmark/wildtime_benchmarks/data_generation_yearbook.py
@@ -17,7 +17,7 @@ def main():
logger.info(f"Downloading data to {args.dir}")

downloader = YearbookDownloader(args.dir)
downloader.store_data()
downloader.store_data(args.all, args.dummyyear)


class YearbookDownloader(Dataset):
@@ -38,35 +38,44 @@ def __init__(self, data_dir: str):
self._dataset = datasets
self.data_dir = data_dir

def _get_year_data(self, year: int) -> list[Tuple]:
def _get_year_data(self, year: int, store_all_data: bool) -> list[Tuple]:
splits = [0, 1] if store_all_data else [0]
images = torch.FloatTensor(
np.array(
[ # transpose to transform from HWC to CHW (H=height, W=width, C=channels).
# Pytorch requires CHW format
img.transpose(2, 0, 1)[0].reshape(*self.input_dim)
# _dataset has 3 dimensions [years][train=0,valid=1]["images"/"labels"]
for img in self._dataset[year][0]["images"]
img.transpose(2, 0, 1)[split].reshape(*self.input_dim)
# _dataset has 3 dimensions [years][train=0,valid=1,test=2]["images"/"labels"]
                    for split in splits  # just train if --all is not specified, else train and val
for img in self._dataset[year][split]["images"]
]
)
)
labels = torch.LongTensor(self._dataset[year][0]["labels"])
labels = torch.cat([torch.LongTensor(self._dataset[year][split]["labels"]) for split in splits])
return [(images[i], labels[i]) for i in range(len(images))]

def __len__(self) -> int:
return len(self._dataset["labels"])

def store_data(self) -> None:
def store_data(self, store_all_data: bool, add_final_dummy_year: bool) -> None:
# create directories
if not os.path.exists(self.data_dir):
os.mkdir(self.data_dir)

for year in self.time_steps:
print(f"Saving data for year {year}")
ds = self._get_year_data(year)
ds = self._get_year_data(year, store_all_data)
self.create_binary_file(ds,
os.path.join(self.data_dir, f"{year}.bin"),
create_fake_timestamp(year, base_year=1930))

if add_final_dummy_year:
dummy_year = year + 1
            dummy_data = [ds[0]]  # get one sample from the previous year
self.create_binary_file(dummy_data,
os.path.join(self.data_dir, f"{dummy_year}.bin"),
create_fake_timestamp(dummy_year, base_year=1930))

os.remove(os.path.join(self.data_dir, "yearbook.pkl"))

@staticmethod
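A worked example (synthetic array) of the HWC-to-CHW transpose used in _get_year_data; PyTorch expects channels-first tensors:

```python
# HWC -> CHW transpose, as in the yearbook downloader above.
import numpy as np

img_hwc = np.zeros((32, 32, 3))        # height, width, channels
img_chw = img_hwc.transpose(2, 0, 1)   # channels, height, width
assert img_chw.shape == (3, 32, 32)
plane = img_chw[0]                     # one (32, 32) channel plane
```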
1 change: 1 addition & 0 deletions black.toml
@@ -7,4 +7,5 @@ extend-exclude = """\
.*/*\\_pb2.py|\
.*/generated/.*\
.*/benchmark/.*\
.*/plotting/.*\
"""
4 changes: 3 additions & 1 deletion environment.yml
@@ -19,14 +19,16 @@ dependencies:
- tqdm
- conda-forge::enlighten
- protobuf
- grpcio
- pip:
- grpcio
- jsonschema
- psycopg2
- sqlalchemy>=2.0
- pyaml
- numpy
- pandas
- tensorboard
- scipy
- pyftpdlib
- types-protobuf
- types-psycopg2
Expand Down
1 change: 1 addition & 0 deletions experiments/criteo_online_dataset/README.md
@@ -0,0 +1 @@
This is an experiment to evaluate the performance of the OnlineDataset with the Criteo dataset. If you are just a user and not a developer of Modyn, you can safely ignore this.