From 48710687a154b2c9ee2476013f28306a46dde8e0 Mon Sep 17 00:00:00 2001
From: Kyle Matoba <22180455+kylematoba@users.noreply.github.com>
Date: Tue, 22 Oct 2024 10:08:32 +0200
Subject: [PATCH] test commit

---
 validation_driver.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 validation_driver.py

diff --git a/validation_driver.py b/validation_driver.py
new file mode 100644
index 00000000..182f28a7
--- /dev/null
+++ b/validation_driver.py
@@ -0,0 +1,37 @@
+import datetime as dt
+
+import nanotron.config
+from nanotron.trainer import DistributedTrainer
+from nanotron.data.nanoset import Nanoset
+from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs
+
+
+if __name__ == "__main__":
+    fullfilename = "/Users/kylematoba/cscs/pretrain/config_dir/nanotron/fedf_ablation_3B.yaml"
+    cfg = nanotron.config.get_config_from_file(fullfilename)
+
+    # trainer = DistributedTrainer(fullfilename)
+
+    dataset_list = cfg.validation.datasets
+    vds = dataset_list[0]
+    data = vds["data"]
+    del vds
+
+    # tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    # cfg.tokens.train_steps
+    # token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
+    token_size = 2
+    global_batch_size = cfg.tokens.micro_batch_size * cfg.tokens.batch_accumulation_per_replica * cfg.parallelism.dp
+
+    df = data["dataset"]["dataset_folder"]
+    folders = list(df.keys())
+    weights = [df[_] for _ in folders]
+    train_dataset = Nanoset(
+        dataset_folders=folders,
+        dataset_weights=weights,
+        sequence_length=cfg.tokens.sequence_length,
+        token_size=token_size,
+        train_split_num_samples=cfg.tokens.train_steps * global_batch_size,
+        random_seed=data["seed"],
+    )
+    print(f"done {dt.datetime.utcnow()}")
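
A minimal smoke-test sketch that could follow the Nanoset construction above (not part of the commit). It assumes Nanoset is a map-style dataset whose __getitem__ returns a dict containing an "input_ids" array, which matches nanotron's Nanoset at the time of writing but should be verified against your checkout:

    # Smoke test (assumption: Nanoset samples are dicts with an "input_ids"
    # array; verify against your nanotron version). Pull one sample and
    # report its shape and dtype.
    import numpy as np

    print(f"num samples: {len(train_dataset)}")
    sample = train_dataset[0]
    tokens = np.asarray(sample["input_ids"])
    print(f"sample 0: shape={tokens.shape}, dtype={tokens.dtype}")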