Skip to content

Commit

Permalink
test commit
Browse files Browse the repository at this point in the history
  • Loading branch information
kylematoba committed Oct 22, 2024
1 parent 6216fab commit 4871068
Showing 1 changed file with 37 additions and 0 deletions.
37 changes: 37 additions & 0 deletions validation_driver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import datetime as dt

import nanotron.config
from nanotron.trainer import DistributedTrainer
from nanotron.data.nanoset import Nanoset
from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs


if __name__ == "__main__":
    # Driver script: load a nanotron config, take the first entry of its
    # validation dataset list, and construct a Nanoset from it. Prints a
    # timestamp once the dataset object has been built.
    #
    # NOTE(review): the config path below is an absolute, machine-specific
    # path; it must be edited before running on another machine.
    fullfilename = "/Users/kylematoba/cscs/pretrain/config_dir/nanotron/fedf_ablation_3B.yaml"
    cfg = nanotron.config.get_config_from_file(fullfilename)

    # Only the first validation dataset entry is exercised.
    data = cfg.validation.datasets[0]["data"]

    # token_size is fixed at 2 here. The previously sketched (disabled) rule
    # derived it from the tokenizer instead:
    #   token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
    # TODO(review): confirm 2 is correct for the tokenizer used by this config.
    token_size = 2
    global_batch_size = (
        cfg.tokens.micro_batch_size
        * cfg.tokens.batch_accumulation_per_replica
        * cfg.parallelism.dp
    )

    # The config's "dataset_folder" mapping is split into two parallel lists:
    # its keys are passed as the dataset folders, its values as the weights.
    folder_weights = data["dataset"]["dataset_folder"]
    folders = list(folder_weights)
    weights = [folder_weights[folder] for folder in folders]

    train_dataset = Nanoset(
        dataset_folders=folders,
        dataset_weights=weights,
        sequence_length=cfg.tokens.sequence_length,
        token_size=token_size,
        train_split_num_samples=cfg.tokens.train_steps * global_batch_size,
        random_seed=data["seed"],
    )

    # datetime.utcnow() is deprecated since Python 3.12; this is the
    # documented equivalent and still yields a naive UTC timestamp, so the
    # printed format is unchanged.
    finished_at = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    print(f"done {finished_at}")

0 comments on commit 4871068

Please sign in to comment.