# config.yaml

# Pretraining data location, optimization hyperparameters, and trainer options.
# Booleans are written in canonical lowercase (true/false) for parser portability.
file_path: 'data/pretrain.csv'          # pretrain data path

epochs: 30                              # total number of epochs
batch_size: 100                         # batch size
lr_rate: 0.00005                        # learning rate
scheduler_type: 'linear'                # scheduler type
weight_decay: 0.0                       # weight decay for AdamW
warmup_ratio: 0.05                      # warm-up ratio for scheduler
save_strategy: 'epoch'                  # save strategy of trainer
overwrite_output_dir: true              # whether to overwrite the output directory
save_total_limit: 3                     # save total limit of trainer
fp16: true                              # whether to use 16-bit float precision
logging_strategy: 'epoch'               # logging frequency
evaluation_strategy: 'epoch'            # validation frequency
report_to: 'tensorboard'                # integrations to report the results and logs to
dataloader_num_workers: 18              # number of subprocesses to use for data loading
sharded_ddp: false                      # option of Sharded DDP training
save_path: 'ckpt/pretrain.pt'           # logging and save path of the pretrained model
load_checkpoint: false                  # NOTE(review): presumably resumes from a saved checkpoint when true - confirm against the training script

# Transformer model and masked-language-modeling (MLM) settings.
max_position_embeddings: 514            # maximum number of position embeddings in the Transformer
blocksize: 175                          # upper bound on sequence length after tokenization
num_attention_heads: 12                 # attention heads per hidden layer
num_hidden_layers: 6                    # count of hidden layers
hidden_dropout_prob: 0.1                # dropout probability applied to hidden layers
attention_probs_dropout_prob: 0.1       # dropout probability applied to attention
mlm_probability: 0.15                   # probability of masking in MLM