configs.py
from dataclasses import dataclass, field
from typing import Tuple


@dataclass
class GPTConfig:
    """Model architecture hyperparameters (GPT-2 124M-scale defaults)."""

    block_size: int = 1024   # maximum sequence / context length
    vocab_size: int = 50257  # 50k BPE merges + 256 byte tokens + <|endoftext|>
    n_layer: int = 12        # number of transformer blocks
    n_head: int = 12         # attention heads per block
    n_embed: int = 768       # embedding / hidden dimension


@dataclass
class GPTTrainConfig:
    """Training-loop settings."""

    micro_batch_size: int = 16
    tokens_per_batch: int = 524288  # 2**19, ~0.5M tokens per batch
    seed: int = 1337
    float32_matmul_precision: str = "high"
    val_microbatch_steps: int = 500  # micro-batches per validation pass
    val_interval: int = 300          # validate every N steps
    generate_interval: int = 300     # sample generations every N steps
    run_name: str = "fineweb-train-full"
    checkpoint_interval: int = 200   # checkpoint every N steps


@dataclass
class OptimizerConfig:
    """AdamW and learning-rate-schedule hyperparameters."""

    betas: Tuple[float, float] = (0.9, 0.95)
    weight_decay: float = 0.1
    eps: float = 1e-8
    clip_grad_max_norm: float = 1.0
    warmup_steps: int = 715
    max_lr: float = 6e-4
    min_lr: float = field(init=False)  # derived below, not passed by callers
    max_steps: int = 19073  # 10B tokens / tokens_per_batch

    def __post_init__(self):
        # Learning-rate floor: the schedule bottoms out at 10% of max_lr.
        self.min_lr = self.max_lr * 0.1
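
# ---------------------------------------------------------------------------
# Illustrative only: warmup_steps / max_lr / min_lr / max_steps are the usual
# ingredients of a linear-warmup + decay schedule, so a scheduler consuming
# OptimizerConfig might look like the sketch below. The cosine decay shape is
# an assumption; nothing in this file specifies it.
import math


def lr_at_step(step: int, cfg: OptimizerConfig) -> float:
    """Linear warmup to max_lr, then cosine decay to min_lr (hypothetical)."""
    if step < cfg.warmup_steps:
        # Warmup: scale linearly from ~0 up to max_lr.
        return cfg.max_lr * (step + 1) / cfg.warmup_steps
    if step >= cfg.max_steps:
        # Past the schedule horizon, hold the floor.
        return cfg.min_lr
    # Cosine decay from max_lr down to min_lr over the remaining steps.
    progress = (step - cfg.warmup_steps) / (cfg.max_steps - cfg.warmup_steps)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0
    return cfg.min_lr + coeff * (cfg.max_lr - cfg.min_lr)
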
@dataclass
class GPTDataConfig:
    """Dataset location and shard limits."""

    path: str = "fineweb_edu"
    limit_files: int = -1  # -1 means no limit (use all files)


@dataclass
class Config:
    """Top-level bundle of all sub-configs."""

    data_config: GPTDataConfig
    model_config: GPTConfig
    optimizer_config: OptimizerConfig
    train_config: GPTTrainConfig
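
# ---------------------------------------------------------------------------
# Usage sketch (assumption): this file never shows how the training script
# builds its Config, so the wiring below is illustrative. The derived values
# checked here follow directly from the defaults above.
if __name__ == "__main__":
    config = Config(
        data_config=GPTDataConfig(),
        model_config=GPTConfig(),
        optimizer_config=OptimizerConfig(),
        train_config=GPTTrainConfig(),
    )

    # min_lr is set in OptimizerConfig.__post_init__ to 10% of max_lr.
    assert config.optimizer_config.min_lr == config.optimizer_config.max_lr * 0.1

    # If each optimizer step accumulates micro-batches of block_size-token
    # sequences (an assumption), the implied gradient-accumulation factor is:
    grad_accum = config.train_config.tokens_per_batch // (
        config.train_config.micro_batch_size * config.model_config.block_size
    )
    print(f"gradient accumulation steps: {grad_accum}")  # 524288 // (16*1024) = 32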