config.yaml
model:
  dim: 768
  seq: 1024
  heads: 12
  layers: 12
  hidden_size: 2048
  vocab_size: 30000
  positional:
    name: rotary # Options: [absolute, rotary]. (default: rotary)
    parameters:
      theta: 10000 # (default: 10000)
  # Refer to https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity
  activation:
    name: SwiGLU # Options: [ReLU, LeakyReLU, PReLU, SiLU, GLU, GELU, SwiGLU]. (default: SwiGLU)
    parameters:
      approximate: none
  # Refer to https://pytorch.org/docs/stable/nn.html#normalization-layers
  norm:
    name: LayerNorm # Options: [LayerNorm, RMSNorm]. (default: LayerNorm)
    parameters:
      eps: 1e-05
  # Whether to add bias to the linear projections of query, key & value. (default: false)
  qkv_bias: false
  # Share weights between the embedding and output layers. (default: true)
  tie_weights: true
  # Grouped Query Attention. (default: disabled)
  # gqa:
  #   kv_heads: 4
loop:
  lr: 3e-4 # Ignored when the scheduler is enabled
  batch_size: 32
  optimizer:
    name: AdamW # Options: [Adam, AdamW, RMSProp]. (default: AdamW)
    parameters:
      betas: [0.9, 0.99]
      eps: 1e-8
      weight_decay: 1e-2
      fused: true
  # No. of warmup steps
  # warmup: 3000
  # Gradient clipping. (default: disabled)
  # gradient_clipping:
  #   type: norm # Options: [value, norm]. (default: norm)
  #   clip_value: 1.0
  # Learning rate scheduler. (default: disabled)
  # schedule:
  #   peak_lr: 3e-4
  #   weight_decay: 0.05
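
A short sketch of the rotary frequency schedule implied by positional.parameters.theta follows. This is the standard RoPE formulation rather than this repository's code, and the variable names are illustrative only.

import torch

# Values taken from the config above: model.dim, model.heads,
# model.seq, and positional.parameters.theta.
dim, heads, seq, theta = 768, 12, 1024, 10000
head_dim = dim // heads  # 64 dimensions per attention head

# Standard RoPE: one inverse frequency per pair of head dimensions,
# theta^(-2i/head_dim) for i = 0, 1, ..., head_dim/2 - 1.
inv_freq = theta ** (-torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)
angles = torch.outer(torch.arange(seq, dtype=torch.float32), inv_freq)
cos, sin = angles.cos(), angles.sin()  # (seq, head_dim // 2) rotation tables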
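
Below is a minimal sketch of how a config like this might be consumed, assuming PyYAML and PyTorch. The repository's actual loading code is not shown here, so the structure below is an assumption; the stand-in model in particular is hypothetical.

import yaml
import torch

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

model_cfg, loop_cfg = cfg["model"], cfg["loop"]

# Hypothetical stand-in module sized from the config; the real
# transformer is not reproduced in this sketch.
model = torch.nn.Linear(model_cfg["dim"], model_cfg["vocab_size"])

# Resolve the optimizer class named in the config from torch.optim.
opt_cfg = loop_cfg["optimizer"]
opt_cls = getattr(torch.optim, opt_cfg["name"])  # e.g. torch.optim.AdamW

# Note: PyYAML parses bare scientific notation such as 3e-4 or 1e-8
# as strings (no decimal point before the exponent), so cast explicitly.
opt_params = dict(opt_cfg["parameters"])
opt_params["eps"] = float(opt_params["eps"])
opt_params["weight_decay"] = float(opt_params["weight_decay"])
opt_params.pop("fused", None)  # fused=True needs CUDA tensors on older PyTorch builds

optimizer = opt_cls(model.parameters(), lr=float(loop_cfg["lr"]), **opt_params)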