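# example_config.yaml
#
# Getting-started training configuration for Modalities. It trains a small
# GPT-2-style model on packed, memory-mapped RedPajama-V2 samples, sharded
# across GPUs with FSDP. ${...} placeholders are resolved at load time,
# either from the environment (modalities_env:, cuda_env:) or by
# referencing other keys within this file.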
settings:
  experiment_id: ${modalities_env:experiment_id}
  config_file_path: ${modalities_env:config_file_path}
  referencing_keys:
    sample_key: input_ids
    target_key: target_ids
    prediction_key: logits
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
    global_rank: ${cuda_env:RANK}
    world_size: ${cuda_env:WORLD_SIZE}
  paths:
    checkpoint_saving_path: ./checkpoints
    train_dataset_path: ./data/mem_map/redpajama_v2_samples_512_train.pbin
    val_dataset_path: ./data/mem_map/redpajama_v2_samples_512_test.pbin
  intervals:
    training_log_interval_in_steps: 48
    checkpointing_interval_in_steps: 48
    evaluation_interval_in_steps: 48
  consistency_enforcement:
    enforce_tokens_per_step_consistency: true
    enforce_last_step_logged: false
    enforce_last_step_evaluated: false
    enforce_last_step_checkpointed: false
  step_profile:
    gradient_accumulation_steps: 1
    local_train_micro_batch_size: 8
    sequence_length: 512
  training_target:
    num_target_tokens:
      component_key: number_conversion
      variant_key: num_tokens_from_packed_mem_map_dataset_continuous
      config:
        dataset_path: ${settings.paths.train_dataset_path}
        sequence_length: ${settings.step_profile.sequence_length}
        num_ranks: ${settings.cuda_env.world_size}
        local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
        gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
    num_target_steps: # for the batch progress subscriber
      component_key: number_conversion
      variant_key: num_steps_from_num_tokens
      config:
        num_ranks: ${settings.cuda_env.world_size}
        local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
        global_num_tokens: ${settings.training_target.num_target_tokens}
        sequence_length: ${settings.step_profile.sequence_length}
        gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
  training_progress:
    global_num_seen_tokens: 0
    num_seen_steps: 0
    num_seen_samples: 0
    last_step: -1
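
# Everything below `settings` declares a component: `component_key` names the
# component type, `variant_key` selects a registered implementation, and
# `config` holds its constructor arguments. An `instance_key` together with
# `pass_type: BY_REFERENCE` injects an already-built component instead of
# constructing a new one.
#
# With the step_profile above, one optimizer step consumes
#   local_train_micro_batch_size * gradient_accumulation_steps
#   * sequence_length * world_size
# tokens, i.e. 8 * 1 * 512 * world_size (e.g. 32,768 tokens per step
# on 8 ranks).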
collate_fn:
  component_key: collate_fn
  variant_key: gpt_2_llm_collator
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    target_key: ${settings.referencing_keys.target_key}
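
# The datasets are .pbin packed mem-map files (produced by the Modalities
# data packing step); the `continuous` variant slices the packed token
# stream into contiguous, fixed-length sequences of `sequence_length` tokens.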
train_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: ${settings.paths.train_dataset_path}
    sequence_length: ${settings.step_profile.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}
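
# The train dataloader composes a resumable distributed sampler (on resume it
# skips the `num_seen_samples` recorded under training_progress), a default
# batch sampler with the local micro-batch size, and the shared GPT-2 collator.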
train_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    num_workers: 2
    pin_memory: true
    dataloader_tag: train
    dataset:
      instance_key: train_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        batch_size: ${settings.step_profile.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: resumable_distributed_sampler
          config:
            dataset:
              instance_key: train_dataset
              pass_type: BY_REFERENCE
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: true
            seed: 42
            drop_last: true
            skip_num_global_samples: ${settings.training_progress.num_seen_samples}
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE
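
# Validation mirrors the training pipeline, but uses a plain (non-resumable)
# distributed sampler with shuffling disabled.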
val_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: ${settings.paths.val_dataset_path}
    sequence_length: ${settings.step_profile.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}
val_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    num_workers: 2
    pin_memory: true
    dataloader_tag: val
    dataset:
      instance_key: val_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        batch_size: ${settings.step_profile.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: distributed_sampler
          config:
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: false
            drop_last: true
            dataset:
              instance_key: val_dataset
              pass_type: BY_REFERENCE
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE
eval_dataloaders:
  - instance_key: val_dataloader
    pass_type: BY_REFERENCE
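
# Checkpointing: the strategy decides which checkpoints to keep (k: -1 keeps
# all of them), while the FSDP execution variant writes model and optimizer
# state under checkpoint_saving_path every checkpointing_interval_in_steps
# steps.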
checkpoint_saving:
  component_key: checkpoint_saving
  variant_key: default
  config:
    checkpoint_saving_strategy:
      component_key: checkpoint_saving_strategy
      variant_key: save_k_most_recent_checkpoints_strategy
      config:
        k: -1 # -1 to save all checkpoints
    checkpoint_saving_execution:
      component_key: checkpoint_saving_execution
      variant_key: fsdp
      config:
        checkpoint_path: ${settings.paths.checkpoint_saving_path}
        global_rank: ${settings.cuda_env.global_rank}
        experiment_id: ${settings.experiment_id}
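
# Standard causal-language-modeling cross-entropy between the model's logits
# and the shifted target_ids produced by the collator.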
loss_fn:
  component_key: loss
  variant_key: clm_cross_entropy_loss
  config:
    target_key: ${settings.referencing_keys.target_key}
    prediction_key: ${settings.referencing_keys.prediction_key}
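
# FSDP wrapping: each GPT2Block becomes an FSDP unit, parameters are fully
# sharded across ranks (FULL_SHARD), and compute runs in bfloat16 mixed
# precision (BF_16).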
wrapped_model:
  component_key: model
  variant_key: fsdp_wrapped
  config:
    model:
      instance_key: model
      pass_type: BY_REFERENCE
    sync_module_states: true
    mixed_precision_settings: BF_16
    sharding_strategy: FULL_SHARD
    block_names: [GPT2Block]
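
# Weight initialization: `composed` applies a GPT-2-style normal init
# (mean 0.0, std 0.02); with weight_init_type: scaled, residual projections
# are additionally rescaled by a depth-dependent factor, which is why the
# initializer needs num_layers below.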
model:
  component_key: model
  variant_key: model_initialized
  config:
    model:
      instance_key: model_raw
      pass_type: BY_REFERENCE
    model_initializer:
      component_key: model_initialization
      variant_key: composed
      config:
        model_type: gpt2
        weight_init_type: scaled
        mean: 0.0
        std: 0.02
        num_layers: ${model_raw.config.n_layer}
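
# The raw architecture: a deliberately tiny two-layer GPT-2 variant
# (n_embd 128, 8 query and 8 key/value heads, GELU FFN) so the example runs
# on modest hardware. poe_type: NOPE disables absolute positional
# embeddings; positional information comes from the rotary (RoPE) transform
# applied to the query/key projections inside attention instead.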
model_raw:
  component_key: model
  variant_key: gpt2
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: NOPE
    sequence_length: ${settings.step_profile.sequence_length}
    prediction_key: ${loss_fn.config.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to the nearest multiple of 64 for efficiency
    n_layer: 2
    n_head_q: 8
    n_head_kv: 8
    ffn_hidden: 128
    n_embd: 128
    dropout: 0.0
    bias: true # true: use bias in Linears and LayerNorms, like GPT-2; false: slightly better and faster
    attention_config:
      qkv_transforms:
        - type_hint: RotaryTransform
          config:
            n_embd: ${model_raw.config.n_embd}
            n_head: ${model_raw.config.n_head_q} # must be n_head_q (the number of query heads)
            seq_length_dim: -2
    attention_implementation: manual
    activation_type: gelu
    attention_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model_raw.config.n_embd}
        bias: true
        epsilon: 1e-5
    ffn_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model_raw.config.n_embd}
        bias: true
        epsilon: 1e-5
    lm_head_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model_raw.config.n_embd}
        bias: true
        epsilon: 1e-5
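
# Optimization: PyTorch's StepLR multiplies the learning rate by gamma after
# every step_size scheduler step(s); AdamW starts at lr 1e-4, with weight
# decay disabled for the embedding and layer-norm parameter groups.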
scheduler:
  component_key: scheduler
  variant_key: step_lr
  config:
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE
    step_size: 1
    gamma: 0.1

optimizer:
  component_key: optimizer
  variant_key: adam_w
  config:
    lr: 0.0001
    betas: [0.9, 0.95]
    eps: 1e-8
    weight_decay: 1e-1
    weight_decay_groups_excluded: [embedding, layernorm]
    wrapped_model:
      instance_key: wrapped_model
      pass_type: BY_REFERENCE
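
# Gradient clipping over the FSDP-sharded parameters: the global L2 norm
# (P2_NORM) is clipped to max_norm 1.0.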
gradient_clipper:
  component_key: gradient_clipper
  variant_key: fsdp
  config:
    wrapped_model:
      instance_key: wrapped_model
      pass_type: BY_REFERENCE
    norm_type: P2_NORM
    max_norm: 1.0
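
# Subscribers consume training/evaluation events: `rich` renders console
# progress, `wandb` logs results. With mode: OFFLINE the run is written to
# the local wandb_storage directory; it can be uploaded later with the
# `wandb sync` CLI (the exact run directory depends on your setup).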
progress_subscriber:
  component_key: progress_subscriber
  variant_key: rich
  config:
    global_rank: ${settings.cuda_env.global_rank}
    num_seen_steps: ${settings.training_progress.num_seen_steps}
    num_target_steps: ${settings.training_target.num_target_steps}
    train_dataloader_tag: ${train_dataloader.config.dataloader_tag}
    eval_dataloaders:
      instance_key: eval_dataloaders
      pass_type: BY_REFERENCE

evaluation_subscriber:
  component_key: results_subscriber
  variant_key: wandb
  config:
    global_rank: ${settings.cuda_env.global_rank}
    project: modalities_getting_started
    mode: OFFLINE
    experiment_id: ${settings.experiment_id}
    directory: wandb_storage
    config_file_path: ${settings.config_file_path}